1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <linux/virtio_config.h>
11 #include <linux/auxiliary_bus.h>
12 #include <linux/mlx5/cq.h>
13 #include <linux/mlx5/qp.h>
14 #include <linux/mlx5/device.h>
15 #include <linux/mlx5/driver.h>
16 #include <linux/mlx5/vport.h>
17 #include <linux/mlx5/fs.h>
18 #include <linux/mlx5/mlx5_ifc_vdpa.h>
19 #include <linux/mlx5/mpfs.h>
20 #include "mlx5_vdpa.h"
21 
22 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
23 MODULE_DESCRIPTION("Mellanox VDPA driver");
24 MODULE_LICENSE("Dual BSD/GPL");
25 
26 #define to_mlx5_vdpa_ndev(__mvdev)                                             \
27 	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
28 #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
29 
30 #define VALID_FEATURES_MASK                                                                        \
31 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
32 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
33 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
34 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
35 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
36 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
37 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
38 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
39 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
40 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
41 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
42 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
43 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
44 
45 #define VALID_STATUS_MASK                                                                          \
46 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
47 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
48 
49 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
50 
51 struct mlx5_vdpa_net_resources {
52 	u32 tisn;
53 	u32 tdn;
54 	u32 tirn;
55 	u32 rqtn;
56 	bool valid;
57 };
58 
59 struct mlx5_vdpa_cq_buf {
60 	struct mlx5_frag_buf_ctrl fbc;
61 	struct mlx5_frag_buf frag_buf;
62 	int cqe_size;
63 	int nent;
64 };
65 
66 struct mlx5_vdpa_cq {
67 	struct mlx5_core_cq mcq;
68 	struct mlx5_vdpa_cq_buf buf;
69 	struct mlx5_db db;
70 	int cqe;
71 };
72 
73 struct mlx5_vdpa_umem {
74 	struct mlx5_frag_buf_ctrl fbc;
75 	struct mlx5_frag_buf frag_buf;
76 	int size;
77 	u32 id;
78 };
79 
80 struct mlx5_vdpa_qp {
81 	struct mlx5_core_qp mqp;
82 	struct mlx5_frag_buf frag_buf;
83 	struct mlx5_db db;
84 	u16 head;
85 	bool fw;
86 };
87 
88 struct mlx5_vq_restore_info {
89 	u32 num_ent;
90 	u64 desc_addr;
91 	u64 device_addr;
92 	u64 driver_addr;
93 	u16 avail_index;
94 	u16 used_index;
95 	bool ready;
96 	bool restore;
97 };
98 
99 struct mlx5_vdpa_virtqueue {
100 	bool ready;
101 	u64 desc_addr;
102 	u64 device_addr;
103 	u64 driver_addr;
104 	u32 num_ent;
105 
106 	/* Resources for implementing the notification channel from the device
107 	 * to the driver. fwqp is the firmware end of an RC connection; the
108 	 * other end is vqqp used by the driver. cq is where completions are
109 	 * reported.
110 	 */
111 	struct mlx5_vdpa_cq cq;
112 	struct mlx5_vdpa_qp fwqp;
113 	struct mlx5_vdpa_qp vqqp;
114 
115 	/* umem resources are required for the virtqueue operation. Their use
116 	 * is internal and they must be provided by the driver.
117 	 */
118 	struct mlx5_vdpa_umem umem1;
119 	struct mlx5_vdpa_umem umem2;
120 	struct mlx5_vdpa_umem umem3;
121 
122 	bool initialized;
123 	int index;
124 	u32 virtq_id;
125 	struct mlx5_vdpa_net *ndev;
126 	u16 avail_idx;
127 	u16 used_idx;
128 	int fw_state;
129 
130 	/* keep last in the struct */
131 	struct mlx5_vq_restore_info ri;
132 };
133 
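/* A virtqueue index is valid when it addresses one of the data VQs or, when
 * VIRTIO_NET_F_CTRL_VQ was negotiated, the control VQ. Without
 * VIRTIO_NET_F_MQ only a single data VQ pair (indexes 0 and 1) exists.
 */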
134 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
135 {
136 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
137 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
138 			return idx < 2;
139 		else
140 			return idx < 3;
141 	}
142 
143 	return idx <= mvdev->max_idx;
144 }
145 
146 struct mlx5_vdpa_net {
147 	struct mlx5_vdpa_dev mvdev;
148 	struct mlx5_vdpa_net_resources res;
149 	struct virtio_net_config config;
150 	struct mlx5_vdpa_virtqueue *vqs;
151 	struct vdpa_callback *event_cbs;
152 
153 	/* Serialize vq resources creation and destruction. This is required
154 	 * since the memory map might change and we need to destroy and create
155 	 * resources while the driver is operational.
156 	 */
157 	struct mutex reslock;
158 	struct mlx5_flow_table *rxft;
159 	struct mlx5_fc *rx_counter;
160 	struct mlx5_flow_handle *rx_rule_ucast;
161 	struct mlx5_flow_handle *rx_rule_mcast;
162 	bool setup;
163 	u32 cur_num_vqs;
164 	struct notifier_block nb;
165 	struct vdpa_callback config_cb;
166 };
167 
168 static void free_resources(struct mlx5_vdpa_net *ndev);
169 static void init_mvqs(struct mlx5_vdpa_net *ndev);
170 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
171 static void teardown_driver(struct mlx5_vdpa_net *ndev);
172 
173 static bool mlx5_vdpa_debug;
174 
175 #define MLX5_CVQ_MAX_ENT 16
176 
177 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
178 	do {                                                                                       \
179 		if (features & BIT_ULL(_feature))                                                  \
180 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
181 	} while (0)
182 
183 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
184 	do {                                                                                       \
185 		if (status & (_status))                                                            \
186 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
187 	} while (0)
188 
189 /* TODO: cross-endian support */
190 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
191 {
192 	return virtio_legacy_is_little_endian() ||
193 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
194 }
195 
196 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
197 {
198 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
199 }
200 
201 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
202 {
203 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
204 }
205 
206 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
207 {
208 	return max_vqs / 2;
209 }
210 
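/* The control VQ always occupies the last index: 2 when multiqueue was not
 * negotiated, otherwise one past the last data VQ.
 */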
211 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
212 {
213 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
214 		return 2;
215 
216 	return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs);
217 }
218 
219 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
220 {
221 	return idx == ctrl_vq_idx(mvdev);
222 }
223 
224 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
225 {
226 	if (status & ~VALID_STATUS_MASK)
227 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
228 			       status & ~VALID_STATUS_MASK);
229 
230 	if (!mlx5_vdpa_debug)
231 		return;
232 
233 	mlx5_vdpa_info(mvdev, "driver status %s\n", set ? "set" : "get");
234 	if (set && !status) {
235 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
236 		return;
237 	}
238 
239 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
240 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
241 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
242 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
243 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
244 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
245 }
246 
247 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
248 {
249 	if (features & ~VALID_FEATURES_MASK)
250 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
251 			       features & ~VALID_FEATURES_MASK);
252 
253 	if (!mlx5_vdpa_debug)
254 		return;
255 
256 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
257 	if (!features)
258 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
259 
260 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
278 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
279 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
280 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
281 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
282 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
283 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
284 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
285 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
286 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
287 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
288 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
289 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
290 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
291 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
292 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
293 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
294 }
295 
296 static int create_tis(struct mlx5_vdpa_net *ndev)
297 {
298 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
299 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
300 	void *tisc;
301 	int err;
302 
303 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
304 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
305 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
306 	if (err)
307 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
308 
309 	return err;
310 }
311 
312 static void destroy_tis(struct mlx5_vdpa_net *ndev)
313 {
314 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
315 }
316 
317 #define MLX5_VDPA_CQE_SIZE 64
318 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
319 
320 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
321 {
322 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
323 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
324 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
325 	int err;
326 
327 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
328 				       ndev->mvdev.mdev->priv.numa_node);
329 	if (err)
330 		return err;
331 
332 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
333 
334 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
335 	buf->nent = nent;
336 
337 	return 0;
338 }
339 
340 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
341 {
342 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
343 
344 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
345 					ndev->mvdev.mdev->priv.numa_node);
346 }
347 
348 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
349 {
350 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
351 }
352 
353 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
354 {
355 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
356 }
357 
358 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
359 {
360 	struct mlx5_cqe64 *cqe64;
361 	void *cqe;
362 	int i;
363 
364 	for (i = 0; i < buf->nent; i++) {
365 		cqe = get_cqe(vcq, i);
366 		cqe64 = cqe;
367 		cqe64->op_own = MLX5_CQE_INVALID << 4;
368 	}
369 }
370 
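/* Return the CQE at position n if it is owned by software, i.e. its opcode is
 * valid and its ownership bit matches the current lap over the CQ ring.
 */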
371 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
372 {
373 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
374 
375 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
376 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
377 		return cqe64;
378 
379 	return NULL;
380 }
381 
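/* Post n receive buffers by advancing the head and publishing it through the
 * RQ doorbell record.
 */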
382 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
383 {
384 	vqp->head += n;
385 	vqp->db.db[0] = cpu_to_be32(vqp->head);
386 }
387 
388 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
389 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
390 {
391 	struct mlx5_vdpa_qp *vqp;
392 	__be64 *pas;
393 	void *qpc;
394 
395 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
396 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
397 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
398 	if (vqp->fw) {
399 		/* The firmware QP is allocated by the driver for the firmware's
400 		 * use, so we can skip some of the params as they will be chosen
401 		 * by the firmware.
402 		 */
403 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
404 		MLX5_SET(qpc, qpc, no_sq, 1);
405 		return;
406 	}
407 
408 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
409 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
410 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
411 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
412 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
413 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
414 	MLX5_SET(qpc, qpc, no_sq, 1);
415 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
416 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
417 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
418 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
419 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
420 }
421 
422 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
423 {
424 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
425 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
426 					ndev->mvdev.mdev->priv.numa_node);
427 }
428 
429 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
430 {
431 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
432 }
433 
434 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
435 		     struct mlx5_vdpa_qp *vqp)
436 {
437 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
438 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
439 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
440 	void *qpc;
441 	void *in;
442 	int err;
443 
444 	if (!vqp->fw) {
445 		vqp = &mvq->vqqp;
446 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
447 		if (err)
448 			return err;
449 
450 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
451 		if (err)
452 			goto err_db;
453 		inlen += vqp->frag_buf.npages * sizeof(__be64);
454 	}
455 
456 	in = kzalloc(inlen, GFP_KERNEL);
457 	if (!in) {
458 		err = -ENOMEM;
459 		goto err_kzalloc;
460 	}
461 
462 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
463 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
464 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
465 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
466 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
467 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
468 	if (!vqp->fw)
469 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
470 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
471 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
472 	kfree(in);
473 	if (err)
474 		goto err_kzalloc;
475 
476 	vqp->mqp.uid = ndev->mvdev.res.uid;
477 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
478 
479 	if (!vqp->fw)
480 		rx_post(vqp, mvq->num_ent);
481 
482 	return 0;
483 
484 err_kzalloc:
485 	if (!vqp->fw)
486 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
487 err_db:
488 	if (!vqp->fw)
489 		rq_buf_free(ndev, vqp);
490 
491 	return err;
492 }
493 
494 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
495 {
496 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
497 
498 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
499 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
500 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
501 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
502 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
503 	if (!vqp->fw) {
504 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
505 		rq_buf_free(ndev, vqp);
506 	}
507 }
508 
509 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
510 {
511 	return get_sw_cqe(cq, cq->mcq.cons_index);
512 }
513 
514 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
515 {
516 	struct mlx5_cqe64 *cqe64;
517 
518 	cqe64 = next_cqe_sw(vcq);
519 	if (!cqe64)
520 		return -EAGAIN;
521 
522 	vcq->mcq.cons_index++;
523 	return 0;
524 }
525 
526 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
527 {
528 	struct mlx5_vdpa_net *ndev = mvq->ndev;
529 	struct vdpa_callback *event_cb;
530 
531 	event_cb = &ndev->event_cbs[mvq->index];
532 	mlx5_cq_set_ci(&mvq->cq.mcq);
533 
534 	/* make sure the CQ consumer update is visible to the hardware before
535 	 * updating the RX doorbell record.
536 	 */
537 	dma_wmb();
538 	rx_post(&mvq->vqqp, num);
539 	if (event_cb->callback)
540 		event_cb->callback(event_cb->private);
541 }
542 
543 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
544 {
545 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
546 	struct mlx5_vdpa_net *ndev = mvq->ndev;
547 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
548 	int num = 0;
549 
550 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
551 		num++;
552 		if (num > mvq->num_ent / 2) {
553 			/* If completions keep coming while we poll, we want to
554 			 * let the hardware know that we consumed them by
555 			 * updating the doorbell record.  We also let the vdpa core
556 			 * know about this so it passes it on to the virtio driver
557 			 * in the guest.
558 			 */
559 			mlx5_vdpa_handle_completions(mvq, num);
560 			num = 0;
561 		}
562 	}
563 
564 	if (num)
565 		mlx5_vdpa_handle_completions(mvq, num);
566 
567 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
568 }
569 
570 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
571 {
572 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
573 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
574 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
575 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
576 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
577 	__be64 *pas;
578 	int inlen;
579 	void *cqc;
580 	void *in;
581 	int err;
582 	int eqn;
583 
584 	err = mlx5_db_alloc(mdev, &vcq->db);
585 	if (err)
586 		return err;
587 
588 	vcq->mcq.set_ci_db = vcq->db.db;
589 	vcq->mcq.arm_db = vcq->db.db + 1;
590 	vcq->mcq.cqe_sz = 64;
591 
592 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
593 	if (err)
594 		goto err_db;
595 
596 	cq_frag_buf_init(vcq, &vcq->buf);
597 
598 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
599 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
600 	in = kzalloc(inlen, GFP_KERNEL);
601 	if (!in) {
602 		err = -ENOMEM;
603 		goto err_vzalloc;
604 	}
605 
606 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
607 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
608 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
609 
610 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
611 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
612 
613 	/* Use vector 0 by default. Consider adding code to choose the least
614 	 * used vector.
615 	 */
616 	err = mlx5_vector2eqn(mdev, 0, &eqn);
617 	if (err)
618 		goto err_vec;
619 
620 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
621 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
622 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
623 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
624 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
625 
626 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
627 	if (err)
628 		goto err_vec;
629 
630 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
631 	vcq->cqe = num_ent;
632 	vcq->mcq.set_ci_db = vcq->db.db;
633 	vcq->mcq.arm_db = vcq->db.db + 1;
634 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
635 	kfree(in);
636 	return 0;
637 
638 err_vec:
639 	kfree(in);
640 err_vzalloc:
641 	cq_frag_buf_free(ndev, &vcq->buf);
642 err_db:
643 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
644 	return err;
645 }
646 
647 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
648 {
649 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
650 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
651 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
652 
653 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
654 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
655 		return;
656 	}
657 	cq_frag_buf_free(ndev, &vcq->buf);
658 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
659 }
660 
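/* The required umem size is a linear function of the queue size:
 * size = param_a * num_ent + param_b, with the parameters taken from the
 * device's VDPA emulation capabilities.
 */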
661 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
662 			  struct mlx5_vdpa_umem **umemp)
663 {
664 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
665 	int p_a;
666 	int p_b;
667 
668 	switch (num) {
669 	case 1:
670 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
671 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
672 		*umemp = &mvq->umem1;
673 		break;
674 	case 2:
675 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
676 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
677 		*umemp = &mvq->umem2;
678 		break;
679 	case 3:
680 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
681 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
682 		*umemp = &mvq->umem3;
683 		break;
684 	}
685 	(*umemp)->size = p_a * mvq->num_ent + p_b;
686 }
687 
688 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
689 {
690 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
691 }
692 
693 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
694 {
695 	int inlen;
696 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
697 	void *um;
698 	void *in;
699 	int err;
700 	__be64 *pas;
701 	struct mlx5_vdpa_umem *umem;
702 
703 	set_umem_size(ndev, mvq, num, &umem);
704 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
705 	if (err)
706 		return err;
707 
708 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
709 
710 	in = kzalloc(inlen, GFP_KERNEL);
711 	if (!in) {
712 		err = -ENOMEM;
713 		goto err_in;
714 	}
715 
716 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
717 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
718 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
719 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
720 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
721 
722 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
723 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
724 
725 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
726 	if (err) {
727 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
728 		goto err_cmd;
729 	}
730 
731 	kfree(in);
732 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
733 
734 	return 0;
735 
736 err_cmd:
737 	kfree(in);
738 err_in:
739 	umem_frag_buf_free(ndev, umem);
740 	return err;
741 }
742 
743 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
744 {
745 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
746 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
747 	struct mlx5_vdpa_umem *umem;
748 
749 	switch (num) {
750 	case 1:
751 		umem = &mvq->umem1;
752 		break;
753 	case 2:
754 		umem = &mvq->umem2;
755 		break;
756 	case 3:
757 		umem = &mvq->umem3;
758 		break;
759 	}
760 
761 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
762 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
763 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
764 		return;
765 
766 	umem_frag_buf_free(ndev, umem);
767 }
768 
769 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
770 {
771 	int num;
772 	int err;
773 
774 	for (num = 1; num <= 3; num++) {
775 		err = create_umem(ndev, mvq, num);
776 		if (err)
777 			goto err_umem;
778 	}
779 	return 0;
780 
781 err_umem:
782 	for (num--; num > 0; num--)
783 		umem_destroy(ndev, mvq, num);
784 
785 	return err;
786 }
787 
788 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
789 {
790 	int num;
791 
792 	for (num = 3; num > 0; num--)
793 		umem_destroy(ndev, mvq, num);
794 }
795 
796 static int get_queue_type(struct mlx5_vdpa_net *ndev)
797 {
798 	u32 type_mask;
799 
800 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
801 
802 	/* prefer split queue */
803 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
804 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
805 
806 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
807 
808 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
809 }
810 
811 static bool vq_is_tx(u16 idx)
812 {
813 	return idx % 2;
814 }
815 
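/* Pack the negotiated virtio feature bits relevant to the queue into the
 * firmware-defined queue_feature_bit_mask_12_3 encoding.
 */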
816 static u16 get_features_12_3(u64 features)
817 {
818 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
819 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
820 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
821 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
822 }
823 
824 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
825 {
826 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
827 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
828 	void *obj_context;
829 	void *cmd_hdr;
830 	void *vq_ctx;
831 	void *in;
832 	int err;
833 
834 	err = umems_create(ndev, mvq);
835 	if (err)
836 		return err;
837 
838 	in = kzalloc(inlen, GFP_KERNEL);
839 	if (!in) {
840 		err = -ENOMEM;
841 		goto err_alloc;
842 	}
843 
844 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
845 
846 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
847 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
848 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
849 
850 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
851 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
852 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
853 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
854 		 get_features_12_3(ndev->mvdev.actual_features));
855 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
856 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
857 
858 	if (vq_is_tx(mvq->index))
859 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
860 
861 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
862 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
863 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
864 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
865 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
866 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
867 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
868 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
869 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
870 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
871 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
872 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
873 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
874 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
875 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
876 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
877 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
878 
879 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
880 	if (err)
881 		goto err_cmd;
882 
883 	kfree(in);
884 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
885 
886 	return 0;
887 
888 err_cmd:
889 	kfree(in);
890 err_alloc:
891 	umems_destroy(ndev, mvq);
892 	return err;
893 }
894 
895 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
896 {
897 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
898 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
899 
900 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
901 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
902 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
903 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
904 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
905 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
906 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
907 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
908 		return;
909 	}
910 	umems_destroy(ndev, mvq);
911 }
912 
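/* Each virtqueue is backed by two RC QPs connected to each other: get_qpn()
 * returns the QP to operate on and get_rqpn() its remote peer, depending on
 * whether the firmware QP or the driver QP is addressed.
 */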
913 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
914 {
915 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
916 }
917 
918 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
919 {
920 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
921 }
922 
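/* Allocate and fill the input/output mailboxes for the given QP state
 * transition command. On failure, both *in and *out are set to NULL.
 */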
923 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
924 			int *outlen, u32 qpn, u32 rqpn)
925 {
926 	void *qpc;
927 	void *pp;
928 
929 	switch (cmd) {
930 	case MLX5_CMD_OP_2RST_QP:
931 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
932 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
933 		*in = kzalloc(*inlen, GFP_KERNEL);
934 		*out = kzalloc(*outlen, GFP_KERNEL);
935 		if (!*in || !*out)
936 			goto outerr;
937 
938 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
939 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
940 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
941 		break;
942 	case MLX5_CMD_OP_RST2INIT_QP:
943 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
944 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
945 		*in = kzalloc(*inlen, GFP_KERNEL);
946 		*out = kzalloc(*outlen, GFP_KERNEL);
947 		if (!*in || !*out)
948 			goto outerr;
949 
950 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
951 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
952 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
953 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
954 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
955 		MLX5_SET(qpc, qpc, rwe, 1);
956 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
957 		MLX5_SET(ads, pp, vhca_port_num, 1);
958 		break;
959 	case MLX5_CMD_OP_INIT2RTR_QP:
960 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
961 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
962 		*in = kzalloc(*inlen, GFP_KERNEL);
963 		*out = kzalloc(*outlen, GFP_KERNEL);
964 		if (!*in || !*out)
965 			goto outerr;
966 
967 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
968 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
969 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
970 		qpc = MLX5_ADDR_OF(init2rtr_qp_in, *in, qpc);
971 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
972 		MLX5_SET(qpc, qpc, log_msg_max, 30);
973 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
974 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
975 		MLX5_SET(ads, pp, fl, 1);
976 		break;
977 	case MLX5_CMD_OP_RTR2RTS_QP:
978 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
979 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
980 		*in = kzalloc(*inlen, GFP_KERNEL);
981 		*out = kzalloc(*outlen, GFP_KERNEL);
982 		if (!*in || !*out)
983 			goto outerr;
984 
985 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
986 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
987 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
988 		qpc = MLX5_ADDR_OF(rtr2rts_qp_in, *in, qpc);
989 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
990 		MLX5_SET(ads, pp, ack_timeout, 14);
991 		MLX5_SET(qpc, qpc, retry_count, 7);
992 		MLX5_SET(qpc, qpc, rnr_retry, 7);
993 		break;
994 	default:
995 		goto outerr_nullify;
996 	}
997 
998 	return;
999 
1000 outerr:
1001 	kfree(*in);
1002 	kfree(*out);
1003 outerr_nullify:
1004 	*in = NULL;
1005 	*out = NULL;
1006 }
1007 
1008 static void free_inout(void *in, void *out)
1009 {
1010 	kfree(in);
1011 	kfree(out);
1012 }
1013 
1014 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1015  * firmware. The fw argument indicates whether the QP being modified is the
1016  * one used by the firmware.
1017  */
1018 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1019 {
1020 	int outlen;
1021 	int inlen;
1022 	void *out;
1023 	void *in;
1024 	int err;
1025 
1026 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1027 	if (!in || !out)
1028 		return -ENOMEM;
1029 
1030 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1031 	free_inout(in, out);
1032 	return err;
1033 }
1034 
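/* Reset both QPs and then walk them through the RC state machine
 * (RST -> INIT -> RTR), finally moving the firmware QP to RTS so the
 * notification channel is fully connected.
 */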
1035 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1036 {
1037 	int err;
1038 
1039 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1040 	if (err)
1041 		return err;
1042 
1043 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1044 	if (err)
1045 		return err;
1046 
1047 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1048 	if (err)
1049 		return err;
1050 
1051 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1052 	if (err)
1053 		return err;
1054 
1055 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1056 	if (err)
1057 		return err;
1058 
1059 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1060 	if (err)
1061 		return err;
1062 
1063 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1064 }
1065 
1066 struct mlx5_virtq_attr {
1067 	u8 state;
1068 	u16 available_index;
1069 	u16 used_index;
1070 };
1071 
1072 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1073 			   struct mlx5_virtq_attr *attr)
1074 {
1075 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1076 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1077 	void *out;
1078 	void *obj_context;
1079 	void *cmd_hdr;
1080 	int err;
1081 
1082 	out = kzalloc(outlen, GFP_KERNEL);
1083 	if (!out)
1084 		return -ENOMEM;
1085 
1086 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1087 
1088 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1089 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1090 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1091 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1092 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1093 	if (err)
1094 		goto err_cmd;
1095 
1096 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1097 	memset(attr, 0, sizeof(*attr));
1098 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1099 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1100 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1101 	kfree(out);
1102 	return 0;
1103 
1104 err_cmd:
1105 	kfree(out);
1106 	return err;
1107 }
1108 
1109 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1110 {
1111 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1112 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1113 	void *obj_context;
1114 	void *cmd_hdr;
1115 	void *in;
1116 	int err;
1117 
1118 	in = kzalloc(inlen, GFP_KERNEL);
1119 	if (!in)
1120 		return -ENOMEM;
1121 
1122 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1123 
1124 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1125 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1126 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1127 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1128 
1129 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1130 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1131 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1132 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1133 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1134 	kfree(in);
1135 	if (!err)
1136 		mvq->fw_state = state;
1137 
1138 	return err;
1139 }
1140 
1141 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1142 {
1143 	u16 idx = mvq->index;
1144 	int err;
1145 
1146 	if (!mvq->num_ent)
1147 		return 0;
1148 
1149 	if (mvq->initialized)
1150 		return 0;
1151 
1152 	err = cq_create(ndev, idx, mvq->num_ent);
1153 	if (err)
1154 		return err;
1155 
1156 	err = qp_create(ndev, mvq, &mvq->fwqp);
1157 	if (err)
1158 		goto err_fwqp;
1159 
1160 	err = qp_create(ndev, mvq, &mvq->vqqp);
1161 	if (err)
1162 		goto err_vqqp;
1163 
1164 	err = connect_qps(ndev, mvq);
1165 	if (err)
1166 		goto err_connect;
1167 
1168 	err = create_virtqueue(ndev, mvq);
1169 	if (err)
1170 		goto err_connect;
1171 
1172 	if (mvq->ready) {
1173 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1174 		if (err) {
1175 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1176 				       idx, err);
1177 			goto err_connect;
1178 		}
1179 	}
1180 
1181 	mvq->initialized = true;
1182 	return 0;
1183 
1184 err_connect:
1185 	qp_destroy(ndev, &mvq->vqqp);
1186 err_vqqp:
1187 	qp_destroy(ndev, &mvq->fwqp);
1188 err_fwqp:
1189 	cq_destroy(ndev, idx);
1190 	return err;
1191 }
1192 
1193 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1194 {
1195 	struct mlx5_virtq_attr attr;
1196 
1197 	if (!mvq->initialized)
1198 		return;
1199 
1200 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1201 		return;
1202 
1203 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1204 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1205 
1206 	if (query_virtqueue(ndev, mvq, &attr)) {
1207 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1208 		return;
1209 	}
1210 	mvq->avail_idx = attr.available_index;
1211 	mvq->used_idx = attr.used_index;
1212 }
1213 
1214 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1215 {
1216 	int i;
1217 
1218 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1219 		suspend_vq(ndev, &ndev->vqs[i]);
1220 }
1221 
1222 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1223 {
1224 	if (!mvq->initialized)
1225 		return;
1226 
1227 	suspend_vq(ndev, mvq);
1228 	destroy_virtqueue(ndev, mvq);
1229 	qp_destroy(ndev, &mvq->vqqp);
1230 	qp_destroy(ndev, &mvq->fwqp);
1231 	cq_destroy(ndev, mvq->index);
1232 	mvq->initialized = false;
1233 }
1234 
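/* Create the RQ table feeding the TIR. The table size is rounded up to a
 * power of two and its entries cycle over the receive virtqueues.
 */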
1235 static int create_rqt(struct mlx5_vdpa_net *ndev)
1236 {
1237 	__be32 *list;
1238 	int max_rqt;
1239 	void *rqtc;
1240 	int inlen;
1241 	void *in;
1242 	int i, j;
1243 	int err;
1244 	int num;
1245 
1246 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
1247 		num = 1;
1248 	else
1249 		num = ndev->cur_num_vqs / 2;
1250 
1251 	max_rqt = min_t(int, roundup_pow_of_two(num),
1252 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1253 	if (max_rqt < 1)
1254 		return -EOPNOTSUPP;
1255 
1256 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1257 	in = kzalloc(inlen, GFP_KERNEL);
1258 	if (!in)
1259 		return -ENOMEM;
1260 
1261 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1262 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1263 
1264 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1265 	MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt);
1266 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1267 	for (i = 0, j = 0; i < max_rqt; i++, j += 2)
1268 		list[i] = cpu_to_be32(ndev->vqs[j % (2 * num)].virtq_id);
1269 
1270 	MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt);
1271 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1272 	kfree(in);
1273 	if (err)
1274 		return err;
1275 
1276 	return 0;
1277 }
1278 
1279 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1280 
1281 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1282 {
1283 	__be32 *list;
1284 	int max_rqt;
1285 	void *rqtc;
1286 	int inlen;
1287 	void *in;
1288 	int i, j;
1289 	int err;
1290 
1291 	max_rqt = min_t(int, roundup_pow_of_two(ndev->cur_num_vqs / 2),
1292 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1293 	if (max_rqt < 1)
1294 		return -EOPNOTSUPP;
1295 
1296 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1297 	in = kzalloc(inlen, GFP_KERNEL);
1298 	if (!in)
1299 		return -ENOMEM;
1300 
1301 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1302 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1303 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1304 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1305 
1306 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1307 	for (i = 0, j = 0; i < max_rqt; i++, j += 2)
1308 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1309 
1310 	MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt);
1311 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1312 	kfree(in);
1313 	if (err)
1314 		return err;
1315 
1316 	return 0;
1317 }
1318 
1319 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1320 {
1321 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1322 }
1323 
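/* Create a TIR that spreads received flows over the RQ table using symmetric
 * Toeplitz RSS hashing on the outer IP addresses and TCP ports.
 */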
1324 static int create_tir(struct mlx5_vdpa_net *ndev)
1325 {
1326 #define HASH_IP_L4PORTS                                                                            \
1327 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1328 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1329 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1330 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1331 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1332 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1333 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1334 	void *rss_key;
1335 	void *outer;
1336 	void *tirc;
1337 	void *in;
1338 	int err;
1339 
1340 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1341 	if (!in)
1342 		return -ENOMEM;
1343 
1344 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1345 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1346 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1347 
1348 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1349 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1350 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1351 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1352 
1353 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1354 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1355 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1356 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1357 
1358 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1359 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1360 
1361 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1362 	kfree(in);
1363 	return err;
1364 }
1365 
1366 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1367 {
1368 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1369 }
1370 
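/* Install two steering rules forwarding traffic to the TIR: one matching the
 * device's unicast MAC (also counting packets) and one matching multicast
 * destination addresses.
 */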
1371 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1372 {
1373 	struct mlx5_flow_destination dest[2] = {};
1374 	struct mlx5_flow_table_attr ft_attr = {};
1375 	struct mlx5_flow_act flow_act = {};
1376 	struct mlx5_flow_namespace *ns;
1377 	struct mlx5_flow_spec *spec;
1378 	void *headers_c;
1379 	void *headers_v;
1380 	u8 *dmac_c;
1381 	u8 *dmac_v;
1382 	int err;
1383 
1384 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1385 	if (!spec)
1386 		return -ENOMEM;
1387 
1388 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1389 	ft_attr.max_fte = 2;
1390 	ft_attr.autogroup.max_num_groups = 2;
1391 
1392 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1393 	if (!ns) {
1394 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1395 		err = -EOPNOTSUPP;
1396 		goto err_ns;
1397 	}
1398 
1399 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1400 	if (IS_ERR(ndev->rxft)) {
1401 		err = PTR_ERR(ndev->rxft);
1402 		goto err_ns;
1403 	}
1404 
1405 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1406 	if (IS_ERR(ndev->rx_counter)) {
1407 		err = PTR_ERR(ndev->rx_counter);
1408 		goto err_fc;
1409 	}
1410 
1411 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1412 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1413 	memset(dmac_c, 0xff, ETH_ALEN);
1414 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1415 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1416 	ether_addr_copy(dmac_v, ndev->config.mac);
1417 
1418 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1419 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1420 	dest[0].tir_num = ndev->res.tirn;
1421 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1422 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1423 	ndev->rx_rule_ucast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 2);
1424 
1425 	if (IS_ERR(ndev->rx_rule_ucast)) {
1426 		err = PTR_ERR(ndev->rx_rule_ucast);
1427 		ndev->rx_rule_ucast = NULL;
1428 		goto err_rule_ucast;
1429 	}
1430 
1431 	memset(dmac_c, 0, ETH_ALEN);
1432 	memset(dmac_v, 0, ETH_ALEN);
1433 	dmac_c[0] = 1;
1434 	dmac_v[0] = 1;
1435 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1436 	ndev->rx_rule_mcast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 1);
1437 	if (IS_ERR(ndev->rx_rule_mcast)) {
1438 		err = PTR_ERR(ndev->rx_rule_mcast);
1439 		ndev->rx_rule_mcast = NULL;
1440 		goto err_rule_mcast;
1441 	}
1442 
1443 	kvfree(spec);
1444 	return 0;
1445 
1446 err_rule_mcast:
1447 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1448 	ndev->rx_rule_ucast = NULL;
1449 err_rule_ucast:
1450 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1451 err_fc:
1452 	mlx5_destroy_flow_table(ndev->rxft);
1453 err_ns:
1454 	kvfree(spec);
1455 	return err;
1456 }
1457 
1458 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1459 {
1460 	if (!ndev->rx_rule_ucast)
1461 		return;
1462 
1463 	mlx5_del_flow_rules(ndev->rx_rule_mcast);
1464 	ndev->rx_rule_mcast = NULL;
1465 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1466 	ndev->rx_rule_ucast = NULL;
1467 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1468 	mlx5_destroy_flow_table(ndev->rxft);
1469 }
1470 
1471 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1472 {
1473 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1474 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1475 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1476 	struct mlx5_core_dev *pfmdev;
1477 	size_t read;
1478 	u8 mac[ETH_ALEN];
1479 
1480 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1481 	switch (cmd) {
1482 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1483 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1484 		if (read != ETH_ALEN)
1485 			break;
1486 
1487 		if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
1488 			status = VIRTIO_NET_OK;
1489 			break;
1490 		}
1491 
1492 		if (!is_zero_ether_addr(ndev->config.mac)) {
1493 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1494 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1495 					       ndev->config.mac);
1496 				break;
1497 			}
1498 		}
1499 
1500 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1501 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1502 				       mac);
1503 			break;
1504 		}
1505 
1506 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1507 		status = VIRTIO_NET_OK;
1508 		break;
1509 
1510 	default:
1511 		break;
1512 	}
1513 
1514 	return status;
1515 }
1516 
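/* Grow or shrink the number of data virtqueues in use. When shrinking, the
 * RQ table is trimmed before the excess VQs are torn down; when growing, the
 * new VQs are set up before the RQ table is expanded.
 */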
1517 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1518 {
1519 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1520 	int cur_qps = ndev->cur_num_vqs / 2;
1521 	int err;
1522 	int i;
1523 
1524 	if (cur_qps > newqps) {
1525 		err = modify_rqt(ndev, 2 * newqps);
1526 		if (err)
1527 			return err;
1528 
1529 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1530 			teardown_vq(ndev, &ndev->vqs[i]);
1531 
1532 		ndev->cur_num_vqs = 2 * newqps;
1533 	} else {
1534 		ndev->cur_num_vqs = 2 * newqps;
1535 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1536 			err = setup_vq(ndev, &ndev->vqs[i]);
1537 			if (err)
1538 				goto clean_added;
1539 		}
1540 		err = modify_rqt(ndev, 2 * newqps);
1541 		if (err)
1542 			goto clean_added;
1543 	}
1544 	return 0;
1545 
1546 clean_added:
1547 	for (--i; i >= 2 * cur_qps; --i)
1548 		teardown_vq(ndev, &ndev->vqs[i]);
1549 
1550 	ndev->cur_num_vqs = 2 * cur_qps;
1551 
1552 	return err;
1553 }
1554 
1555 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1556 {
1557 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1558 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1559 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1560 	struct virtio_net_ctrl_mq mq;
1561 	size_t read;
1562 	u16 newqps;
1563 
1564 	switch (cmd) {
1565 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1566 		/* This mq feature check aligns with pre-existing userspace
1567 		 * implementation.
1568 		 *
1569 		 * Without it, an untrusted driver could fake a multiqueue config
1570 		 * request on a non-mq device, which may cause the kernel to
1571 		 * panic due to uninitialized resources for the extra vqs. Even
1572 		 * with a well-behaved guest driver, it is not expected to allow
1573 		 * changing the number of vqs on a non-mq device.
1574 		 */
1575 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
1576 			break;
1577 
1578 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1579 		if (read != sizeof(mq))
1580 			break;
1581 
1582 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1583 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1584 		    newqps > mlx5_vdpa_max_qps(mvdev->max_vqs))
1585 			break;
1586 
1587 		if (ndev->cur_num_vqs == 2 * newqps) {
1588 			status = VIRTIO_NET_OK;
1589 			break;
1590 		}
1591 
1592 		if (!change_num_qps(mvdev, newqps))
1593 			status = VIRTIO_NET_OK;
1594 
1595 		break;
1596 	default:
1597 		break;
1598 	}
1599 
1600 	return status;
1601 }
1602 
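/* Control VQ commands are served from workqueue context: each descriptor
 * chain is popped, dispatched by control class, and completed with a status
 * byte pushed back to the guest.
 */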
1603 static void mlx5_cvq_kick_handler(struct work_struct *work)
1604 {
1605 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1606 	struct virtio_net_ctrl_hdr ctrl;
1607 	struct mlx5_vdpa_wq_ent *wqent;
1608 	struct mlx5_vdpa_dev *mvdev;
1609 	struct mlx5_control_vq *cvq;
1610 	struct mlx5_vdpa_net *ndev;
1611 	size_t read, write;
1612 	int err;
1613 
1614 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
1615 	mvdev = wqent->mvdev;
1616 	ndev = to_mlx5_vdpa_ndev(mvdev);
1617 	cvq = &mvdev->cvq;
1618 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1619 		goto out;
1620 
1621 	if (!cvq->ready)
1622 		goto out;
1623 
1624 	while (true) {
1625 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
1626 					   GFP_ATOMIC);
1627 		if (err <= 0)
1628 			break;
1629 
1630 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
1631 		if (read != sizeof(ctrl))
1632 			break;
1633 
1634 		switch (ctrl.class) {
1635 		case VIRTIO_NET_CTRL_MAC:
1636 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
1637 			break;
1638 		case VIRTIO_NET_CTRL_MQ:
1639 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
1640 			break;
1641 
1642 		default:
1643 			break;
1644 		}
1645 
1646 		/* Make sure data is written before advancing index */
1647 		smp_wmb();
1648 
1649 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
1650 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
1651 		vringh_kiov_cleanup(&cvq->riov);
1652 		vringh_kiov_cleanup(&cvq->wiov);
1653 
1654 		if (vringh_need_notify_iotlb(&cvq->vring))
1655 			vringh_notify(&cvq->vring);
1656 	}
1657 out:
1658 	kfree(wqent);
1659 }
1660 
1661 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1662 {
1663 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1664 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1665 	struct mlx5_vdpa_virtqueue *mvq;
1666 	struct mlx5_vdpa_wq_ent *wqent;
1667 
1668 	if (!is_index_valid(mvdev, idx))
1669 		return;
1670 
1671 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
1672 		if (!mvdev->cvq.ready)
1673 			return;
1674 
1675 		wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
1676 		if (!wqent)
1677 			return;
1678 
1679 		wqent->mvdev = mvdev;
1680 		INIT_WORK(&wqent->work, mlx5_cvq_kick_handler);
1681 		queue_work(mvdev->wq, &wqent->work);
1682 		return;
1683 	}
1684 
1685 	mvq = &ndev->vqs[idx];
1686 	if (unlikely(!mvq->ready))
1687 		return;
1688 
1689 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1690 }
1691 
1692 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1693 				    u64 driver_area, u64 device_area)
1694 {
1695 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1696 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1697 	struct mlx5_vdpa_virtqueue *mvq;
1698 
1699 	if (!is_index_valid(mvdev, idx))
1700 		return -EINVAL;
1701 
1702 	if (is_ctrl_vq_idx(mvdev, idx)) {
1703 		mvdev->cvq.desc_addr = desc_area;
1704 		mvdev->cvq.device_addr = device_area;
1705 		mvdev->cvq.driver_addr = driver_area;
1706 		return 0;
1707 	}
1708 
1709 	mvq = &ndev->vqs[idx];
1710 	mvq->desc_addr = desc_area;
1711 	mvq->device_addr = device_area;
1712 	mvq->driver_addr = driver_area;
1713 	return 0;
1714 }
1715 
1716 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1717 {
1718 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1719 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1720 	struct mlx5_vdpa_virtqueue *mvq;
1721 
1722 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
1723 		return;
1724 
1725 	mvq = &ndev->vqs[idx];
1726 	mvq->num_ent = num;
1727 }
1728 
1729 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1730 {
1731 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1732 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1733 
1734 	ndev->event_cbs[idx] = *cb;
1735 }
1736 
1737 static void mlx5_cvq_notify(struct vringh *vring)
1738 {
1739 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
1740 
1741 	if (!cvq->event_cb.callback)
1742 		return;
1743 
1744 	cvq->event_cb.callback(cvq->event_cb.private);
1745 }
1746 
1747 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
1748 {
1749 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1750 
1751 	cvq->ready = ready;
1752 	if (!ready)
1753 		return;
1754 
1755 	cvq->vring.notify = mlx5_cvq_notify;
1756 }
1757 
1758 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1759 {
1760 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1761 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1762 	struct mlx5_vdpa_virtqueue *mvq;
1763 
1764 	if (!mvdev->actual_features)
1765 		return;
1766 
1767 	if (!is_index_valid(mvdev, idx))
1768 		return;
1769 
1770 	if (is_ctrl_vq_idx(mvdev, idx)) {
1771 		set_cvq_ready(mvdev, ready);
1772 		return;
1773 	}
1774 
1775 	mvq = &ndev->vqs[idx];
1776 	if (!ready)
1777 		suspend_vq(ndev, mvq);
1778 
1779 	mvq->ready = ready;
1780 }
1781 
1782 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1783 {
1784 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1785 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1786 
1787 	if (!is_index_valid(mvdev, idx))
1788 		return false;
1789 
1790 	if (is_ctrl_vq_idx(mvdev, idx))
1791 		return mvdev->cvq.ready;
1792 
1793 	return ndev->vqs[idx].ready;
1794 }
1795 
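/* The available index of a hardware virtqueue can only be programmed
 * while its firmware object is not in the RDY state; for the software
 * control VQ the index is simply recorded in its vringh.
 */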
1796 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1797 				  const struct vdpa_vq_state *state)
1798 {
1799 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1800 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1801 	struct mlx5_vdpa_virtqueue *mvq;
1802 
1803 	if (!is_index_valid(mvdev, idx))
1804 		return -EINVAL;
1805 
1806 	if (is_ctrl_vq_idx(mvdev, idx)) {
1807 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
1808 		return 0;
1809 	}
1810 
1811 	mvq = &ndev->vqs[idx];
1812 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1813 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1814 		return -EINVAL;
1815 	}
1816 
1817 	mvq->used_idx = state->split.avail_index;
1818 	mvq->avail_idx = state->split.avail_index;
1819 	return 0;
1820 }
1821 
1822 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1823 {
1824 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1825 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1826 	struct mlx5_vdpa_virtqueue *mvq;
1827 	struct mlx5_virtq_attr attr;
1828 	int err;
1829 
1830 	if (!is_index_valid(mvdev, idx))
1831 		return -EINVAL;
1832 
1833 	if (is_ctrl_vq_idx(mvdev, idx)) {
1834 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
1835 		return 0;
1836 	}
1837 
1838 	mvq = &ndev->vqs[idx];
1839 	/* If the virtq object was destroyed, use the value saved by
1840 	 * suspend_vq just before teardown. This caters to userspace
1841 	 * that needs to emulate the index after the vq is stopped.
1842 	 */
1843 	if (!mvq->initialized) {
1844 		/* Firmware returns a wrong value for the available index.
1845 		 * Since both values should be identical, we take the value of
1846 		 * used_idx which is reported correctly.
1847 		 */
1848 		state->split.avail_index = mvq->used_idx;
1849 		return 0;
1850 	}
1851 
1852 	err = query_virtqueue(ndev, mvq, &attr);
1853 	if (err) {
1854 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1855 		return err;
1856 	}
1857 	state->split.avail_index = attr.used_index;
1858 	return 0;
1859 }
1860 
1861 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1862 {
1863 	return PAGE_SIZE;
1864 }
1865 
1866 enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
1867 	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
1868 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
1869 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
1870 };
1871 
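/* Translate the device feature bits reported through the
 * device_features_bits_mask capability (see the enum above and
 * get_supported_features() below) into their VIRTIO_NET_F_*
 * counterparts.
 */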
1872 static u64 mlx_to_virtio_features(u16 dev_features)
1873 {
1874 	u64 result = 0;
1875 
1876 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1877 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1878 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1879 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1880 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1881 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1882 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1883 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1884 
1885 	return result;
1886 }
1887 
1888 static u64 get_supported_features(struct mlx5_core_dev *mdev)
1889 {
1890 	u64 mlx_vdpa_features = 0;
1891 	u16 dev_features;
1892 
1893 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
1894 	mlx_vdpa_features |= mlx_to_virtio_features(dev_features);
1895 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
1896 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1897 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1898 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
1899 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
1900 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
1901 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
1902 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
1903 
1904 	return mlx_vdpa_features;
1905 }
1906 
1907 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
1908 {
1909 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1910 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1911 
1912 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1913 	return ndev->mvdev.mlx_features;
1914 }
1915 
1916 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1917 {
1918 	/* Minimum features to expect */
1919 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1920 		return -EOPNOTSUPP;
1921 
1922 	/* Double-check the feature combination sent down by the driver.
1923 	 * Fail invalid feature sets in which a required dependency is absent.
1924 	 *
1925 	 * Per the VIRTIO v1.1 specification, section 5.1.3.1 "Feature bit
1926 	 * requirements": "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
1927 	 * By failing invalid feature sets sent down by untrusted drivers,
1928 	 * we ensure that the assumptions made by is_index_valid() and
1929 	 * is_ctrl_vq_idx() are not violated.
1930 	 */
1931 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
1932 	    BIT_ULL(VIRTIO_NET_F_MQ))
1933 		return -EINVAL;
1934 
1935 	return 0;
1936 }
1937 
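/* Create the hardware objects backing all data virtqueues. If CTRL_VQ
 * was negotiated, also initialize the software control VQ: vringh
 * accesses the guest rings through the iotlb, at the addresses
 * previously recorded by mlx5_vdpa_set_vq_address().
 */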
1938 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
1939 {
1940 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1941 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1942 	int err;
1943 	int i;
1944 
1945 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) {
1946 		err = setup_vq(ndev, &ndev->vqs[i]);
1947 		if (err)
1948 			goto err_vq;
1949 	}
1950 
1951 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
1952 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
1953 					MLX5_CVQ_MAX_ENT, false,
1954 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
1955 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
1956 					(struct vring_used *)(uintptr_t)cvq->device_addr);
1957 		if (err)
1958 			goto err_vq;
1959 	}
1960 
1961 	return 0;
1962 
1963 err_vq:
1964 	for (--i; i >= 0; i--)
1965 		teardown_vq(ndev, &ndev->vqs[i]);
1966 
1967 	return err;
1968 }
1969 
1970 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
1971 {
1972 	struct mlx5_vdpa_virtqueue *mvq;
1973 	int i;
1974 
1975 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
1976 		mvq = &ndev->vqs[i];
1977 		if (!mvq->initialized)
1978 			continue;
1979 
1980 		teardown_vq(ndev, mvq);
1981 	}
1982 }
1983 
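/* Recompute the highest valid virtqueue index for the negotiated
 * feature set. Illustrative layout:
 *
 *   negotiated features   data VQ indices    CVQ index   max_idx
 *   CTRL_VQ + MQ          0..max_vqs - 1     max_vqs     max_vqs
 *   CTRL_VQ only          0, 1               2           2
 *   neither               0, 1               none        1
 */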
1984 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
1985 {
1986 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
1987 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
1988 			/* MQ supported. CVQ index is right above the last data virtqueue's */
1989 			mvdev->max_idx = mvdev->max_vqs;
1990 		} else {
1991 			/* Only CVQ supported. Data virtqueues occupy indices 0 and 1;
1992 			 * the CVQ gets index 2.
1993 			 */
1994 			mvdev->max_idx = 2;
1995 		}
1996 	} else {
1997 		/* Two data virtqueues only: one for rx and one for tx */
1998 		mvdev->max_idx = 1;
1999 	}
2000 }
2001 
2002 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2003 {
2004 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2005 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2006 	int err;
2007 
2008 	print_features(mvdev, features, true);
2009 
2010 	err = verify_driver_features(mvdev, features);
2011 	if (err)
2012 		return err;
2013 
2014 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2015 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2016 		ndev->cur_num_vqs = 2 * mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2017 	else
2018 		ndev->cur_num_vqs = 2;
2019 
2020 	update_cvq_info(mvdev);
2021 	return err;
2022 }
2023 
2024 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2025 {
2026 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2027 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2028 
2029 	ndev->config_cb = *cb;
2030 }
2031 
2032 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2033 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2034 {
2035 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2036 }
2037 
2038 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2039 {
2040 	return VIRTIO_ID_NET;
2041 }
2042 
2043 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2044 {
2045 	return PCI_VENDOR_ID_MELLANOX;
2046 }
2047 
2048 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2049 {
2050 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2051 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2052 
2053 	print_status(mvdev, ndev->mvdev.status, false);
2054 	return ndev->mvdev.status;
2055 }
2056 
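/* Snapshot a virtqueue's state into its restore_info so that an
 * identical queue can be recreated later; used by
 * mlx5_vdpa_change_map() when the MR must be rebuilt.
 */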
2057 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2058 {
2059 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2060 	struct mlx5_virtq_attr attr = {};
2061 	int err;
2062 
2063 	if (mvq->initialized) {
2064 		err = query_virtqueue(ndev, mvq, &attr);
2065 		if (err)
2066 			return err;
2067 	}
2068 
2069 	ri->avail_index = attr.available_index;
2070 	ri->used_index = attr.used_index;
2071 	ri->ready = mvq->ready;
2072 	ri->num_ent = mvq->num_ent;
2073 	ri->desc_addr = mvq->desc_addr;
2074 	ri->device_addr = mvq->device_addr;
2075 	ri->driver_addr = mvq->driver_addr;
2076 	ri->restore = true;
2077 	return 0;
2078 }
2079 
2080 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2081 {
2082 	int i;
2083 
2084 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2085 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2086 		save_channel_info(ndev, &ndev->vqs[i]);
2087 	}
2088 	return 0;
2089 }
2090 
2091 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2092 {
2093 	int i;
2094 
2095 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2096 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2097 }
2098 
2099 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2100 {
2101 	struct mlx5_vdpa_virtqueue *mvq;
2102 	struct mlx5_vq_restore_info *ri;
2103 	int i;
2104 
2105 	mlx5_clear_vqs(ndev);
2106 	init_mvqs(ndev);
2107 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2108 		mvq = &ndev->vqs[i];
2109 		ri = &mvq->ri;
2110 		if (!ri->restore)
2111 			continue;
2112 
2113 		mvq->avail_idx = ri->avail_index;
2114 		mvq->used_idx = ri->used_index;
2115 		mvq->ready = ri->ready;
2116 		mvq->num_ent = ri->num_ent;
2117 		mvq->desc_addr = ri->desc_addr;
2118 		mvq->device_addr = ri->device_addr;
2119 		mvq->driver_addr = ri->driver_addr;
2120 	}
2121 }
2122 
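/* Handle a change of the guest memory map: suspend and snapshot the
 * virtqueues, tear the datapath down, rebuild the MR from the new
 * iotlb and, if the device is already DRIVER_OK, restore the saved
 * queue state and bring the datapath back up.
 */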
2123 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
2124 {
2125 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2126 	int err;
2127 
2128 	suspend_vqs(ndev);
2129 	err = save_channels_info(ndev);
2130 	if (err)
2131 		goto err_mr;
2132 
2133 	teardown_driver(ndev);
2134 	mlx5_vdpa_destroy_mr(mvdev);
2135 	err = mlx5_vdpa_create_mr(mvdev, iotlb);
2136 	if (err)
2137 		goto err_mr;
2138 
2139 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2140 		return 0;
2141 
2142 	restore_channels_info(ndev);
2143 	err = setup_driver(mvdev);
2144 	if (err)
2145 		goto err_setup;
2146 
2147 	return 0;
2148 
2149 err_setup:
2150 	mlx5_vdpa_destroy_mr(mvdev);
2151 err_mr:
2152 	return err;
2153 }
2154 
2155 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2156 {
2157 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2158 	int err;
2159 
2160 	mutex_lock(&ndev->reslock);
2161 	if (ndev->setup) {
2162 		mlx5_vdpa_warn(mvdev, "setup_driver() called for an already set up driver\n");
2163 		err = 0;
2164 		goto out;
2165 	}
2166 	err = setup_virtqueues(mvdev);
2167 	if (err) {
2168 		mlx5_vdpa_warn(mvdev, "setup_virtqueues failed\n");
2169 		goto out;
2170 	}
2171 
2172 	err = create_rqt(ndev);
2173 	if (err) {
2174 		mlx5_vdpa_warn(mvdev, "create_rqt failed\n");
2175 		goto err_rqt;
2176 	}
2177 
2178 	err = create_tir(ndev);
2179 	if (err) {
2180 		mlx5_vdpa_warn(mvdev, "create_tir failed\n");
2181 		goto err_tir;
2182 	}
2183 
2184 	err = add_fwd_to_tir(ndev);
2185 	if (err) {
2186 		mlx5_vdpa_warn(mvdev, "add_fwd_to_tir failed\n");
2187 		goto err_fwd;
2188 	}
2189 	ndev->setup = true;
2190 	mutex_unlock(&ndev->reslock);
2191 
2192 	return 0;
2193 
2194 err_fwd:
2195 	destroy_tir(ndev);
2196 err_tir:
2197 	destroy_rqt(ndev);
2198 err_rqt:
2199 	teardown_virtqueues(ndev);
2200 out:
2201 	mutex_unlock(&ndev->reslock);
2202 	return err;
2203 }
2204 
2205 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2206 {
2207 	mutex_lock(&ndev->reslock);
2208 	if (!ndev->setup)
2209 		goto out;
2210 
2211 	remove_fwd_to_tir(ndev);
2212 	destroy_tir(ndev);
2213 	destroy_rqt(ndev);
2214 	teardown_virtqueues(ndev);
2215 	ndev->setup = false;
2216 out:
2217 	mutex_unlock(&ndev->reslock);
2218 }
2219 
2220 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2221 {
2222 	int i;
2223 
2224 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2225 		ndev->vqs[i].ready = false;
2226 
2227 	ndev->mvdev.cvq.ready = false;
2228 }
2229 
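/* Only the DRIVER_OK transition is acted upon here: setting it brings
 * up the datapath through setup_driver(), while clearing it without a
 * proper reset is unexpected and only triggers a warning (the status
 * is left unchanged). All other status bits are merely recorded.
 */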
2230 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2231 {
2232 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2233 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2234 	int err;
2235 
2236 	print_status(mvdev, status, true);
2237 
2238 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2239 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2240 			err = setup_driver(mvdev);
2241 			if (err) {
2242 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2243 				goto err_setup;
2244 			}
2245 		} else {
2246 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2247 			return;
2248 		}
2249 	}
2250 
2251 	ndev->mvdev.status = status;
2252 	return;
2253 
2254 err_setup:
2255 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2256 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2257 }
2258 
2259 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2260 {
2261 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2262 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2263 
2264 	print_status(mvdev, 0, true);
2265 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2266 	teardown_driver(ndev);
2267 	clear_vqs_ready(ndev);
2268 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2269 	ndev->mvdev.status = 0;
2270 	ndev->cur_num_vqs = 0;
2271 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
2272 	ndev->mvdev.actual_features = 0;
2273 	++mvdev->generation;
2274 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2275 		if (mlx5_vdpa_create_mr(mvdev, NULL))
2276 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2277 	}
2278 
2279 	return 0;
2280 }
2281 
2282 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
2283 {
2284 	return sizeof(struct virtio_net_config);
2285 }
2286 
2287 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2288 				 unsigned int len)
2289 {
2290 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2291 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2292 
2293 	if (offset + len <= sizeof(struct virtio_net_config))
2294 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2295 }
2296 
2297 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
2298 				 unsigned int len)
2299 {
2300 	/* not supported */
2301 }
2302 
2303 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2304 {
2305 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2306 
2307 	return mvdev->generation;
2308 }
2309 
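/* Entry point for IOTLB updates from the vdpa core.
 * mlx5_vdpa_handle_set_map(), in the common mlx5_vdpa code, applies
 * the update and reports whether the map changed in a way that
 * requires the full mlx5_vdpa_change_map() sequence.
 */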
2310 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
2311 {
2312 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2313 	bool change_map;
2314 	int err;
2315 
2316 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
2317 	if (err) {
2318 		mlx5_vdpa_warn(mvdev, "set map failed (%d)\n", err);
2319 		return err;
2320 	}
2321 
2322 	if (change_map)
2323 		return mlx5_vdpa_change_map(mvdev, iotlb);
2324 
2325 	return 0;
2326 }
2327 
2328 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2329 {
2330 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2331 	struct mlx5_core_dev *pfmdev;
2332 	struct mlx5_vdpa_net *ndev;
2333 
2334 	ndev = to_mlx5_vdpa_ndev(mvdev);
2335 
2336 	free_resources(ndev);
2337 	mlx5_vdpa_destroy_mr(mvdev);
2338 	if (!is_zero_ether_addr(ndev->config.mac)) {
2339 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2340 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2341 	}
2342 	mlx5_vdpa_free_resources(&ndev->mvdev);
2343 	mutex_destroy(&ndev->reslock);
2344 	kfree(ndev->event_cbs);
2345 	kfree(ndev->vqs);
2346 }
2347 
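/* Expose the doorbell page so the virtqueue can be kicked by a direct
 * write from userspace or the guest. The minimal SF BAR size is
 * presumably 2^(log_min_sf_size + 12) bytes, so e.g. with 64K pages
 * (PAGE_SHIFT == 16) direct notification is only offered when every
 * SF BAR covers at least one whole page.
 */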
2348 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2349 {
2350 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2351 	struct vdpa_notification_area ret = {};
2352 	struct mlx5_vdpa_net *ndev;
2353 	phys_addr_t addr;
2354 
2355 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2356 		return ret;
2357 
2358 	/* If the SF BAR size is smaller than PAGE_SIZE, do not use direct
2359 	 * notification: this avoids the risk of mapping pages that contain
2360 	 * the BARs of more than one SF.
2361 	 */
2362 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2363 		return ret;
2364 
2365 	ndev = to_mlx5_vdpa_ndev(mvdev);
2366 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2367 	ret.addr = addr;
2368 	ret.size = PAGE_SIZE;
2369 	return ret;
2370 }
2371 
2372 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
2373 {
2374 	return -EOPNOTSUPP;
2375 }
2376 
2377 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
2378 {
2379 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2380 
2381 	return mvdev->actual_features;
2382 }
2383 
2384 static const struct vdpa_config_ops mlx5_vdpa_ops = {
2385 	.set_vq_address = mlx5_vdpa_set_vq_address,
2386 	.set_vq_num = mlx5_vdpa_set_vq_num,
2387 	.kick_vq = mlx5_vdpa_kick_vq,
2388 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
2389 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
2390 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
2391 	.set_vq_state = mlx5_vdpa_set_vq_state,
2392 	.get_vq_state = mlx5_vdpa_get_vq_state,
2393 	.get_vq_notification = mlx5_get_vq_notification,
2394 	.get_vq_irq = mlx5_get_vq_irq,
2395 	.get_vq_align = mlx5_vdpa_get_vq_align,
2396 	.get_device_features = mlx5_vdpa_get_device_features,
2397 	.set_driver_features = mlx5_vdpa_set_driver_features,
2398 	.get_driver_features = mlx5_vdpa_get_driver_features,
2399 	.set_config_cb = mlx5_vdpa_set_config_cb,
2400 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
2401 	.get_device_id = mlx5_vdpa_get_device_id,
2402 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
2403 	.get_status = mlx5_vdpa_get_status,
2404 	.set_status = mlx5_vdpa_set_status,
2405 	.reset = mlx5_vdpa_reset,
2406 	.get_config_size = mlx5_vdpa_get_config_size,
2407 	.get_config = mlx5_vdpa_get_config,
2408 	.set_config = mlx5_vdpa_set_config,
2409 	.get_generation = mlx5_vdpa_get_generation,
2410 	.set_map = mlx5_vdpa_set_map,
2411 	.free = mlx5_vdpa_free,
2412 };
2413 
2414 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
2415 {
2416 	u16 hw_mtu;
2417 	int err;
2418 
2419 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
2420 	if (err)
2421 		return err;
2422 
2423 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
2424 	return 0;
2425 }
2426 
2427 static int alloc_resources(struct mlx5_vdpa_net *ndev)
2428 {
2429 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2430 	int err;
2431 
2432 	if (res->valid) {
2433 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
2434 		return -EEXIST;
2435 	}
2436 
2437 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
2438 	if (err)
2439 		return err;
2440 
2441 	err = create_tis(ndev);
2442 	if (err)
2443 		goto err_tis;
2444 
2445 	res->valid = true;
2446 
2447 	return 0;
2448 
2449 err_tis:
2450 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2451 	return err;
2452 }
2453 
2454 static void free_resources(struct mlx5_vdpa_net *ndev)
2455 {
2456 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2457 
2458 	if (!res->valid)
2459 		return;
2460 
2461 	destroy_tis(ndev);
2462 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2463 	res->valid = false;
2464 }
2465 
2466 static void init_mvqs(struct mlx5_vdpa_net *ndev)
2467 {
2468 	struct mlx5_vdpa_virtqueue *mvq;
2469 	int i;
2470 
2471 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
2472 		mvq = &ndev->vqs[i];
2473 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2474 		mvq->index = i;
2475 		mvq->ndev = ndev;
2476 		mvq->fwqp.fw = true;
2477 	}
2478 	for (; i < ndev->mvdev.max_vqs; i++) {
2479 		mvq = &ndev->vqs[i];
2480 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2481 		mvq->index = i;
2482 		mvq->ndev = ndev;
2483 	}
2484 }
2485 
2486 struct mlx5_vdpa_mgmtdev {
2487 	struct vdpa_mgmt_dev mgtdev;
2488 	struct mlx5_adev *madev;
2489 	struct mlx5_vdpa_net *ndev;
2490 };
2491 
2492 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2493 {
2494 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2495 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2496 	int err;
2497 
2498 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2499 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2500 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2501 	if (vport)
2502 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2503 
2504 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2505 	if (err)
2506 		return 0;
2507 
2508 	return MLX5_GET(query_vport_state_out, out, state);
2509 }
2510 
2511 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2512 {
2513 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2514 	    VPORT_STATE_UP)
2515 		return true;
2516 
2517 	return false;
2518 }
2519 
2520 static void update_carrier(struct work_struct *work)
2521 {
2522 	struct mlx5_vdpa_wq_ent *wqent;
2523 	struct mlx5_vdpa_dev *mvdev;
2524 	struct mlx5_vdpa_net *ndev;
2525 
2526 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2527 	mvdev = wqent->mvdev;
2528 	ndev = to_mlx5_vdpa_ndev(mvdev);
2529 	if (get_link_state(mvdev))
2530 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2531 	else
2532 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2533 
2534 	if (ndev->config_cb.callback)
2535 		ndev->config_cb.callback(ndev->config_cb.private);
2536 
2537 	kfree(wqent);
2538 }
2539 
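/* mlx5 core event notifier feeding carrier updates. The handler
 * presumably runs in atomic (EQ interrupt) context, hence the
 * GFP_ATOMIC allocation and the deferral of the actual carrier update
 * to the workqueue.
 */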
2540 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2541 {
2542 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2543 	struct mlx5_eqe *eqe = param;
2544 	int ret = NOTIFY_DONE;
2545 	struct mlx5_vdpa_wq_ent *wqent;
2546 
2547 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2548 		switch (eqe->sub_type) {
2549 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2550 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2551 			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2552 			if (!wqent)
2553 				return NOTIFY_DONE;
2554 
2555 			wqent->mvdev = &ndev->mvdev;
2556 			INIT_WORK(&wqent->work, update_carrier);
2557 			queue_work(ndev->mvdev.wq, &wqent->work);
2558 			ret = NOTIFY_OK;
2559 			break;
2560 		default:
2561 			return NOTIFY_DONE;
2562 		}
2563 		return ret;
2564 	}
2565 	return ret;
2566 }
2567 
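/* Management device .dev_add() callback: validate that the device
 * supports split virtqueues and at least one VQ pair, allocate the
 * net device together with its queues and callbacks, fill the config
 * space (MTU, MAC, link status), set up device resources, and finally
 * register the vdpa device with 2 * max_qps + 1 virtqueues, the extra
 * one being the control VQ.
 */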
2568 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
2569 			     const struct vdpa_dev_set_config *add_config)
2570 {
2571 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2572 	struct virtio_net_config *config;
2573 	struct mlx5_core_dev *pfmdev;
2574 	struct mlx5_vdpa_dev *mvdev;
2575 	struct mlx5_vdpa_net *ndev;
2576 	struct mlx5_core_dev *mdev;
2577 	u32 max_vqs;
2578 	u16 mtu;
2579 	int err;
2580 
2581 	if (mgtdev->ndev)
2582 		return -ENOSPC;
2583 
2584 	mdev = mgtdev->madev->mdev;
2585 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
2586 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
2587 		dev_warn(mdev->device, "missing support for split virtqueues\n");
2588 		return -EOPNOTSUPP;
2589 	}
2590 
2591 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
2592 	if (max_vqs < 2) {
2593 		dev_warn(mdev->device,
2594 			 "%u virtqueues are supported; at least 2 are required\n",
2595 			 max_vqs);
2596 		return -EAGAIN;
2597 	}
2598 
2599 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
2600 		if (add_config->net.max_vq_pairs > max_vqs / 2)
2601 			return -EINVAL;
2602 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
2603 	} else {
2604 		max_vqs = 2;
2605 	}
2606 
2607 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2608 				 name, false);
2609 	if (IS_ERR(ndev))
2610 		return PTR_ERR(ndev);
2611 
2612 	ndev->mvdev.mlx_features = mgtdev->mgtdev.supported_features;
2613 	ndev->mvdev.max_vqs = max_vqs;
2614 	mvdev = &ndev->mvdev;
2615 	mvdev->mdev = mdev;
2616 
2617 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
2618 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
2619 	if (!ndev->vqs || !ndev->event_cbs) {
2620 		err = -ENOMEM;
2621 		goto err_alloc;
2622 	}
2623 
2624 	init_mvqs(ndev);
2625 	mutex_init(&ndev->reslock);
2626 	config = &ndev->config;
2627 	err = query_mtu(mdev, &mtu);
2628 	if (err)
2629 		goto err_mtu;
2630 
2631 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
2632 
2633 	if (get_link_state(mvdev))
2634 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2635 	else
2636 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2637 
2638 	if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
2639 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
2640 	} else {
2641 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2642 		if (err)
2643 			goto err_mtu;
2644 	}
2645 
2646 	if (!is_zero_ether_addr(config->mac)) {
2647 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2648 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2649 		if (err)
2650 			goto err_mtu;
2651 
2652 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2653 	}
2654 
2655 	config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs));
2656 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
2657 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2658 	if (err)
2659 		goto err_mpfs;
2660 
2661 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2662 		err = mlx5_vdpa_create_mr(mvdev, NULL);
2663 		if (err)
2664 			goto err_res;
2665 	}
2666 
2667 	err = alloc_resources(ndev);
2668 	if (err)
2669 		goto err_mr;
2670 
2671 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
2672 	if (!mvdev->wq) {
2673 		err = -ENOMEM;
2674 		goto err_res2;
2675 	}
2676 
2677 	ndev->nb.notifier_call = event_handler;
2678 	mlx5_notifier_register(mdev, &ndev->nb);
2679 	mvdev->vdev.mdev = &mgtdev->mgtdev;
2680 	err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs) + 1);
2681 	if (err)
2682 		goto err_reg;
2683 
2684 	mgtdev->ndev = ndev;
2685 	return 0;
2686 
2687 err_reg:
2688 	destroy_workqueue(mvdev->wq);
2689 err_res2:
2690 	free_resources(ndev);
2691 err_mr:
2692 	mlx5_vdpa_destroy_mr(mvdev);
2693 err_res:
2694 	mlx5_vdpa_free_resources(&ndev->mvdev);
2695 err_mpfs:
2696 	if (!is_zero_ether_addr(config->mac))
2697 		mlx5_mpfs_del_mac(pfmdev, config->mac);
2698 err_mtu:
2699 	mutex_destroy(&ndev->reslock);
2700 err_alloc:
2701 	put_device(&mvdev->vdev.dev);
2702 	return err;
2703 }
2704 
2705 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
2706 {
2707 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2708 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
2709 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2710 
2711 	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
2712 	destroy_workqueue(mvdev->wq);
2713 	_vdpa_unregister_device(dev);
2714 	mgtdev->ndev = NULL;
2715 }
2716 
2717 static const struct vdpa_mgmtdev_ops mdev_ops = {
2718 	.dev_add = mlx5_vdpa_dev_add,
2719 	.dev_del = mlx5_vdpa_dev_del,
2720 };
2721 
2722 static struct virtio_device_id id_table[] = {
2723 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2724 	{ 0 },
2725 };
2726 
2727 static int mlx5v_probe(struct auxiliary_device *adev,
2728 		       const struct auxiliary_device_id *id)
2730 {
2731 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
2732 	struct mlx5_core_dev *mdev = madev->mdev;
2733 	struct mlx5_vdpa_mgmtdev *mgtdev;
2734 	int err;
2735 
2736 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
2737 	if (!mgtdev)
2738 		return -ENOMEM;
2739 
2740 	mgtdev->mgtdev.ops = &mdev_ops;
2741 	mgtdev->mgtdev.device = mdev->device;
2742 	mgtdev->mgtdev.id_table = id_table;
2743 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
2744 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP);
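	/* data virtqueues reported by the device, plus one for the control VQ */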
2745 	mgtdev->mgtdev.max_supported_vqs =
2746 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
2747 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
2748 	mgtdev->madev = madev;
2749 
2750 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
2751 	if (err)
2752 		goto reg_err;
2753 
2754 	auxiliary_set_drvdata(adev, mgtdev);
2755 
2756 	return 0;
2757 
2758 reg_err:
2759 	kfree(mgtdev);
2760 	return err;
2761 }
2762 
2763 static void mlx5v_remove(struct auxiliary_device *adev)
2764 {
2765 	struct mlx5_vdpa_mgmtdev *mgtdev;
2766 
2767 	mgtdev = auxiliary_get_drvdata(adev);
2768 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
2769 	kfree(mgtdev);
2770 }
2771 
2772 static const struct auxiliary_device_id mlx5v_id_table[] = {
2773 	{ .name = MLX5_ADEV_NAME ".vnet", },
2774 	{},
2775 };
2776 
2777 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2778 
2779 static struct auxiliary_driver mlx5v_driver = {
2780 	.name = "vnet",
2781 	.probe = mlx5v_probe,
2782 	.remove = mlx5v_remove,
2783 	.id_table = mlx5v_id_table,
2784 };
2785 
2786 module_auxiliary_driver(mlx5v_driver);
2787