xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision 2d091155)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <linux/virtio_config.h>
11 #include <linux/auxiliary_bus.h>
12 #include <linux/mlx5/cq.h>
13 #include <linux/mlx5/qp.h>
14 #include <linux/mlx5/device.h>
15 #include <linux/mlx5/driver.h>
16 #include <linux/mlx5/vport.h>
17 #include <linux/mlx5/fs.h>
18 #include <linux/mlx5/mlx5_ifc_vdpa.h>
19 #include <linux/mlx5/mpfs.h>
20 #include "mlx5_vdpa.h"
21 
/* Module metadata: author, one-line description and licence string. */
22 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
23 MODULE_DESCRIPTION("Mellanox VDPA driver");
24 MODULE_LICENSE("Dual BSD/GPL");
25 
/* Map a struct mlx5_vdpa_dev pointer to the struct mlx5_vdpa_net that embeds
 * it as its mvdev member.
 */
26 #define to_mlx5_vdpa_ndev(__mvdev)                                             \
27 	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
/* Map a pointer to a vdev member to the struct mlx5_vdpa_dev that embeds it. */
28 #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
29 
/* Union of every virtio feature bit this file knows by name. print_features()
 * warns about any bit that falls outside this mask.
 */
30 #define VALID_FEATURES_MASK                                                                        \
31 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
32 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
33 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
34 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
35 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
36 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
37 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
38 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
39 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
40 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
41 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
42 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
43 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
44 
/* Union of the device status bits this file knows by name. print_status()
 * warns about any bit that falls outside this mask.
 */
45 #define VALID_STATUS_MASK                                                                          \
46 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
47 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
48 
/* Evaluates to 1 when bit number _feature is set in _mvdev->actual_features,
 * otherwise 0.
 */
49 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
50 
/* Object numbers of the device-wide network resources; embedded in
 * struct mlx5_vdpa_net as the res member.
 */
51 struct mlx5_vdpa_net_resources {
52 	u32 tisn; /* TIS number: written by create_tis(), programmed into TX virtqueues */
53 	u32 tdn; /* transport domain the TIS is created in */
54 	u32 tirn; /* presumably the TIR number - not used in this part of the file */
55 	u32 rqtn; /* presumably the RQ table number - not used in this part of the file */
56 	bool valid; /* NOTE(review): looks like "resources are allocated" - confirm at the users */
57 };
58 
/* Memory backing a completion queue; filled in by cq_frag_buf_alloc(). */
59 struct mlx5_vdpa_cq_buf {
60 	struct mlx5_frag_buf_ctrl fbc; /* set up by mlx5_init_fbc(); used by get_cqe() to locate entries */
61 	struct mlx5_frag_buf frag_buf; /* the fragmented buffer holding the CQEs */
62 	int cqe_size; /* set to MLX5_VDPA_CQE_SIZE */
63 	int nent; /* number of CQEs the buffer was allocated for */
64 };
65 
/* Completion queue of one virtqueue. */
66 struct mlx5_vdpa_cq {
67 	struct mlx5_core_cq mcq; /* core CQ object; cq_create() installs mlx5_vdpa_cq_comp() as its handler */
68 	struct mlx5_vdpa_cq_buf buf; /* CQE storage */
69 	struct mlx5_db db; /* doorbell record: db[0] is used as set_ci_db, db[1] as arm_db */
70 	int cqe; /* number of entries; get_sw_cqe() uses cqe - 1 as the index mask */
71 };
72 
/* One of the three umem buffers handed to the device for a virtqueue. */
73 struct mlx5_vdpa_umem {
74 	struct mlx5_frag_buf_ctrl fbc; /* not referenced in this part of the file */
75 	struct mlx5_frag_buf frag_buf; /* buffer allocated by umem_frag_buf_alloc() */
76 	int size; /* size computed by set_umem_size() from the device capabilities */
77 	u32 id; /* umem_id returned by the CREATE_UMEM command */
78 };
79 
/* One end of the QP pair that carries notifications for a virtqueue. */
80 struct mlx5_vdpa_qp {
81 	struct mlx5_core_qp mqp; /* holds the qpn and uid recorded by qp_create() */
82 	struct mlx5_frag_buf frag_buf; /* receive queue buffer; only allocated when !fw */
83 	struct mlx5_db db; /* doorbell record; only allocated when !fw */
84 	u16 head; /* running count of posted receive entries, see rx_post() */
85 	bool fw; /* true for the firmware end of the connection */
86 };
87 
/* Copy of the virtqueue parameters, embedded in struct mlx5_vdpa_virtqueue as
 * the ri member. NOTE(review): presumably a snapshot used to re-create a
 * virtqueue later; the code that fills and consumes it is not in this part of
 * the file - confirm there.
 */
88 struct mlx5_vq_restore_info {
89 	u32 num_ent;
90 	u64 desc_addr;
91 	u64 device_addr;
92 	u64 driver_addr;
93 	u16 avail_index;
94 	u16 used_index;
95 	bool ready;
96 	bool restore;
97 };
98 
/* Per-virtqueue state: the ring addresses and size, the CQ/QP pair used for
 * notifications, the three umems, and the values create_virtqueue() programs
 * into the firmware object.
 */
99 struct mlx5_vdpa_virtqueue {
100 	bool ready;
101 	u64 desc_addr;
102 	u64 device_addr;
103 	u64 driver_addr;
104 	u32 num_ent;
105 
106 	/* Resources for implementing the notification channel from the device
107 	 * to the driver. fwqp is the firmware end of an RC connection; the
108 	 * other end is vqqp used by the driver. cq is where completions are
109 	 * reported.
110 	 */
111 	struct mlx5_vdpa_cq cq;
112 	struct mlx5_vdpa_qp fwqp;
113 	struct mlx5_vdpa_qp vqqp;
114 
115 	/* umem resources are required for the virtqueue operation. Their use
116 	 * is internal and they must be provided by the driver.
117 	 */
118 	struct mlx5_vdpa_umem umem1;
119 	struct mlx5_vdpa_umem umem2;
120 	struct mlx5_vdpa_umem umem3;
121 
122 	bool initialized;
123 	int index; /* virtqueue index; odd indices are TX, see vq_is_tx() */
124 	u32 virtq_id; /* object id returned by create_virtqueue() */
125 	struct mlx5_vdpa_net *ndev; /* owning net device */
126 	u16 avail_idx; /* programmed as hw_available_index */
127 	u16 used_idx; /* programmed as hw_used_index */
128 	int fw_state;
129 
130 	/* keep last in the struct */
131 	struct mlx5_vq_restore_info ri;
132 };
133 
134 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
135 {
136 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
137 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
138 			return idx < 2;
139 		else
140 			return idx < 3;
141 	}
142 
143 	return idx <= mvdev->max_idx;
144 }
145 
/* Driver state of one mlx5 vDPA net device. It embeds the generic
 * struct mlx5_vdpa_dev as mvdev (see to_mlx5_vdpa_ndev()).
 */
146 struct mlx5_vdpa_net {
147 	struct mlx5_vdpa_dev mvdev;
148 	struct mlx5_vdpa_net_resources res;
149 	struct virtio_net_config config;
150 	struct mlx5_vdpa_virtqueue *vqs; /* array indexed by virtqueue index */
151 	struct vdpa_callback *event_cbs; /* per-virtqueue callbacks, indexed by virtqueue index */
152 
153 	/* Serialize vq resources creation and destruction. This is required
154 	 * since memory map might change and we need to destroy and create
155 	 * resources while the driver is operational.
156 	 */
157 	struct mutex reslock;
158 	struct mlx5_flow_table *rxft;
159 	struct mlx5_fc *rx_counter;
160 	struct mlx5_flow_handle *rx_rule_ucast;
161 	struct mlx5_flow_handle *rx_rule_mcast;
162 	bool setup;
163 	u32 cur_num_vqs;
164 	struct notifier_block nb;
165 	struct vdpa_callback config_cb;
166 };
167 
/* Forward declarations of functions that are defined later in the file. */
168 static void free_resources(struct mlx5_vdpa_net *ndev);
169 static void init_mvqs(struct mlx5_vdpa_net *ndev);
170 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
171 static void teardown_driver(struct mlx5_vdpa_net *ndev);
172 
/* When false (the default for a static), print_status() and print_features()
 * only emit their warnings and skip the per-bit log lines.
 */
173 static bool mlx5_vdpa_debug;
174 
/* NOTE(review): presumably the maximum number of control virtqueue entries;
 * not referenced in this part of the file - confirm at the users.
 */
175 #define MLX5_CVQ_MAX_ENT 16
176 
/* Log the name of feature bit _feature when it is set. Expands to code that
 * uses variables named "features" and "mvdev" from the calling scope.
 */
177 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
178 	do {                                                                                       \
179 		if (features & BIT_ULL(_feature))                                                  \
180 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
181 	} while (0)
182 
/* Log the name of status flag _status when it is set. Expands to code that
 * uses variables named "status" and "mvdev" from the calling scope.
 */
183 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
184 	do {                                                                                       \
185 		if (status & (_status))                                                            \
186 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
187 	} while (0)
188 
189 /* TODO: cross-endian support */
190 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
191 {
192 	return virtio_legacy_is_little_endian() ||
193 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
194 }
195 
196 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
197 {
198 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
199 }
200 
201 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
202 {
203 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
204 }
205 
206 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
207 {
208 	return max_vqs / 2;
209 }
210 
211 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
212 {
213 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
214 		return 2;
215 
216 	return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs);
217 }
218 
219 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
220 {
221 	return idx == ctrl_vq_idx(mvdev);
222 }
223 
224 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
225 {
226 	if (status & ~VALID_STATUS_MASK)
227 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
228 			       status & ~VALID_STATUS_MASK);
229 
230 	if (!mlx5_vdpa_debug)
231 		return;
232 
233 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
234 	if (set && !status) {
235 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
236 		return;
237 	}
238 
239 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
240 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
241 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
242 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
243 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
244 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
245 }
246 
247 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
248 {
249 	if (features & ~VALID_FEATURES_MASK)
250 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
251 			       features & ~VALID_FEATURES_MASK);
252 
253 	if (!mlx5_vdpa_debug)
254 		return;
255 
256 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
257 	if (!features)
258 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
259 
260 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
278 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
279 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
280 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
281 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
282 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
283 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
284 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
285 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
286 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
287 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
288 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
289 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
290 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
291 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
292 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
293 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
294 }
295 
296 static int create_tis(struct mlx5_vdpa_net *ndev)
297 {
298 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
299 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
300 	void *tisc;
301 	int err;
302 
303 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
304 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
305 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
306 	if (err)
307 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
308 
309 	return err;
310 }
311 
312 static void destroy_tis(struct mlx5_vdpa_net *ndev)
313 {
314 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
315 }
316 
/* Size in bytes of one completion queue entry, and its base-2 logarithm. */
317 #define MLX5_VDPA_CQE_SIZE 64
318 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
319 
320 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
321 {
322 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
323 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
324 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
325 	int err;
326 
327 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
328 				       ndev->mvdev.mdev->priv.numa_node);
329 	if (err)
330 		return err;
331 
332 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
333 
334 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
335 	buf->nent = nent;
336 
337 	return 0;
338 }
339 
340 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
341 {
342 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
343 
344 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
345 					ndev->mvdev.mdev->priv.numa_node);
346 }
347 
348 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
349 {
350 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
351 }
352 
353 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
354 {
355 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
356 }
357 
358 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
359 {
360 	struct mlx5_cqe64 *cqe64;
361 	void *cqe;
362 	int i;
363 
364 	for (i = 0; i < buf->nent; i++) {
365 		cqe = get_cqe(vcq, i);
366 		cqe64 = cqe;
367 		cqe64->op_own = MLX5_CQE_INVALID << 4;
368 	}
369 }
370 
371 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
372 {
373 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
374 
375 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
376 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
377 		return cqe64;
378 
379 	return NULL;
380 }
381 
382 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
383 {
384 	vqp->head += n;
385 	vqp->db.db[0] = cpu_to_be32(vqp->head);
386 }
387 
388 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
389 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
390 {
391 	struct mlx5_vdpa_qp *vqp;
392 	__be64 *pas;
393 	void *qpc;
394 
395 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
396 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
397 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
398 	if (vqp->fw) {
399 		/* Firmware QP is allocated by the driver for the firmware's
400 		 * use so we can skip part of the params as they will be chosen by firmware
401 		 */
402 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
403 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
404 		MLX5_SET(qpc, qpc, no_sq, 1);
405 		return;
406 	}
407 
408 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
409 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
410 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
411 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
412 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
413 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
414 	MLX5_SET(qpc, qpc, no_sq, 1);
415 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
416 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
417 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
418 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
419 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
420 }
421 
422 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
423 {
424 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
425 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
426 					ndev->mvdev.mdev->priv.numa_node);
427 }
428 
429 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
430 {
431 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
432 }
433 
/* Create one QP of the notification channel of @mvq.
 *
 * When @vqp is not the firmware QP, the function switches to &mvq->vqqp,
 * allocates its receive buffer and doorbell record first, and posts
 * mvq->num_ent receive entries once the QP exists. For the firmware QP only
 * the command is issued.
 *
 * Returns 0 on success. On failure everything allocated here is released and
 * the error is returned.
 */
434 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
435 		     struct mlx5_vdpa_qp *vqp)
436 {
437 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
438 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
439 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
440 	void *qpc;
441 	void *in;
442 	int err;
443 
444 	if (!vqp->fw) {
445 		vqp = &mvq->vqqp;
446 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
447 		if (err)
448 			return err;
449 
450 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
451 		if (err)
452 			goto err_db;
		/* room for one page address per buffer page after the fixed part */
453 		inlen += vqp->frag_buf.npages * sizeof(__be64);
454 	}
455 
456 	in = kzalloc(inlen, GFP_KERNEL);
457 	if (!in) {
458 		err = -ENOMEM;
459 		goto err_kzalloc;
460 	}
461 
462 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
	/* qp_prepare() returns early for the firmware QP, so the fields
	 * common to both ends are set here.
	 */
463 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
464 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
465 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
466 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
467 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
468 	if (!vqp->fw)
469 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
470 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
471 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	/* the command buffer is not needed past this point, success or not */
472 	kfree(in);
473 	if (err)
474 		goto err_kzalloc;
475 
476 	vqp->mqp.uid = ndev->mvdev.res.uid;
477 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
478 
479 	if (!vqp->fw)
480 		rx_post(vqp, mvq->num_ent);
481 
482 	return 0;
483 
	/* Reached both when kzalloc() fails and when the command fails. */
484 err_kzalloc:
485 	if (!vqp->fw)
486 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
487 err_db:
488 	if (!vqp->fw)
489 		rq_buf_free(ndev, vqp);
490 
491 	return err;
492 }
493 
494 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
495 {
496 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
497 
498 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
499 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
500 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
501 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
502 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
503 	if (!vqp->fw) {
504 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
505 		rq_buf_free(ndev, vqp);
506 	}
507 }
508 
509 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
510 {
511 	return get_sw_cqe(cq, cq->mcq.cons_index);
512 }
513 
514 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
515 {
516 	struct mlx5_cqe64 *cqe64;
517 
518 	cqe64 = next_cqe_sw(vcq);
519 	if (!cqe64)
520 		return -EAGAIN;
521 
522 	vcq->mcq.cons_index++;
523 	return 0;
524 }
525 
526 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
527 {
528 	struct mlx5_vdpa_net *ndev = mvq->ndev;
529 	struct vdpa_callback *event_cb;
530 
531 	event_cb = &ndev->event_cbs[mvq->index];
532 	mlx5_cq_set_ci(&mvq->cq.mcq);
533 
534 	/* make sure CQ cosumer update is visible to the hardware before updating
535 	 * RX doorbell record.
536 	 */
537 	dma_wmb();
538 	rx_post(&mvq->vqqp, num);
539 	if (event_cb->callback)
540 		event_cb->callback(event_cb->private);
541 }
542 
543 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
544 {
545 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
546 	struct mlx5_vdpa_net *ndev = mvq->ndev;
547 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
548 	int num = 0;
549 
550 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
551 		num++;
552 		if (num > mvq->num_ent / 2) {
553 			/* If completions keep coming while we poll, we want to
554 			 * let the hardware know that we consumed them by
555 			 * updating the doorbell record.  We also let vdpa core
556 			 * know about this so it passes it on the virtio driver
557 			 * on the guest.
558 			 */
559 			mlx5_vdpa_handle_completions(mvq, num);
560 			num = 0;
561 		}
562 	}
563 
564 	if (num)
565 		mlx5_vdpa_handle_completions(mvq, num);
566 
567 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
568 }
569 
/* Create and arm the completion queue of virtqueue @idx with @num_ent
 * entries.
 *
 * Steps: allocate the doorbell record, allocate the CQE buffer and mark its
 * entries invalid, build the CREATE_CQ command, create the CQ, install
 * mlx5_vdpa_cq_comp() as the completion handler and arm the CQ.
 *
 * Returns 0 on success. On failure everything allocated here is released and
 * the error is returned.
 */
570 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
571 {
572 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
573 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
574 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
	/* NOTE(review): unlike the other command outputs in this file, out is
	 * not zero-initialised; presumably mlx5_core_create_cq() fills it -
	 * confirm.
	 */
575 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
576 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
577 	__be64 *pas;
578 	int inlen;
579 	void *cqc;
580 	void *in;
581 	int err;
582 	int eqn;
583 
584 	err = mlx5_db_alloc(mdev, &vcq->db);
585 	if (err)
586 		return err;
587 
	/* first doorbell word: consumer index; second word: arm */
588 	vcq->mcq.set_ci_db = vcq->db.db;
589 	vcq->mcq.arm_db = vcq->db.db + 1;
	/* same value as MLX5_VDPA_CQE_SIZE */
590 	vcq->mcq.cqe_sz = 64;
591 
592 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
593 	if (err)
594 		goto err_db;
595 
596 	cq_frag_buf_init(vcq, &vcq->buf);
597 
	/* fixed command part followed by one page address per buffer page */
598 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
599 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
600 	in = kzalloc(inlen, GFP_KERNEL);
601 	if (!in) {
602 		err = -ENOMEM;
		/* despite its name, this label is the kzalloc() failure path */
603 		goto err_vzalloc;
604 	}
605 
606 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
607 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
608 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
609 
610 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
611 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
612 
613 	/* Use vector 0 by default. Consider adding code to choose least used
614 	 * vector.
615 	 */
616 	err = mlx5_vector2eqn(mdev, 0, &eqn);
617 	if (err)
618 		goto err_vec;
619 
	/* cqc already holds this address from the lookup above */
620 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
621 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
622 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
623 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
624 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
625 
626 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
627 	if (err)
628 		goto err_vec;
629 
630 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
631 	vcq->cqe = num_ent;
	/* NOTE(review): same values as assigned before the create call;
	 * presumably redundant - confirm that mlx5_core_create_cq() does not
	 * modify them before dropping either pair.
	 */
632 	vcq->mcq.set_ci_db = vcq->db.db;
633 	vcq->mcq.arm_db = vcq->db.db + 1;
634 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
635 	kfree(in);
636 	return 0;
637 
638 err_vec:
639 	kfree(in);
640 err_vzalloc:
641 	cq_frag_buf_free(ndev, &vcq->buf);
642 err_db:
643 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
644 	return err;
645 }
646 
647 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
648 {
649 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
650 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
651 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
652 
653 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
654 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
655 		return;
656 	}
657 	cq_frag_buf_free(ndev, &vcq->buf);
658 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
659 }
660 
661 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
662 			  struct mlx5_vdpa_umem **umemp)
663 {
664 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
665 	int p_a;
666 	int p_b;
667 
668 	switch (num) {
669 	case 1:
670 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
671 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
672 		*umemp = &mvq->umem1;
673 		break;
674 	case 2:
675 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
676 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
677 		*umemp = &mvq->umem2;
678 		break;
679 	case 3:
680 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
681 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
682 		*umemp = &mvq->umem3;
683 		break;
684 	}
685 	(*umemp)->size = p_a * mvq->num_ent + p_b;
686 }
687 
688 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
689 {
690 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
691 }
692 
693 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
694 {
695 	int inlen;
696 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
697 	void *um;
698 	void *in;
699 	int err;
700 	__be64 *pas;
701 	struct mlx5_vdpa_umem *umem;
702 
703 	set_umem_size(ndev, mvq, num, &umem);
704 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
705 	if (err)
706 		return err;
707 
708 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
709 
710 	in = kzalloc(inlen, GFP_KERNEL);
711 	if (!in) {
712 		err = -ENOMEM;
713 		goto err_in;
714 	}
715 
716 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
717 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
718 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
719 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
720 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
721 
722 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
723 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
724 
725 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
726 	if (err) {
727 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
728 		goto err_cmd;
729 	}
730 
731 	kfree(in);
732 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
733 
734 	return 0;
735 
736 err_cmd:
737 	kfree(in);
738 err_in:
739 	umem_frag_buf_free(ndev, umem);
740 	return err;
741 }
742 
743 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
744 {
745 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
746 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
747 	struct mlx5_vdpa_umem *umem;
748 
749 	switch (num) {
750 	case 1:
751 		umem = &mvq->umem1;
752 		break;
753 	case 2:
754 		umem = &mvq->umem2;
755 		break;
756 	case 3:
757 		umem = &mvq->umem3;
758 		break;
759 	}
760 
761 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
762 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
763 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
764 		return;
765 
766 	umem_frag_buf_free(ndev, umem);
767 }
768 
/* Create umems 1, 2 and 3 of @mvq, in that order.
 *
 * Returns 0 when all three exist. If one fails, the ones already created are
 * destroyed in reverse order and the error is returned.
 */
static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int created = 0;
	int err;

	do {
		err = create_umem(ndev, mvq, created + 1);
		if (err)
			break;
		created++;
	} while (created < 3);

	if (!err)
		return 0;

	while (created > 0) {
		umem_destroy(ndev, mvq, created);
		created--;
	}

	return err;
}
787 
/* Destroy umems 3, 2 and 1 of @mvq, in that order. */
static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int num = 3;

	while (num > 0) {
		umem_destroy(ndev, mvq, num);
		num--;
	}
}
795 
796 static int get_queue_type(struct mlx5_vdpa_net *ndev)
797 {
798 	u32 type_mask;
799 
800 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
801 
802 	/* prefer split queue */
803 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
804 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
805 
806 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
807 
808 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
809 }
810 
811 static bool vq_is_tx(u16 idx)
812 {
813 	return idx % 2;
814 }
815 
816 static u16 get_features_12_3(u64 features)
817 {
818 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
819 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
820 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
821 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
822 }
823 
824 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
825 {
826 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
827 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
828 	void *obj_context;
829 	void *cmd_hdr;
830 	void *vq_ctx;
831 	void *in;
832 	int err;
833 
834 	err = umems_create(ndev, mvq);
835 	if (err)
836 		return err;
837 
838 	in = kzalloc(inlen, GFP_KERNEL);
839 	if (!in) {
840 		err = -ENOMEM;
841 		goto err_alloc;
842 	}
843 
844 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
845 
846 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
847 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
848 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
849 
850 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
851 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
852 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
853 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
854 		 get_features_12_3(ndev->mvdev.actual_features));
855 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
856 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
857 
858 	if (vq_is_tx(mvq->index))
859 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
860 
861 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
862 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
863 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
864 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
865 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
866 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
867 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
868 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
869 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
870 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
871 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
872 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
873 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
874 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
875 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
876 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
877 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
878 
879 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
880 	if (err)
881 		goto err_cmd;
882 
883 	kfree(in);
884 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
885 
886 	return 0;
887 
888 err_cmd:
889 	kfree(in);
890 err_alloc:
891 	umems_destroy(ndev, mvq);
892 	return err;
893 }
894 
895 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
896 {
897 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
898 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
899 
900 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
901 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
902 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
903 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
904 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
905 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
906 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
907 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
908 		return;
909 	}
910 	umems_destroy(ndev, mvq);
911 }
912 
913 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
914 {
915 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
916 }
917 
918 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
919 {
920 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
921 }
922 
923 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
924 			int *outlen, u32 qpn, u32 rqpn)
925 {
926 	void *qpc;
927 	void *pp;
928 
929 	switch (cmd) {
930 	case MLX5_CMD_OP_2RST_QP:
931 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
932 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
933 		*in = kzalloc(*inlen, GFP_KERNEL);
934 		*out = kzalloc(*outlen, GFP_KERNEL);
935 		if (!*in || !*out)
936 			goto outerr;
937 
938 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
939 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
940 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
941 		break;
942 	case MLX5_CMD_OP_RST2INIT_QP:
943 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
944 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
945 		*in = kzalloc(*inlen, GFP_KERNEL);
946 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
947 		if (!*in || !*out)
948 			goto outerr;
949 
950 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
951 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
952 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
953 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
954 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
955 		MLX5_SET(qpc, qpc, rwe, 1);
956 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
957 		MLX5_SET(ads, pp, vhca_port_num, 1);
958 		break;
959 	case MLX5_CMD_OP_INIT2RTR_QP:
960 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
961 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
962 		*in = kzalloc(*inlen, GFP_KERNEL);
963 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
964 		if (!*in || !*out)
965 			goto outerr;
966 
967 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
968 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
969 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
970 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
971 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
972 		MLX5_SET(qpc, qpc, log_msg_max, 30);
973 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
974 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
975 		MLX5_SET(ads, pp, fl, 1);
976 		break;
977 	case MLX5_CMD_OP_RTR2RTS_QP:
978 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
979 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
980 		*in = kzalloc(*inlen, GFP_KERNEL);
981 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
982 		if (!*in || !*out)
983 			goto outerr;
984 
985 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
986 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
987 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
988 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
989 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
990 		MLX5_SET(ads, pp, ack_timeout, 14);
991 		MLX5_SET(qpc, qpc, retry_count, 7);
992 		MLX5_SET(qpc, qpc, rnr_retry, 7);
993 		break;
994 	default:
995 		goto outerr_nullify;
996 	}
997 
998 	return;
999 
1000 outerr:
1001 	kfree(*in);
1002 	kfree(*out);
1003 outerr_nullify:
1004 	*in = NULL;
1005 	*out = NULL;
1006 }
1007 
/* Release a mailbox pair previously handed out by alloc_inout(). */
static void free_inout(void *in, void *out)
{
	kfree(out);
	kfree(in);
}
1013 
1014 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1015  * firmware. The fw argument indicates whether the subjected QP is the one used
1016  * by firmware.
1017  */
1018 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1019 {
1020 	int outlen;
1021 	int inlen;
1022 	void *out;
1023 	void *in;
1024 	int err;
1025 
1026 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1027 	if (!in || !out)
1028 		return -ENOMEM;
1029 
1030 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1031 	free_inout(in, out);
1032 	return err;
1033 }
1034 
1035 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1036 {
1037 	int err;
1038 
1039 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1040 	if (err)
1041 		return err;
1042 
1043 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1044 	if (err)
1045 		return err;
1046 
1047 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1048 	if (err)
1049 		return err;
1050 
1051 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1052 	if (err)
1053 		return err;
1054 
1055 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1056 	if (err)
1057 		return err;
1058 
1059 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1060 	if (err)
1061 		return err;
1062 
1063 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1064 }
1065 
/* Attributes of a virtqueue object as read back by query_virtqueue(). */
struct mlx5_virtq_attr {
	/* "state" field of the queried virtio_net_q object */
	u8 state;
	/* "hw_available_index" field of the queried object */
	u16 available_index;
	/* "hw_used_index" field of the queried object */
	u16 used_index;
};
1071 
1072 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1073 			   struct mlx5_virtq_attr *attr)
1074 {
1075 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1076 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1077 	void *out;
1078 	void *obj_context;
1079 	void *cmd_hdr;
1080 	int err;
1081 
1082 	out = kzalloc(outlen, GFP_KERNEL);
1083 	if (!out)
1084 		return -ENOMEM;
1085 
1086 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1087 
1088 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1089 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1090 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1091 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1092 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1093 	if (err)
1094 		goto err_cmd;
1095 
1096 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1097 	memset(attr, 0, sizeof(*attr));
1098 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1099 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1100 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1101 	kfree(out);
1102 	return 0;
1103 
1104 err_cmd:
1105 	kfree(out);
1106 	return err;
1107 }
1108 
1109 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1110 {
1111 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1112 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1113 	void *obj_context;
1114 	void *cmd_hdr;
1115 	void *in;
1116 	int err;
1117 
1118 	in = kzalloc(inlen, GFP_KERNEL);
1119 	if (!in)
1120 		return -ENOMEM;
1121 
1122 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1123 
1124 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1125 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1126 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1127 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1128 
1129 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1130 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1131 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1132 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1133 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1134 	kfree(in);
1135 	if (!err)
1136 		mvq->fw_state = state;
1137 
1138 	return err;
1139 }
1140 
1141 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1142 {
1143 	u16 idx = mvq->index;
1144 	int err;
1145 
1146 	if (!mvq->num_ent)
1147 		return 0;
1148 
1149 	if (mvq->initialized)
1150 		return 0;
1151 
1152 	err = cq_create(ndev, idx, mvq->num_ent);
1153 	if (err)
1154 		return err;
1155 
1156 	err = qp_create(ndev, mvq, &mvq->fwqp);
1157 	if (err)
1158 		goto err_fwqp;
1159 
1160 	err = qp_create(ndev, mvq, &mvq->vqqp);
1161 	if (err)
1162 		goto err_vqqp;
1163 
1164 	err = connect_qps(ndev, mvq);
1165 	if (err)
1166 		goto err_connect;
1167 
1168 	err = create_virtqueue(ndev, mvq);
1169 	if (err)
1170 		goto err_connect;
1171 
1172 	if (mvq->ready) {
1173 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1174 		if (err) {
1175 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1176 				       idx, err);
1177 			goto err_connect;
1178 		}
1179 	}
1180 
1181 	mvq->initialized = true;
1182 	return 0;
1183 
1184 err_connect:
1185 	qp_destroy(ndev, &mvq->vqqp);
1186 err_vqqp:
1187 	qp_destroy(ndev, &mvq->fwqp);
1188 err_fwqp:
1189 	cq_destroy(ndev, idx);
1190 	return err;
1191 }
1192 
1193 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1194 {
1195 	struct mlx5_virtq_attr attr;
1196 
1197 	if (!mvq->initialized)
1198 		return;
1199 
1200 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1201 		return;
1202 
1203 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1204 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1205 
1206 	if (query_virtqueue(ndev, mvq, &attr)) {
1207 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1208 		return;
1209 	}
1210 	mvq->avail_idx = attr.available_index;
1211 	mvq->used_idx = attr.used_index;
1212 }
1213 
1214 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1215 {
1216 	int i;
1217 
1218 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1219 		suspend_vq(ndev, &ndev->vqs[i]);
1220 }
1221 
1222 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1223 {
1224 	if (!mvq->initialized)
1225 		return;
1226 
1227 	suspend_vq(ndev, mvq);
1228 	destroy_virtqueue(ndev, mvq);
1229 	qp_destroy(ndev, &mvq->vqqp);
1230 	qp_destroy(ndev, &mvq->fwqp);
1231 	cq_destroy(ndev, mvq->index);
1232 	mvq->initialized = false;
1233 }
1234 
1235 static int create_rqt(struct mlx5_vdpa_net *ndev)
1236 {
1237 	__be32 *list;
1238 	int max_rqt;
1239 	void *rqtc;
1240 	int inlen;
1241 	void *in;
1242 	int i, j;
1243 	int err;
1244 	int num;
1245 
1246 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
1247 		num = 1;
1248 	else
1249 		num = ndev->cur_num_vqs / 2;
1250 
1251 	max_rqt = min_t(int, roundup_pow_of_two(num),
1252 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1253 	if (max_rqt < 1)
1254 		return -EOPNOTSUPP;
1255 
1256 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1257 	in = kzalloc(inlen, GFP_KERNEL);
1258 	if (!in)
1259 		return -ENOMEM;
1260 
1261 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1262 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1263 
1264 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1265 	MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt);
1266 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1267 	for (i = 0, j = 0; i < max_rqt; i++, j += 2)
1268 		list[i] = cpu_to_be32(ndev->vqs[j % (2 * num)].virtq_id);
1269 
1270 	MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt);
1271 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1272 	kfree(in);
1273 	if (err)
1274 		return err;
1275 
1276 	return 0;
1277 }
1278 
1279 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1280 
1281 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1282 {
1283 	__be32 *list;
1284 	int max_rqt;
1285 	void *rqtc;
1286 	int inlen;
1287 	void *in;
1288 	int i, j;
1289 	int err;
1290 
1291 	max_rqt = min_t(int, roundup_pow_of_two(ndev->cur_num_vqs / 2),
1292 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1293 	if (max_rqt < 1)
1294 		return -EOPNOTSUPP;
1295 
1296 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1297 	in = kzalloc(inlen, GFP_KERNEL);
1298 	if (!in)
1299 		return -ENOMEM;
1300 
1301 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1302 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1303 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1304 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1305 
1306 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1307 	for (i = 0, j = 0; i < max_rqt; i++, j += 2)
1308 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1309 
1310 	MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt);
1311 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1312 	kfree(in);
1313 	if (err)
1314 		return err;
1315 
1316 	return 0;
1317 }
1318 
1319 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1320 {
1321 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1322 }
1323 
1324 static int create_tir(struct mlx5_vdpa_net *ndev)
1325 {
1326 #define HASH_IP_L4PORTS                                                                            \
1327 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1328 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1329 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1330 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1331 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1332 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1333 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1334 	void *rss_key;
1335 	void *outer;
1336 	void *tirc;
1337 	void *in;
1338 	int err;
1339 
1340 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1341 	if (!in)
1342 		return -ENOMEM;
1343 
1344 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1345 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1346 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1347 
1348 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1349 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1350 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1351 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1352 
1353 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1354 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1355 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1356 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1357 
1358 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1359 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1360 
1361 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1362 	kfree(in);
1363 	return err;
1364 }
1365 
1366 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1367 {
1368 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1369 }
1370 
1371 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1372 {
1373 	struct mlx5_flow_destination dest[2] = {};
1374 	struct mlx5_flow_table_attr ft_attr = {};
1375 	struct mlx5_flow_act flow_act = {};
1376 	struct mlx5_flow_namespace *ns;
1377 	struct mlx5_flow_spec *spec;
1378 	void *headers_c;
1379 	void *headers_v;
1380 	u8 *dmac_c;
1381 	u8 *dmac_v;
1382 	int err;
1383 
1384 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1385 	if (!spec)
1386 		return -ENOMEM;
1387 
1388 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1389 	ft_attr.max_fte = 2;
1390 	ft_attr.autogroup.max_num_groups = 2;
1391 
1392 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1393 	if (!ns) {
1394 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1395 		err = -EOPNOTSUPP;
1396 		goto err_ns;
1397 	}
1398 
1399 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1400 	if (IS_ERR(ndev->rxft)) {
1401 		err = PTR_ERR(ndev->rxft);
1402 		goto err_ns;
1403 	}
1404 
1405 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1406 	if (IS_ERR(ndev->rx_counter)) {
1407 		err = PTR_ERR(ndev->rx_counter);
1408 		goto err_fc;
1409 	}
1410 
1411 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1412 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1413 	memset(dmac_c, 0xff, ETH_ALEN);
1414 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1415 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1416 	ether_addr_copy(dmac_v, ndev->config.mac);
1417 
1418 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1419 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1420 	dest[0].tir_num = ndev->res.tirn;
1421 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1422 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1423 	ndev->rx_rule_ucast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 2);
1424 
1425 	if (IS_ERR(ndev->rx_rule_ucast)) {
1426 		err = PTR_ERR(ndev->rx_rule_ucast);
1427 		ndev->rx_rule_ucast = NULL;
1428 		goto err_rule_ucast;
1429 	}
1430 
1431 	memset(dmac_c, 0, ETH_ALEN);
1432 	memset(dmac_v, 0, ETH_ALEN);
1433 	dmac_c[0] = 1;
1434 	dmac_v[0] = 1;
1435 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1436 	ndev->rx_rule_mcast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 1);
1437 	if (IS_ERR(ndev->rx_rule_mcast)) {
1438 		err = PTR_ERR(ndev->rx_rule_mcast);
1439 		ndev->rx_rule_mcast = NULL;
1440 		goto err_rule_mcast;
1441 	}
1442 
1443 	kvfree(spec);
1444 	return 0;
1445 
1446 err_rule_mcast:
1447 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1448 	ndev->rx_rule_ucast = NULL;
1449 err_rule_ucast:
1450 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1451 err_fc:
1452 	mlx5_destroy_flow_table(ndev->rxft);
1453 err_ns:
1454 	kvfree(spec);
1455 	return err;
1456 }
1457 
1458 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1459 {
1460 	if (!ndev->rx_rule_ucast)
1461 		return;
1462 
1463 	mlx5_del_flow_rules(ndev->rx_rule_mcast);
1464 	ndev->rx_rule_mcast = NULL;
1465 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1466 	ndev->rx_rule_ucast = NULL;
1467 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1468 	mlx5_destroy_flow_table(ndev->rxft);
1469 }
1470 
1471 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1472 {
1473 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1474 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1475 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1476 	struct mlx5_core_dev *pfmdev;
1477 	size_t read;
1478 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1479 
1480 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1481 	switch (cmd) {
1482 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1483 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1484 		if (read != ETH_ALEN)
1485 			break;
1486 
1487 		if (!memcmp(ndev->config.mac, mac, 6)) {
1488 			status = VIRTIO_NET_OK;
1489 			break;
1490 		}
1491 
1492 		if (is_zero_ether_addr(mac))
1493 			break;
1494 
1495 		if (!is_zero_ether_addr(ndev->config.mac)) {
1496 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1497 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1498 					       ndev->config.mac);
1499 				break;
1500 			}
1501 		}
1502 
1503 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1504 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1505 				       mac);
1506 			break;
1507 		}
1508 
1509 		/* backup the original mac address so that if failed to add the forward rules
1510 		 * we could restore it
1511 		 */
1512 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
1513 
1514 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1515 
1516 		/* Need recreate the flow table entry, so that the packet could forward back
1517 		 */
1518 		remove_fwd_to_tir(ndev);
1519 
1520 		if (add_fwd_to_tir(ndev)) {
1521 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
1522 
1523 			/* Although it hardly run here, we still need double check */
1524 			if (is_zero_ether_addr(mac_back)) {
1525 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
1526 				break;
1527 			}
1528 
1529 			/* Try to restore original mac address to MFPS table, and try to restore
1530 			 * the forward rule entry.
1531 			 */
1532 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1533 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
1534 					       ndev->config.mac);
1535 			}
1536 
1537 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
1538 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
1539 					       mac_back);
1540 			}
1541 
1542 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
1543 
1544 			if (add_fwd_to_tir(ndev))
1545 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
1546 
1547 			break;
1548 		}
1549 
1550 		status = VIRTIO_NET_OK;
1551 		break;
1552 
1553 	default:
1554 		break;
1555 	}
1556 
1557 	return status;
1558 }
1559 
1560 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1561 {
1562 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1563 	int cur_qps = ndev->cur_num_vqs / 2;
1564 	int err;
1565 	int i;
1566 
1567 	if (cur_qps > newqps) {
1568 		err = modify_rqt(ndev, 2 * newqps);
1569 		if (err)
1570 			return err;
1571 
1572 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1573 			teardown_vq(ndev, &ndev->vqs[i]);
1574 
1575 		ndev->cur_num_vqs = 2 * newqps;
1576 	} else {
1577 		ndev->cur_num_vqs = 2 * newqps;
1578 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1579 			err = setup_vq(ndev, &ndev->vqs[i]);
1580 			if (err)
1581 				goto clean_added;
1582 		}
1583 		err = modify_rqt(ndev, 2 * newqps);
1584 		if (err)
1585 			goto clean_added;
1586 	}
1587 	return 0;
1588 
1589 clean_added:
1590 	for (--i; i >= 2 * cur_qps; --i)
1591 		teardown_vq(ndev, &ndev->vqs[i]);
1592 
1593 	ndev->cur_num_vqs = 2 * cur_qps;
1594 
1595 	return err;
1596 }
1597 
1598 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1599 {
1600 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1601 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1602 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1603 	struct virtio_net_ctrl_mq mq;
1604 	size_t read;
1605 	u16 newqps;
1606 
1607 	switch (cmd) {
1608 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1609 		/* This mq feature check aligns with pre-existing userspace
1610 		 * implementation.
1611 		 *
1612 		 * Without it, an untrusted driver could fake a multiqueue config
1613 		 * request down to a non-mq device that may cause kernel to
1614 		 * panic due to uninitialized resources for extra vqs. Even with
1615 		 * a well behaving guest driver, it is not expected to allow
1616 		 * changing the number of vqs on a non-mq device.
1617 		 */
1618 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
1619 			break;
1620 
1621 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1622 		if (read != sizeof(mq))
1623 			break;
1624 
1625 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1626 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1627 		    newqps > mlx5_vdpa_max_qps(mvdev->max_vqs))
1628 			break;
1629 
1630 		if (ndev->cur_num_vqs == 2 * newqps) {
1631 			status = VIRTIO_NET_OK;
1632 			break;
1633 		}
1634 
1635 		if (!change_num_qps(mvdev, newqps))
1636 			status = VIRTIO_NET_OK;
1637 
1638 		break;
1639 	default:
1640 		break;
1641 	}
1642 
1643 	return status;
1644 }
1645 
1646 static void mlx5_cvq_kick_handler(struct work_struct *work)
1647 {
1648 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1649 	struct virtio_net_ctrl_hdr ctrl;
1650 	struct mlx5_vdpa_wq_ent *wqent;
1651 	struct mlx5_vdpa_dev *mvdev;
1652 	struct mlx5_control_vq *cvq;
1653 	struct mlx5_vdpa_net *ndev;
1654 	size_t read, write;
1655 	int err;
1656 
1657 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
1658 	mvdev = wqent->mvdev;
1659 	ndev = to_mlx5_vdpa_ndev(mvdev);
1660 	cvq = &mvdev->cvq;
1661 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1662 		goto out;
1663 
1664 	if (!cvq->ready)
1665 		goto out;
1666 
1667 	while (true) {
1668 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
1669 					   GFP_ATOMIC);
1670 		if (err <= 0)
1671 			break;
1672 
1673 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
1674 		if (read != sizeof(ctrl))
1675 			break;
1676 
1677 		switch (ctrl.class) {
1678 		case VIRTIO_NET_CTRL_MAC:
1679 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
1680 			break;
1681 		case VIRTIO_NET_CTRL_MQ:
1682 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
1683 			break;
1684 
1685 		default:
1686 			break;
1687 		}
1688 
1689 		/* Make sure data is written before advancing index */
1690 		smp_wmb();
1691 
1692 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
1693 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
1694 		vringh_kiov_cleanup(&cvq->riov);
1695 		vringh_kiov_cleanup(&cvq->wiov);
1696 
1697 		if (vringh_need_notify_iotlb(&cvq->vring))
1698 			vringh_notify(&cvq->vring);
1699 	}
1700 out:
1701 	kfree(wqent);
1702 }
1703 
1704 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1705 {
1706 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1707 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1708 	struct mlx5_vdpa_virtqueue *mvq;
1709 	struct mlx5_vdpa_wq_ent *wqent;
1710 
1711 	if (!is_index_valid(mvdev, idx))
1712 		return;
1713 
1714 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
1715 		if (!mvdev->wq || !mvdev->cvq.ready)
1716 			return;
1717 
1718 		wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
1719 		if (!wqent)
1720 			return;
1721 
1722 		wqent->mvdev = mvdev;
1723 		INIT_WORK(&wqent->work, mlx5_cvq_kick_handler);
1724 		queue_work(mvdev->wq, &wqent->work);
1725 		return;
1726 	}
1727 
1728 	mvq = &ndev->vqs[idx];
1729 	if (unlikely(!mvq->ready))
1730 		return;
1731 
1732 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1733 }
1734 
1735 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1736 				    u64 driver_area, u64 device_area)
1737 {
1738 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1739 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1740 	struct mlx5_vdpa_virtqueue *mvq;
1741 
1742 	if (!is_index_valid(mvdev, idx))
1743 		return -EINVAL;
1744 
1745 	if (is_ctrl_vq_idx(mvdev, idx)) {
1746 		mvdev->cvq.desc_addr = desc_area;
1747 		mvdev->cvq.device_addr = device_area;
1748 		mvdev->cvq.driver_addr = driver_area;
1749 		return 0;
1750 	}
1751 
1752 	mvq = &ndev->vqs[idx];
1753 	mvq->desc_addr = desc_area;
1754 	mvq->device_addr = device_area;
1755 	mvq->driver_addr = driver_area;
1756 	return 0;
1757 }
1758 
1759 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1760 {
1761 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1762 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1763 	struct mlx5_vdpa_virtqueue *mvq;
1764 
1765 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
1766 		return;
1767 
1768 	mvq = &ndev->vqs[idx];
1769 	mvq->num_ent = num;
1770 }
1771 
1772 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1773 {
1774 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1775 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1776 
1777 	ndev->event_cbs[idx] = *cb;
1778 }
1779 
1780 static void mlx5_cvq_notify(struct vringh *vring)
1781 {
1782 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
1783 
1784 	if (!cvq->event_cb.callback)
1785 		return;
1786 
1787 	cvq->event_cb.callback(cvq->event_cb.private);
1788 }
1789 
1790 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
1791 {
1792 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1793 
1794 	cvq->ready = ready;
1795 	if (!ready)
1796 		return;
1797 
1798 	cvq->vring.notify = mlx5_cvq_notify;
1799 }
1800 
1801 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1802 {
1803 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1804 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1805 	struct mlx5_vdpa_virtqueue *mvq;
1806 
1807 	if (!mvdev->actual_features)
1808 		return;
1809 
1810 	if (!is_index_valid(mvdev, idx))
1811 		return;
1812 
1813 	if (is_ctrl_vq_idx(mvdev, idx)) {
1814 		set_cvq_ready(mvdev, ready);
1815 		return;
1816 	}
1817 
1818 	mvq = &ndev->vqs[idx];
1819 	if (!ready)
1820 		suspend_vq(ndev, mvq);
1821 
1822 	mvq->ready = ready;
1823 }
1824 
1825 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1826 {
1827 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1828 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1829 
1830 	if (!is_index_valid(mvdev, idx))
1831 		return false;
1832 
1833 	if (is_ctrl_vq_idx(mvdev, idx))
1834 		return mvdev->cvq.ready;
1835 
1836 	return ndev->vqs[idx].ready;
1837 }
1838 
1839 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1840 				  const struct vdpa_vq_state *state)
1841 {
1842 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1843 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1844 	struct mlx5_vdpa_virtqueue *mvq;
1845 
1846 	if (!is_index_valid(mvdev, idx))
1847 		return -EINVAL;
1848 
1849 	if (is_ctrl_vq_idx(mvdev, idx)) {
1850 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
1851 		return 0;
1852 	}
1853 
1854 	mvq = &ndev->vqs[idx];
1855 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1856 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1857 		return -EINVAL;
1858 	}
1859 
1860 	mvq->used_idx = state->split.avail_index;
1861 	mvq->avail_idx = state->split.avail_index;
1862 	return 0;
1863 }
1864 
1865 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1866 {
1867 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1868 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1869 	struct mlx5_vdpa_virtqueue *mvq;
1870 	struct mlx5_virtq_attr attr;
1871 	int err;
1872 
1873 	if (!is_index_valid(mvdev, idx))
1874 		return -EINVAL;
1875 
1876 	if (is_ctrl_vq_idx(mvdev, idx)) {
1877 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
1878 		return 0;
1879 	}
1880 
1881 	mvq = &ndev->vqs[idx];
1882 	/* If the virtq object was destroyed, use the value saved at
1883 	 * the last minute of suspend_vq. This caters for userspace
1884 	 * that cares about emulating the index after vq is stopped.
1885 	 */
1886 	if (!mvq->initialized) {
1887 		/* Firmware returns a wrong value for the available index.
1888 		 * Since both values should be identical, we take the value of
1889 		 * used_idx which is reported correctly.
1890 		 */
1891 		state->split.avail_index = mvq->used_idx;
1892 		return 0;
1893 	}
1894 
1895 	err = query_virtqueue(ndev, mvq, &attr);
1896 	if (err) {
1897 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1898 		return err;
1899 	}
1900 	state->split.avail_index = attr.used_index;
1901 	return 0;
1902 }
1903 
1904 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1905 {
1906 	return PAGE_SIZE;
1907 }
1908 
/* Feature bits as laid out in the 16-bit device feature mask that
 * mlx_to_vritio_features() translates into virtio feature bits.
 */
enum {
	MLX5_VIRTIO_NET_F_GUEST_CSUM	= 1 << 9,
	MLX5_VIRTIO_NET_F_CSUM		= 1 << 10,
	MLX5_VIRTIO_NET_F_HOST_TSO6	= 1 << 11,
	MLX5_VIRTIO_NET_F_HOST_TSO4	= 1 << 12,
};
1914 
1915 static u64 mlx_to_vritio_features(u16 dev_features)
1916 {
1917 	u64 result = 0;
1918 
1919 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1920 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1921 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1922 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1923 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1924 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1925 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1926 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1927 
1928 	return result;
1929 }
1930 
1931 static u64 get_supported_features(struct mlx5_core_dev *mdev)
1932 {
1933 	u64 mlx_vdpa_features = 0;
1934 	u16 dev_features;
1935 
1936 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
1937 	mlx_vdpa_features |= mlx_to_vritio_features(dev_features);
1938 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
1939 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1940 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1941 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
1942 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
1943 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
1944 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
1945 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
1946 
1947 	return mlx_vdpa_features;
1948 }
1949 
1950 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
1951 {
1952 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1953 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1954 
1955 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1956 	return ndev->mvdev.mlx_features;
1957 }
1958 
1959 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1960 {
1961 	/* Minimum features to expect */
1962 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1963 		return -EOPNOTSUPP;
1964 
1965 	/* Double check features combination sent down by the driver.
1966 	 * Fail invalid features due to absence of the depended feature.
1967 	 *
1968 	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
1969 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
1970 	 * By failing the invalid features sent down by untrusted drivers,
1971 	 * we're assured the assumption made upon is_index_valid() and
1972 	 * is_ctrl_vq_idx() will not be compromised.
1973 	 */
1974 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
1975             BIT_ULL(VIRTIO_NET_F_MQ))
1976 		return -EINVAL;
1977 
1978 	return 0;
1979 }
1980 
1981 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
1982 {
1983 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1984 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1985 	int err;
1986 	int i;
1987 
1988 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) {
1989 		err = setup_vq(ndev, &ndev->vqs[i]);
1990 		if (err)
1991 			goto err_vq;
1992 	}
1993 
1994 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
1995 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
1996 					MLX5_CVQ_MAX_ENT, false,
1997 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
1998 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
1999 					(struct vring_used *)(uintptr_t)cvq->device_addr);
2000 		if (err)
2001 			goto err_vq;
2002 	}
2003 
2004 	return 0;
2005 
2006 err_vq:
2007 	for (--i; i >= 0; i--)
2008 		teardown_vq(ndev, &ndev->vqs[i]);
2009 
2010 	return err;
2011 }
2012 
2013 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2014 {
2015 	struct mlx5_vdpa_virtqueue *mvq;
2016 	int i;
2017 
2018 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
2019 		mvq = &ndev->vqs[i];
2020 		if (!mvq->initialized)
2021 			continue;
2022 
2023 		teardown_vq(ndev, mvq);
2024 	}
2025 }
2026 
2027 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2028 {
2029 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2030 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2031 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2032 			mvdev->max_idx = mvdev->max_vqs;
2033 		} else {
2034 			/* Only CVQ supportted. data virtqueues occupy indices 0 and 1.
2035 			 * CVQ gets index 2
2036 			 */
2037 			mvdev->max_idx = 2;
2038 		}
2039 	} else {
2040 		/* Two data virtqueues only: one for rx and one for tx */
2041 		mvdev->max_idx = 1;
2042 	}
2043 }
2044 
2045 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2046 {
2047 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2048 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2049 	int err;
2050 
2051 	print_features(mvdev, features, true);
2052 
2053 	err = verify_driver_features(mvdev, features);
2054 	if (err)
2055 		return err;
2056 
2057 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2058 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2059 		ndev->cur_num_vqs = 2 * mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2060 	else
2061 		ndev->cur_num_vqs = 2;
2062 
2063 	update_cvq_info(mvdev);
2064 	return err;
2065 }
2066 
2067 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2068 {
2069 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2070 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2071 
2072 	ndev->config_cb = *cb;
2073 }
2074 
/* Value reported by the get_vq_num_max callback. */
#define MLX5_VDPA_MAX_VQ_ENTRIES 256
/* Return the fixed maximum virtqueue size; @vdev is not consulted. */
static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
{
	return MLX5_VDPA_MAX_VQ_ENTRIES;
}
2080 
/* Report the virtio device ID: this driver always presents VIRTIO_ID_NET. */
static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
{
	return VIRTIO_ID_NET;
}
2085 
/* Report the vendor ID: always PCI_VENDOR_ID_MELLANOX. */
static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
{
	return PCI_VENDOR_ID_MELLANOX;
}
2090 
2091 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2092 {
2093 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2094 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2095 
2096 	print_status(mvdev, ndev->mvdev.status, false);
2097 	return ndev->mvdev.status;
2098 }
2099 
2100 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2101 {
2102 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2103 	struct mlx5_virtq_attr attr = {};
2104 	int err;
2105 
2106 	if (mvq->initialized) {
2107 		err = query_virtqueue(ndev, mvq, &attr);
2108 		if (err)
2109 			return err;
2110 	}
2111 
2112 	ri->avail_index = attr.available_index;
2113 	ri->used_index = attr.used_index;
2114 	ri->ready = mvq->ready;
2115 	ri->num_ent = mvq->num_ent;
2116 	ri->desc_addr = mvq->desc_addr;
2117 	ri->device_addr = mvq->device_addr;
2118 	ri->driver_addr = mvq->driver_addr;
2119 	ri->restore = true;
2120 	return 0;
2121 }
2122 
2123 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2124 {
2125 	int i;
2126 
2127 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2128 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2129 		save_channel_info(ndev, &ndev->vqs[i]);
2130 	}
2131 	return 0;
2132 }
2133 
2134 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2135 {
2136 	int i;
2137 
2138 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2139 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2140 }
2141 
2142 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2143 {
2144 	struct mlx5_vdpa_virtqueue *mvq;
2145 	struct mlx5_vq_restore_info *ri;
2146 	int i;
2147 
2148 	mlx5_clear_vqs(ndev);
2149 	init_mvqs(ndev);
2150 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2151 		mvq = &ndev->vqs[i];
2152 		ri = &mvq->ri;
2153 		if (!ri->restore)
2154 			continue;
2155 
2156 		mvq->avail_idx = ri->avail_index;
2157 		mvq->used_idx = ri->used_index;
2158 		mvq->ready = ri->ready;
2159 		mvq->num_ent = ri->num_ent;
2160 		mvq->desc_addr = ri->desc_addr;
2161 		mvq->device_addr = ri->device_addr;
2162 		mvq->driver_addr = ri->driver_addr;
2163 	}
2164 }
2165 
2166 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
2167 {
2168 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2169 	int err;
2170 
2171 	suspend_vqs(ndev);
2172 	err = save_channels_info(ndev);
2173 	if (err)
2174 		goto err_mr;
2175 
2176 	teardown_driver(ndev);
2177 	mlx5_vdpa_destroy_mr(mvdev);
2178 	err = mlx5_vdpa_create_mr(mvdev, iotlb);
2179 	if (err)
2180 		goto err_mr;
2181 
2182 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2183 		return 0;
2184 
2185 	restore_channels_info(ndev);
2186 	err = setup_driver(mvdev);
2187 	if (err)
2188 		goto err_setup;
2189 
2190 	return 0;
2191 
2192 err_setup:
2193 	mlx5_vdpa_destroy_mr(mvdev);
2194 err_mr:
2195 	return err;
2196 }
2197 
2198 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2199 {
2200 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2201 	int err;
2202 
2203 	mutex_lock(&ndev->reslock);
2204 	if (ndev->setup) {
2205 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2206 		err = 0;
2207 		goto out;
2208 	}
2209 	err = setup_virtqueues(mvdev);
2210 	if (err) {
2211 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2212 		goto out;
2213 	}
2214 
2215 	err = create_rqt(ndev);
2216 	if (err) {
2217 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2218 		goto err_rqt;
2219 	}
2220 
2221 	err = create_tir(ndev);
2222 	if (err) {
2223 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2224 		goto err_tir;
2225 	}
2226 
2227 	err = add_fwd_to_tir(ndev);
2228 	if (err) {
2229 		mlx5_vdpa_warn(mvdev, "add_fwd_to_tir\n");
2230 		goto err_fwd;
2231 	}
2232 	ndev->setup = true;
2233 	mutex_unlock(&ndev->reslock);
2234 
2235 	return 0;
2236 
2237 err_fwd:
2238 	destroy_tir(ndev);
2239 err_tir:
2240 	destroy_rqt(ndev);
2241 err_rqt:
2242 	teardown_virtqueues(ndev);
2243 out:
2244 	mutex_unlock(&ndev->reslock);
2245 	return err;
2246 }
2247 
2248 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2249 {
2250 	mutex_lock(&ndev->reslock);
2251 	if (!ndev->setup)
2252 		goto out;
2253 
2254 	remove_fwd_to_tir(ndev);
2255 	destroy_tir(ndev);
2256 	destroy_rqt(ndev);
2257 	teardown_virtqueues(ndev);
2258 	ndev->setup = false;
2259 out:
2260 	mutex_unlock(&ndev->reslock);
2261 }
2262 
2263 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2264 {
2265 	int i;
2266 
2267 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2268 		ndev->vqs[i].ready = false;
2269 
2270 	ndev->mvdev.cvq.ready = false;
2271 }
2272 
2273 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2274 {
2275 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2276 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2277 	int err;
2278 
2279 	print_status(mvdev, status, true);
2280 
2281 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2282 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2283 			err = setup_driver(mvdev);
2284 			if (err) {
2285 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2286 				goto err_setup;
2287 			}
2288 		} else {
2289 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2290 			return;
2291 		}
2292 	}
2293 
2294 	ndev->mvdev.status = status;
2295 	return;
2296 
2297 err_setup:
2298 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2299 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2300 }
2301 
2302 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2303 {
2304 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2305 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2306 
2307 	print_status(mvdev, 0, true);
2308 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2309 	teardown_driver(ndev);
2310 	clear_vqs_ready(ndev);
2311 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2312 	ndev->mvdev.status = 0;
2313 	ndev->cur_num_vqs = 0;
2314 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
2315 	ndev->mvdev.actual_features = 0;
2316 	++mvdev->generation;
2317 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2318 		if (mlx5_vdpa_create_mr(mvdev, NULL))
2319 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2320 	}
2321 
2322 	return 0;
2323 }
2324 
/* Report the size of the device config space: struct virtio_net_config. */
static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
{
	return sizeof(struct virtio_net_config);
}
2329 
2330 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2331 				 unsigned int len)
2332 {
2333 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2334 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2335 
2336 	if (offset + len <= sizeof(struct virtio_net_config))
2337 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2338 }
2339 
/* Config space writes are not supported; the request is silently ignored. */
static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
				 unsigned int len)
{
	/* not supported */
}
2345 
2346 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2347 {
2348 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2349 
2350 	return mvdev->generation;
2351 }
2352 
2353 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
2354 {
2355 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2356 	bool change_map;
2357 	int err;
2358 
2359 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
2360 	if (err) {
2361 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2362 		return err;
2363 	}
2364 
2365 	if (change_map)
2366 		return mlx5_vdpa_change_map(mvdev, iotlb);
2367 
2368 	return 0;
2369 }
2370 
2371 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2372 {
2373 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2374 	struct mlx5_core_dev *pfmdev;
2375 	struct mlx5_vdpa_net *ndev;
2376 
2377 	ndev = to_mlx5_vdpa_ndev(mvdev);
2378 
2379 	free_resources(ndev);
2380 	mlx5_vdpa_destroy_mr(mvdev);
2381 	if (!is_zero_ether_addr(ndev->config.mac)) {
2382 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2383 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2384 	}
2385 	mlx5_vdpa_free_resources(&ndev->mvdev);
2386 	mutex_destroy(&ndev->reslock);
2387 	kfree(ndev->event_cbs);
2388 	kfree(ndev->vqs);
2389 }
2390 
2391 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2392 {
2393 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2394 	struct vdpa_notification_area ret = {};
2395 	struct mlx5_vdpa_net *ndev;
2396 	phys_addr_t addr;
2397 
2398 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2399 		return ret;
2400 
2401 	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
2402 	 * notification to avoid the risk of mapping pages that contain BAR of more
2403 	 * than one SF
2404 	 */
2405 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2406 		return ret;
2407 
2408 	ndev = to_mlx5_vdpa_ndev(mvdev);
2409 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2410 	ret.addr = addr;
2411 	ret.size = PAGE_SIZE;
2412 	return ret;
2413 }
2414 
/* Per-virtqueue IRQ lookup is not provided: always returns -EOPNOTSUPP. */
static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
{
	return -EOPNOTSUPP;
}
2419 
2420 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
2421 {
2422 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2423 
2424 	return mvdev->actual_features;
2425 }
2426 
/*
 * vDPA config operations implemented by this driver. The table is handed
 * to vdpa_alloc_device() when a device is created in mlx5_vdpa_dev_add().
 */
static const struct vdpa_config_ops mlx5_vdpa_ops = {
	.set_vq_address = mlx5_vdpa_set_vq_address,
	.set_vq_num = mlx5_vdpa_set_vq_num,
	.kick_vq = mlx5_vdpa_kick_vq,
	.set_vq_cb = mlx5_vdpa_set_vq_cb,
	.set_vq_ready = mlx5_vdpa_set_vq_ready,
	.get_vq_ready = mlx5_vdpa_get_vq_ready,
	.set_vq_state = mlx5_vdpa_set_vq_state,
	.get_vq_state = mlx5_vdpa_get_vq_state,
	.get_vq_notification = mlx5_get_vq_notification,
	.get_vq_irq = mlx5_get_vq_irq,
	.get_vq_align = mlx5_vdpa_get_vq_align,
	.get_device_features = mlx5_vdpa_get_device_features,
	.set_driver_features = mlx5_vdpa_set_driver_features,
	.get_driver_features = mlx5_vdpa_get_driver_features,
	.set_config_cb = mlx5_vdpa_set_config_cb,
	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
	.get_device_id = mlx5_vdpa_get_device_id,
	.get_vendor_id = mlx5_vdpa_get_vendor_id,
	.get_status = mlx5_vdpa_get_status,
	.set_status = mlx5_vdpa_set_status,
	.reset = mlx5_vdpa_reset,
	.get_config_size = mlx5_vdpa_get_config_size,
	.get_config = mlx5_vdpa_get_config,
	.set_config = mlx5_vdpa_set_config,
	.get_generation = mlx5_vdpa_get_generation,
	.set_map = mlx5_vdpa_set_map,
	.free = mlx5_vdpa_free,
};
2456 
2457 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
2458 {
2459 	u16 hw_mtu;
2460 	int err;
2461 
2462 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
2463 	if (err)
2464 		return err;
2465 
2466 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
2467 	return 0;
2468 }
2469 
2470 static int alloc_resources(struct mlx5_vdpa_net *ndev)
2471 {
2472 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2473 	int err;
2474 
2475 	if (res->valid) {
2476 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
2477 		return -EEXIST;
2478 	}
2479 
2480 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
2481 	if (err)
2482 		return err;
2483 
2484 	err = create_tis(ndev);
2485 	if (err)
2486 		goto err_tis;
2487 
2488 	res->valid = true;
2489 
2490 	return 0;
2491 
2492 err_tis:
2493 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2494 	return err;
2495 }
2496 
2497 static void free_resources(struct mlx5_vdpa_net *ndev)
2498 {
2499 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2500 
2501 	if (!res->valid)
2502 		return;
2503 
2504 	destroy_tis(ndev);
2505 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2506 	res->valid = false;
2507 }
2508 
2509 static void init_mvqs(struct mlx5_vdpa_net *ndev)
2510 {
2511 	struct mlx5_vdpa_virtqueue *mvq;
2512 	int i;
2513 
2514 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
2515 		mvq = &ndev->vqs[i];
2516 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2517 		mvq->index = i;
2518 		mvq->ndev = ndev;
2519 		mvq->fwqp.fw = true;
2520 	}
2521 	for (; i < ndev->mvdev.max_vqs; i++) {
2522 		mvq = &ndev->vqs[i];
2523 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2524 		mvq->index = i;
2525 		mvq->ndev = ndev;
2526 	}
2527 }
2528 
/* Per-auxiliary-device management state, allocated in mlx5v_probe(). */
struct mlx5_vdpa_mgmtdev {
	/* Embedded vdpa management device; registered in mlx5v_probe(). */
	struct vdpa_mgmt_dev mgtdev;
	/* mlx5 auxiliary device this management device was probed for. */
	struct mlx5_adev *madev;
	/* The net device created by dev_add, or NULL when none exists. */
	struct mlx5_vdpa_net *ndev;
};
2534 
2535 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2536 {
2537 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2538 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2539 	int err;
2540 
2541 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2542 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2543 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2544 	if (vport)
2545 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2546 
2547 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2548 	if (err)
2549 		return 0;
2550 
2551 	return MLX5_GET(query_vport_state_out, out, state);
2552 }
2553 
2554 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2555 {
2556 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2557 	    VPORT_STATE_UP)
2558 		return true;
2559 
2560 	return false;
2561 }
2562 
2563 static void update_carrier(struct work_struct *work)
2564 {
2565 	struct mlx5_vdpa_wq_ent *wqent;
2566 	struct mlx5_vdpa_dev *mvdev;
2567 	struct mlx5_vdpa_net *ndev;
2568 
2569 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2570 	mvdev = wqent->mvdev;
2571 	ndev = to_mlx5_vdpa_ndev(mvdev);
2572 	if (get_link_state(mvdev))
2573 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2574 	else
2575 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2576 
2577 	if (ndev->config_cb.callback)
2578 		ndev->config_cb.callback(ndev->config_cb.private);
2579 
2580 	kfree(wqent);
2581 }
2582 
2583 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2584 {
2585 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2586 	struct mlx5_eqe *eqe = param;
2587 	int ret = NOTIFY_DONE;
2588 	struct mlx5_vdpa_wq_ent *wqent;
2589 
2590 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2591 		switch (eqe->sub_type) {
2592 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2593 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2594 			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2595 			if (!wqent)
2596 				return NOTIFY_DONE;
2597 
2598 			wqent->mvdev = &ndev->mvdev;
2599 			INIT_WORK(&wqent->work, update_carrier);
2600 			queue_work(ndev->mvdev.wq, &wqent->work);
2601 			ret = NOTIFY_OK;
2602 			break;
2603 		default:
2604 			return NOTIFY_DONE;
2605 		}
2606 		return ret;
2607 	}
2608 	return ret;
2609 }
2610 
2611 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
2612 {
2613 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
2614 	void *in;
2615 	int err;
2616 
2617 	in = kvzalloc(inlen, GFP_KERNEL);
2618 	if (!in)
2619 		return -ENOMEM;
2620 
2621 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
2622 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
2623 		 mtu + MLX5V_ETH_HARD_MTU);
2624 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
2625 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
2626 
2627 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
2628 
2629 	kvfree(in);
2630 	return err;
2631 }
2632 
2633 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
2634 			     const struct vdpa_dev_set_config *add_config)
2635 {
2636 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2637 	struct virtio_net_config *config;
2638 	struct mlx5_core_dev *pfmdev;
2639 	struct mlx5_vdpa_dev *mvdev;
2640 	struct mlx5_vdpa_net *ndev;
2641 	struct mlx5_core_dev *mdev;
2642 	u32 max_vqs;
2643 	u16 mtu;
2644 	int err;
2645 
2646 	if (mgtdev->ndev)
2647 		return -ENOSPC;
2648 
2649 	mdev = mgtdev->madev->mdev;
2650 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
2651 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
2652 		dev_warn(mdev->device, "missing support for split virtqueues\n");
2653 		return -EOPNOTSUPP;
2654 	}
2655 
2656 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
2657 	if (max_vqs < 2) {
2658 		dev_warn(mdev->device,
2659 			 "%d virtqueues are supported. At least 2 are required\n",
2660 			 max_vqs);
2661 		return -EAGAIN;
2662 	}
2663 
2664 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
2665 		if (add_config->net.max_vq_pairs > max_vqs / 2)
2666 			return -EINVAL;
2667 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
2668 	} else {
2669 		max_vqs = 2;
2670 	}
2671 
2672 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2673 				 name, false);
2674 	if (IS_ERR(ndev))
2675 		return PTR_ERR(ndev);
2676 
2677 	ndev->mvdev.mlx_features = mgtdev->mgtdev.supported_features;
2678 	ndev->mvdev.max_vqs = max_vqs;
2679 	mvdev = &ndev->mvdev;
2680 	mvdev->mdev = mdev;
2681 
2682 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
2683 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
2684 	if (!ndev->vqs || !ndev->event_cbs) {
2685 		err = -ENOMEM;
2686 		goto err_alloc;
2687 	}
2688 
2689 	init_mvqs(ndev);
2690 	mutex_init(&ndev->reslock);
2691 	config = &ndev->config;
2692 
2693 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
2694 		err = config_func_mtu(mdev, add_config->net.mtu);
2695 		if (err)
2696 			goto err_mtu;
2697 	}
2698 
2699 	err = query_mtu(mdev, &mtu);
2700 	if (err)
2701 		goto err_mtu;
2702 
2703 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
2704 
2705 	if (get_link_state(mvdev))
2706 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2707 	else
2708 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2709 
2710 	if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
2711 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
2712 	} else {
2713 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2714 		if (err)
2715 			goto err_mtu;
2716 	}
2717 
2718 	if (!is_zero_ether_addr(config->mac)) {
2719 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2720 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2721 		if (err)
2722 			goto err_mtu;
2723 
2724 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2725 	}
2726 
2727 	config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs));
2728 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
2729 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2730 	if (err)
2731 		goto err_mpfs;
2732 
2733 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2734 		err = mlx5_vdpa_create_mr(mvdev, NULL);
2735 		if (err)
2736 			goto err_res;
2737 	}
2738 
2739 	err = alloc_resources(ndev);
2740 	if (err)
2741 		goto err_mr;
2742 
2743 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
2744 	if (!mvdev->wq) {
2745 		err = -ENOMEM;
2746 		goto err_res2;
2747 	}
2748 
2749 	ndev->nb.notifier_call = event_handler;
2750 	mlx5_notifier_register(mdev, &ndev->nb);
2751 	mvdev->vdev.mdev = &mgtdev->mgtdev;
2752 	err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs) + 1);
2753 	if (err)
2754 		goto err_reg;
2755 
2756 	mgtdev->ndev = ndev;
2757 	return 0;
2758 
2759 err_reg:
2760 	destroy_workqueue(mvdev->wq);
2761 err_res2:
2762 	free_resources(ndev);
2763 err_mr:
2764 	mlx5_vdpa_destroy_mr(mvdev);
2765 err_res:
2766 	mlx5_vdpa_free_resources(&ndev->mvdev);
2767 err_mpfs:
2768 	if (!is_zero_ether_addr(config->mac))
2769 		mlx5_mpfs_del_mac(pfmdev, config->mac);
2770 err_mtu:
2771 	mutex_destroy(&ndev->reslock);
2772 err_alloc:
2773 	put_device(&mvdev->vdev.dev);
2774 	return err;
2775 }
2776 
2777 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
2778 {
2779 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2780 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
2781 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2782 	struct workqueue_struct *wq;
2783 
2784 	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
2785 	wq = mvdev->wq;
2786 	mvdev->wq = NULL;
2787 	destroy_workqueue(wq);
2788 	_vdpa_unregister_device(dev);
2789 	mgtdev->ndev = NULL;
2790 }
2791 
/* Management-device operations; installed on the mgmt device in mlx5v_probe(). */
static const struct vdpa_mgmtdev_ops mdev_ops = {
	.dev_add = mlx5_vdpa_dev_add,
	.dev_del = mlx5_vdpa_dev_del,
};
2796 
/* Virtio IDs handled by the management device: network devices, any vendor. */
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
2801 
2802 static int mlx5v_probe(struct auxiliary_device *adev,
2803 		       const struct auxiliary_device_id *id)
2804 
2805 {
2806 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
2807 	struct mlx5_core_dev *mdev = madev->mdev;
2808 	struct mlx5_vdpa_mgmtdev *mgtdev;
2809 	int err;
2810 
2811 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
2812 	if (!mgtdev)
2813 		return -ENOMEM;
2814 
2815 	mgtdev->mgtdev.ops = &mdev_ops;
2816 	mgtdev->mgtdev.device = mdev->device;
2817 	mgtdev->mgtdev.id_table = id_table;
2818 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
2819 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
2820 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU);
2821 	mgtdev->mgtdev.max_supported_vqs =
2822 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
2823 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
2824 	mgtdev->madev = madev;
2825 
2826 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
2827 	if (err)
2828 		goto reg_err;
2829 
2830 	auxiliary_set_drvdata(adev, mgtdev);
2831 
2832 	return 0;
2833 
2834 reg_err:
2835 	kfree(mgtdev);
2836 	return err;
2837 }
2838 
2839 static void mlx5v_remove(struct auxiliary_device *adev)
2840 {
2841 	struct mlx5_vdpa_mgmtdev *mgtdev;
2842 
2843 	mgtdev = auxiliary_get_drvdata(adev);
2844 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
2845 	kfree(mgtdev);
2846 }
2847 
/* Auxiliary devices this driver binds to: the mlx5 ".vnet" device. */
static const struct auxiliary_device_id mlx5v_id_table[] = {
	{ .name = MLX5_ADEV_NAME ".vnet", },
	{},
};

MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2854 
/* Auxiliary driver definition tying probe/remove to the ID table above. */
static struct auxiliary_driver mlx5v_driver = {
	.name = "vnet",
	.probe = mlx5v_probe,
	.remove = mlx5v_remove,
	.id_table = mlx5v_id_table,
};

module_auxiliary_driver(mlx5v_driver);
2863