xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision 6219b20e)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <linux/virtio_config.h>
10 #include <linux/auxiliary_bus.h>
11 #include <linux/mlx5/cq.h>
12 #include <linux/mlx5/qp.h>
13 #include <linux/mlx5/device.h>
14 #include <linux/mlx5/driver.h>
15 #include <linux/mlx5/vport.h>
16 #include <linux/mlx5/fs.h>
17 #include <linux/mlx5/mlx5_ifc_vdpa.h>
18 #include <linux/mlx5/mpfs.h>
19 #include "mlx5_vdpa.h"
20 
21 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
22 MODULE_DESCRIPTION("Mellanox VDPA driver");
23 MODULE_LICENSE("Dual BSD/GPL");
24 
/* Recover the enclosing struct mlx5_vdpa_net from a pointer to its 'mvdev'
 * member.
 */
#define to_mlx5_vdpa_ndev(__mvdev)                                             \
	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
/* Recover the enclosing struct mlx5_vdpa_dev from a pointer to its 'vdev'
 * member.
 */
#define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
28 
/* Every virtio feature bit this driver recognizes. print_features() warns
 * about any bit that falls outside this mask.
 */
#define VALID_FEATURES_MASK \
	(BIT_ULL(VIRTIO_NET_F_CSUM) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | \
	 BIT_ULL(VIRTIO_NET_F_MTU) | \
	 BIT_ULL(VIRTIO_NET_F_MAC) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_ECN) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_UFO) | \
	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | \
	 BIT_ULL(VIRTIO_NET_F_STATUS) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_VQ) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | \
	 BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) | \
	 BIT_ULL(VIRTIO_NET_F_MQ) | \
	 BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | \
	 BIT_ULL(VIRTIO_NET_F_HASH_REPORT) | \
	 BIT_ULL(VIRTIO_NET_F_RSS) | \
	 BIT_ULL(VIRTIO_NET_F_RSC_EXT) | \
	 BIT_ULL(VIRTIO_NET_F_STANDBY) | \
	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | \
	 BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) | \
	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | \
	 BIT_ULL(VIRTIO_F_VERSION_1) | \
	 BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) | \
	 BIT_ULL(VIRTIO_F_RING_PACKED) | \
	 BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | \
	 BIT_ULL(VIRTIO_F_SR_IOV))
43 
/* Every virtio status bit this driver recognizes. print_status() warns about
 * any bit that falls outside this mask.
 */
#define VALID_STATUS_MASK \
	(VIRTIO_CONFIG_S_ACKNOWLEDGE | \
	 VIRTIO_CONFIG_S_DRIVER | \
	 VIRTIO_CONFIG_S_DRIVER_OK | \
	 VIRTIO_CONFIG_S_FEATURES_OK | \
	 VIRTIO_CONFIG_S_NEEDS_RESET | \
	 VIRTIO_CONFIG_S_FAILED)
47 
/* Evaluates to 1 if bit @_feature is set in @_mvdev->actual_features, else 0. */
#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
49 
/* Object numbers of the transport resources owned by the net device. */
struct mlx5_vdpa_net_resources {
	/* TIS number; filled in by create_tis(), released by destroy_tis() */
	u32 tisn;
	/* transport domain number programmed into the TIS context */
	u32 tdn;
	/* NOTE(review): presumably the TIR number - users are outside this part of the file */
	u32 tirn;
	/* NOTE(review): presumably the RQ table number - users are outside this part of the file */
	u32 rqtn;
	bool valid;
};
57 
/* Backing storage of a completion queue; set up by cq_frag_buf_alloc(). */
struct mlx5_vdpa_cq_buf {
	/* fragment-buffer control used by get_cqe() to locate a CQE */
	struct mlx5_frag_buf_ctrl fbc;
	/* the fragmented buffer holding the CQEs */
	struct mlx5_frag_buf frag_buf;
	/* size of one CQE in bytes (MLX5_VDPA_CQE_SIZE) */
	int cqe_size;
	/* number of CQEs in the buffer */
	int nent;
};
64 
/* Completion queue used by a virtqueue's notification channel. */
struct mlx5_vdpa_cq {
	/* core CQ object; cq_create() points its doorbell pointers into 'db' */
	struct mlx5_core_cq mcq;
	struct mlx5_vdpa_cq_buf buf;
	/* doorbell record: db.db[0] is the consumer index, db.db[1] the arm word */
	struct mlx5_db db;
	/* number of entries; get_sw_cqe() uses it as a mask, so it is expected
	 * to be a power of two
	 */
	int cqe;
};
71 
/* One of the three umem buffers handed to the device for a virtqueue. */
struct mlx5_vdpa_umem {
	struct mlx5_frag_buf_ctrl fbc;
	/* memory backing the umem; allocated by umem_frag_buf_alloc() */
	struct mlx5_frag_buf frag_buf;
	/* size in bytes, computed by set_umem_size() from device capabilities */
	int size;
	/* umem id returned by the CREATE_UMEM command */
	u32 id;
};
78 
/* One end of the QP pair that forms a virtqueue's notification channel. */
struct mlx5_vdpa_qp {
	struct mlx5_core_qp mqp;
	/* receive queue buffer; only allocated when 'fw' is false */
	struct mlx5_frag_buf frag_buf;
	/* doorbell record; only allocated when 'fw' is false */
	struct mlx5_db db;
	/* running count of posted receive entries, written to db.db[0] by rx_post() */
	u16 head;
	/* true for the QP that is created for the firmware's use */
	bool fw;
};
86 
/* Saved virtqueue parameters. The field names mirror those of
 * struct mlx5_vdpa_virtqueue.
 * NOTE(review): the code that fills and consumes this is outside this part of
 * the file - presumably used to re-create a virtqueue; confirm.
 */
struct mlx5_vq_restore_info {
	u32 num_ent;
	u64 desc_addr;
	u64 device_addr;
	u64 driver_addr;
	u16 avail_index;
	u16 used_index;
	bool ready;
	bool restore;
};
97 
/* Per-virtqueue state: ring addresses, the notification channel objects and
 * the umems that back the firmware virtqueue object.
 */
struct mlx5_vdpa_virtqueue {
	bool ready;
	/* ring addresses programmed into the virtqueue object by
	 * create_virtqueue(): descriptor, used (device) and available (driver)
	 */
	u64 desc_addr;
	u64 device_addr;
	u64 driver_addr;
	/* queue size in entries */
	u32 num_ent;

	/* Resources for implementing the notification channel from the device
	 * to the driver. fwqp is the firmware end of an RC connection; the
	 * other end is vqqp used by the driver. cq is where completions are
	 * reported.
	 */
	struct mlx5_vdpa_cq cq;
	struct mlx5_vdpa_qp fwqp;
	struct mlx5_vdpa_qp vqqp;

	/* umem resources are required for the virtqueue operation. Their use
	 * is internal and they must be provided by the driver.
	 */
	struct mlx5_vdpa_umem umem1;
	struct mlx5_vdpa_umem umem2;
	struct mlx5_vdpa_umem umem3;

	bool initialized;
	/* queue index; odd indices are TX queues (see vq_is_tx()) */
	int index;
	/* object id returned when the virtqueue object is created */
	u32 virtq_id;
	struct mlx5_vdpa_net *ndev;
	/* initial hardware available/used indices given to create_virtqueue() */
	u16 avail_idx;
	u16 used_idx;
	int fw_state;

	/* keep last in the struct */
	struct mlx5_vq_restore_info ri;
};
132 
/* Sizes the vqs[] and event_cbs[] arrays in struct mlx5_vdpa_net.
 *
 * We will remove this limitation once mlx5_vdpa_alloc_resources()
 * provides for driver space allocation
 */
#define MLX5_MAX_SUPPORTED_VQS 16
137 
138 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
139 {
140 	if (unlikely(idx > mvdev->max_idx))
141 		return false;
142 
143 	return true;
144 }
145 
/* Net-device state wrapped around the generic mlx5 vdpa device. */
struct mlx5_vdpa_net {
	/* embedded base device; see to_mlx5_vdpa_ndev() */
	struct mlx5_vdpa_dev mvdev;
	struct mlx5_vdpa_net_resources res;
	struct virtio_net_config config;
	struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
	/* per-queue callbacks, indexed by virtqueue index; one slot more than vqs[] */
	struct vdpa_callback event_cbs[MLX5_MAX_SUPPORTED_VQS + 1];

	/* Serialize vq resources creation and destruction. This is required
	 * since memory map might change and we need to destroy and create
	 * resources while the driver is operational.
	 */
	struct mutex reslock;
	struct mlx5_flow_table *rxft;
	struct mlx5_fc *rx_counter;
	struct mlx5_flow_handle *rx_rule;
	bool setup;
	u16 mtu;
	u32 cur_num_vqs;
};
165 
/* Forward declarations for helpers defined later in the file. */
static void free_resources(struct mlx5_vdpa_net *ndev);
static void init_mvqs(struct mlx5_vdpa_net *ndev);
static int setup_driver(struct mlx5_vdpa_dev *mvdev);
static void teardown_driver(struct mlx5_vdpa_net *ndev);

/* When false, print_status() and print_features() skip their verbose dumps. */
static bool mlx5_vdpa_debug;

/* NOTE(review): presumably the maximum number of control virtqueue entries -
 * not used in this part of the file; confirm.
 */
#define MLX5_CVQ_MAX_ENT 16
174 
/* Log the name of @_feature when that bit is set in 'features'. Both
 * 'features' and 'mvdev' must be variables in the caller's scope.
 */
#define MLX5_LOG_VIO_FLAG(_feature)                             \
	do {                                                    \
		if (features & BIT_ULL(_feature))               \
			mlx5_vdpa_info(mvdev, "%s\n", #_feature); \
	} while (0)

/* Log the name of @_status when any of its bits is set in 'status'. Both
 * 'status' and 'mvdev' must be variables in the caller's scope.
 */
#define MLX5_LOG_VIO_STAT(_status)                              \
	do {                                                    \
		if (status & (_status))                         \
			mlx5_vdpa_info(mvdev, "%s\n", #_status); \
	} while (0)
186 
187 /* TODO: cross-endian support */
188 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
189 {
190 	return virtio_legacy_is_little_endian() ||
191 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
192 }
193 
194 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
195 {
196 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
197 }
198 
199 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
200 {
201 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
202 }
203 
204 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
205 {
206 	return max_vqs / 2;
207 }
208 
209 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
210 {
211 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
212 		return 2;
213 
214 	return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs);
215 }
216 
217 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
218 {
219 	return idx == ctrl_vq_idx(mvdev);
220 }
221 
222 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
223 {
224 	if (status & ~VALID_STATUS_MASK)
225 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
226 			       status & ~VALID_STATUS_MASK);
227 
228 	if (!mlx5_vdpa_debug)
229 		return;
230 
231 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
232 	if (set && !status) {
233 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
234 		return;
235 	}
236 
237 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
238 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
239 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
240 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
241 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
242 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
243 }
244 
245 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
246 {
247 	if (features & ~VALID_FEATURES_MASK)
248 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
249 			       features & ~VALID_FEATURES_MASK);
250 
251 	if (!mlx5_vdpa_debug)
252 		return;
253 
254 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
255 	if (!features)
256 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
257 
258 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
278 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
279 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
280 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
281 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
282 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
283 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
284 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
285 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
286 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
287 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
288 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
289 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
290 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
291 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
292 }
293 
294 static int create_tis(struct mlx5_vdpa_net *ndev)
295 {
296 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
297 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
298 	void *tisc;
299 	int err;
300 
301 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
302 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
303 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
304 	if (err)
305 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
306 
307 	return err;
308 }
309 
310 static void destroy_tis(struct mlx5_vdpa_net *ndev)
311 {
312 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
313 }
314 
/* Size in bytes of one completion queue entry, and its base-2 logarithm. */
#define MLX5_VDPA_CQE_SIZE 64
#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
317 
318 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
319 {
320 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
321 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
322 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
323 	int err;
324 
325 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
326 				       ndev->mvdev.mdev->priv.numa_node);
327 	if (err)
328 		return err;
329 
330 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
331 
332 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
333 	buf->nent = nent;
334 
335 	return 0;
336 }
337 
338 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
339 {
340 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
341 
342 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
343 					ndev->mvdev.mdev->priv.numa_node);
344 }
345 
346 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
347 {
348 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
349 }
350 
351 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
352 {
353 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
354 }
355 
356 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
357 {
358 	struct mlx5_cqe64 *cqe64;
359 	void *cqe;
360 	int i;
361 
362 	for (i = 0; i < buf->nent; i++) {
363 		cqe = get_cqe(vcq, i);
364 		cqe64 = cqe;
365 		cqe64->op_own = MLX5_CQE_INVALID << 4;
366 	}
367 }
368 
369 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
370 {
371 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
372 
373 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
374 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
375 		return cqe64;
376 
377 	return NULL;
378 }
379 
380 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
381 {
382 	vqp->head += n;
383 	vqp->db.db[0] = cpu_to_be32(vqp->head);
384 }
385 
386 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
387 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
388 {
389 	struct mlx5_vdpa_qp *vqp;
390 	__be64 *pas;
391 	void *qpc;
392 
393 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
394 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
395 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
396 	if (vqp->fw) {
397 		/* Firmware QP is allocated by the driver for the firmware's
398 		 * use so we can skip part of the params as they will be chosen by firmware
399 		 */
400 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
401 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
402 		MLX5_SET(qpc, qpc, no_sq, 1);
403 		return;
404 	}
405 
406 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
407 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
408 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
409 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
410 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
411 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
412 	MLX5_SET(qpc, qpc, no_sq, 1);
413 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
414 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
415 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
416 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
417 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
418 }
419 
420 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
421 {
422 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
423 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
424 					ndev->mvdev.mdev->priv.numa_node);
425 }
426 
427 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
428 {
429 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
430 }
431 
432 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
433 		     struct mlx5_vdpa_qp *vqp)
434 {
435 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
436 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
437 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
438 	void *qpc;
439 	void *in;
440 	int err;
441 
442 	if (!vqp->fw) {
443 		vqp = &mvq->vqqp;
444 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
445 		if (err)
446 			return err;
447 
448 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
449 		if (err)
450 			goto err_db;
451 		inlen += vqp->frag_buf.npages * sizeof(__be64);
452 	}
453 
454 	in = kzalloc(inlen, GFP_KERNEL);
455 	if (!in) {
456 		err = -ENOMEM;
457 		goto err_kzalloc;
458 	}
459 
460 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
461 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
462 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
463 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
464 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
465 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
466 	if (!vqp->fw)
467 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
468 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
469 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
470 	kfree(in);
471 	if (err)
472 		goto err_kzalloc;
473 
474 	vqp->mqp.uid = ndev->mvdev.res.uid;
475 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
476 
477 	if (!vqp->fw)
478 		rx_post(vqp, mvq->num_ent);
479 
480 	return 0;
481 
482 err_kzalloc:
483 	if (!vqp->fw)
484 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
485 err_db:
486 	if (!vqp->fw)
487 		rq_buf_free(ndev, vqp);
488 
489 	return err;
490 }
491 
492 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
493 {
494 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
495 
496 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
497 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
498 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
499 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
500 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
501 	if (!vqp->fw) {
502 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
503 		rq_buf_free(ndev, vqp);
504 	}
505 }
506 
507 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
508 {
509 	return get_sw_cqe(cq, cq->mcq.cons_index);
510 }
511 
512 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
513 {
514 	struct mlx5_cqe64 *cqe64;
515 
516 	cqe64 = next_cqe_sw(vcq);
517 	if (!cqe64)
518 		return -EAGAIN;
519 
520 	vcq->mcq.cons_index++;
521 	return 0;
522 }
523 
524 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
525 {
526 	struct mlx5_vdpa_net *ndev = mvq->ndev;
527 	struct vdpa_callback *event_cb;
528 
529 	event_cb = &ndev->event_cbs[mvq->index];
530 	mlx5_cq_set_ci(&mvq->cq.mcq);
531 
532 	/* make sure CQ cosumer update is visible to the hardware before updating
533 	 * RX doorbell record.
534 	 */
535 	dma_wmb();
536 	rx_post(&mvq->vqqp, num);
537 	if (event_cb->callback)
538 		event_cb->callback(event_cb->private);
539 }
540 
541 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
542 {
543 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
544 	struct mlx5_vdpa_net *ndev = mvq->ndev;
545 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
546 	int num = 0;
547 
548 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
549 		num++;
550 		if (num > mvq->num_ent / 2) {
551 			/* If completions keep coming while we poll, we want to
552 			 * let the hardware know that we consumed them by
553 			 * updating the doorbell record.  We also let vdpa core
554 			 * know about this so it passes it on the virtio driver
555 			 * on the guest.
556 			 */
557 			mlx5_vdpa_handle_completions(mvq, num);
558 			num = 0;
559 		}
560 	}
561 
562 	if (num)
563 		mlx5_vdpa_handle_completions(mvq, num);
564 
565 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
566 }
567 
568 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
569 {
570 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
571 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
572 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
573 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
574 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
575 	__be64 *pas;
576 	int inlen;
577 	void *cqc;
578 	void *in;
579 	int err;
580 	int eqn;
581 
582 	err = mlx5_db_alloc(mdev, &vcq->db);
583 	if (err)
584 		return err;
585 
586 	vcq->mcq.set_ci_db = vcq->db.db;
587 	vcq->mcq.arm_db = vcq->db.db + 1;
588 	vcq->mcq.cqe_sz = 64;
589 
590 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
591 	if (err)
592 		goto err_db;
593 
594 	cq_frag_buf_init(vcq, &vcq->buf);
595 
596 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
597 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
598 	in = kzalloc(inlen, GFP_KERNEL);
599 	if (!in) {
600 		err = -ENOMEM;
601 		goto err_vzalloc;
602 	}
603 
604 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
605 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
606 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
607 
608 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
609 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
610 
611 	/* Use vector 0 by default. Consider adding code to choose least used
612 	 * vector.
613 	 */
614 	err = mlx5_vector2eqn(mdev, 0, &eqn);
615 	if (err)
616 		goto err_vec;
617 
618 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
619 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
620 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
621 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
622 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
623 
624 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
625 	if (err)
626 		goto err_vec;
627 
628 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
629 	vcq->cqe = num_ent;
630 	vcq->mcq.set_ci_db = vcq->db.db;
631 	vcq->mcq.arm_db = vcq->db.db + 1;
632 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
633 	kfree(in);
634 	return 0;
635 
636 err_vec:
637 	kfree(in);
638 err_vzalloc:
639 	cq_frag_buf_free(ndev, &vcq->buf);
640 err_db:
641 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
642 	return err;
643 }
644 
645 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
646 {
647 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
648 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
649 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
650 
651 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
652 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
653 		return;
654 	}
655 	cq_frag_buf_free(ndev, &vcq->buf);
656 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
657 }
658 
659 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
660 			  struct mlx5_vdpa_umem **umemp)
661 {
662 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
663 	int p_a;
664 	int p_b;
665 
666 	switch (num) {
667 	case 1:
668 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
669 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
670 		*umemp = &mvq->umem1;
671 		break;
672 	case 2:
673 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
674 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
675 		*umemp = &mvq->umem2;
676 		break;
677 	case 3:
678 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
679 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
680 		*umemp = &mvq->umem3;
681 		break;
682 	}
683 	(*umemp)->size = p_a * mvq->num_ent + p_b;
684 }
685 
686 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
687 {
688 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
689 }
690 
691 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
692 {
693 	int inlen;
694 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
695 	void *um;
696 	void *in;
697 	int err;
698 	__be64 *pas;
699 	struct mlx5_vdpa_umem *umem;
700 
701 	set_umem_size(ndev, mvq, num, &umem);
702 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
703 	if (err)
704 		return err;
705 
706 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
707 
708 	in = kzalloc(inlen, GFP_KERNEL);
709 	if (!in) {
710 		err = -ENOMEM;
711 		goto err_in;
712 	}
713 
714 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
715 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
716 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
717 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
718 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
719 
720 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
721 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
722 
723 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
724 	if (err) {
725 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
726 		goto err_cmd;
727 	}
728 
729 	kfree(in);
730 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
731 
732 	return 0;
733 
734 err_cmd:
735 	kfree(in);
736 err_in:
737 	umem_frag_buf_free(ndev, umem);
738 	return err;
739 }
740 
741 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
742 {
743 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
744 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
745 	struct mlx5_vdpa_umem *umem;
746 
747 	switch (num) {
748 	case 1:
749 		umem = &mvq->umem1;
750 		break;
751 	case 2:
752 		umem = &mvq->umem2;
753 		break;
754 	case 3:
755 		umem = &mvq->umem3;
756 		break;
757 	}
758 
759 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
760 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
761 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
762 		return;
763 
764 	umem_frag_buf_free(ndev, umem);
765 }
766 
/* Create umems 1..3 of @mvq. If one creation fails, the umems already created
 * are destroyed in reverse order and the error is returned.
 */
static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int num;
	int err;

	for (num = 1; num <= 3; num++) {
		err = create_umem(ndev, mvq, num);
		if (!err)
			continue;

		/* unwind only the umems created before the failing one */
		while (--num > 0)
			umem_destroy(ndev, mvq, num);

		return err;
	}

	return 0;
}
785 
/* Destroy umems 3, 2 and 1 of @mvq, in that order. */
static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int num = 3;

	while (num > 0) {
		umem_destroy(ndev, mvq, num);
		num--;
	}
}
793 
794 static int get_queue_type(struct mlx5_vdpa_net *ndev)
795 {
796 	u32 type_mask;
797 
798 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
799 
800 	/* prefer split queue */
801 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
802 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
803 
804 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
805 
806 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
807 }
808 
809 static bool vq_is_tx(u16 idx)
810 {
811 	return idx % 2;
812 }
813 
814 static u16 get_features_12_3(u64 features)
815 {
816 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
817 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
818 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
819 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
820 }
821 
822 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
823 {
824 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
825 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
826 	void *obj_context;
827 	void *cmd_hdr;
828 	void *vq_ctx;
829 	void *in;
830 	int err;
831 
832 	err = umems_create(ndev, mvq);
833 	if (err)
834 		return err;
835 
836 	in = kzalloc(inlen, GFP_KERNEL);
837 	if (!in) {
838 		err = -ENOMEM;
839 		goto err_alloc;
840 	}
841 
842 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
843 
844 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
845 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
846 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
847 
848 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
849 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
850 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
851 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
852 		 get_features_12_3(ndev->mvdev.actual_features));
853 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
854 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
855 
856 	if (vq_is_tx(mvq->index))
857 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
858 
859 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
860 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
861 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
862 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
863 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
864 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
865 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
866 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
867 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
868 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key);
869 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
870 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
871 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
872 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
873 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
874 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
875 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
876 	if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type))
877 		MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1);
878 
879 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
880 	if (err)
881 		goto err_cmd;
882 
883 	kfree(in);
884 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
885 
886 	return 0;
887 
888 err_cmd:
889 	kfree(in);
890 err_alloc:
891 	umems_destroy(ndev, mvq);
892 	return err;
893 }
894 
895 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
896 {
897 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
898 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
899 
900 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
901 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
902 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
903 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
904 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
905 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
906 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
907 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
908 		return;
909 	}
910 	umems_destroy(ndev, mvq);
911 }
912 
913 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
914 {
915 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
916 }
917 
918 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
919 {
920 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
921 }
922 
923 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
924 			int *outlen, u32 qpn, u32 rqpn)
925 {
926 	void *qpc;
927 	void *pp;
928 
929 	switch (cmd) {
930 	case MLX5_CMD_OP_2RST_QP:
931 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
932 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
933 		*in = kzalloc(*inlen, GFP_KERNEL);
934 		*out = kzalloc(*outlen, GFP_KERNEL);
935 		if (!*in || !*out)
936 			goto outerr;
937 
938 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
939 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
940 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
941 		break;
942 	case MLX5_CMD_OP_RST2INIT_QP:
943 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
944 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
945 		*in = kzalloc(*inlen, GFP_KERNEL);
946 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
947 		if (!*in || !*out)
948 			goto outerr;
949 
950 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
951 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
952 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
953 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
954 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
955 		MLX5_SET(qpc, qpc, rwe, 1);
956 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
957 		MLX5_SET(ads, pp, vhca_port_num, 1);
958 		break;
959 	case MLX5_CMD_OP_INIT2RTR_QP:
960 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
961 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
962 		*in = kzalloc(*inlen, GFP_KERNEL);
963 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
964 		if (!*in || !*out)
965 			goto outerr;
966 
967 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
968 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
969 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
970 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
971 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
972 		MLX5_SET(qpc, qpc, log_msg_max, 30);
973 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
974 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
975 		MLX5_SET(ads, pp, fl, 1);
976 		break;
977 	case MLX5_CMD_OP_RTR2RTS_QP:
978 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
979 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
980 		*in = kzalloc(*inlen, GFP_KERNEL);
981 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
982 		if (!*in || !*out)
983 			goto outerr;
984 
985 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
986 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
987 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
988 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
989 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
990 		MLX5_SET(ads, pp, ack_timeout, 14);
991 		MLX5_SET(qpc, qpc, retry_count, 7);
992 		MLX5_SET(qpc, qpc, rnr_retry, 7);
993 		break;
994 	default:
995 		goto outerr_nullify;
996 	}
997 
998 	return;
999 
1000 outerr:
1001 	kfree(*in);
1002 	kfree(*out);
1003 outerr_nullify:
1004 	*in = NULL;
1005 	*out = NULL;
1006 }
1007 
/* Release a pair of command buffers; either pointer may be NULL. */
static void free_inout(void *in, void *out)
{
	kfree(out);
	kfree(in);
}
1013 
1014 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1015  * firmware. The fw argument indicates whether the subjected QP is the one used
1016  * by firmware.
1017  */
1018 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1019 {
1020 	int outlen;
1021 	int inlen;
1022 	void *out;
1023 	void *in;
1024 	int err;
1025 
1026 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1027 	if (!in || !out)
1028 		return -ENOMEM;
1029 
1030 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1031 	free_inout(in, out);
1032 	return err;
1033 }
1034 
1035 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1036 {
1037 	int err;
1038 
1039 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1040 	if (err)
1041 		return err;
1042 
1043 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1044 	if (err)
1045 		return err;
1046 
1047 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1048 	if (err)
1049 		return err;
1050 
1051 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1052 	if (err)
1053 		return err;
1054 
1055 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1056 	if (err)
1057 		return err;
1058 
1059 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1060 	if (err)
1061 		return err;
1062 
1063 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1064 }
1065 
1066 struct mlx5_virtq_attr {
1067 	u8 state;
1068 	u16 available_index;
1069 	u16 used_index;
1070 };
1071 
1072 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1073 			   struct mlx5_virtq_attr *attr)
1074 {
1075 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1076 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1077 	void *out;
1078 	void *obj_context;
1079 	void *cmd_hdr;
1080 	int err;
1081 
1082 	out = kzalloc(outlen, GFP_KERNEL);
1083 	if (!out)
1084 		return -ENOMEM;
1085 
1086 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1087 
1088 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1089 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1090 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1091 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1092 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1093 	if (err)
1094 		goto err_cmd;
1095 
1096 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1097 	memset(attr, 0, sizeof(*attr));
1098 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1099 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1100 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1101 	kfree(out);
1102 	return 0;
1103 
1104 err_cmd:
1105 	kfree(out);
1106 	return err;
1107 }
1108 
1109 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1110 {
1111 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1112 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1113 	void *obj_context;
1114 	void *cmd_hdr;
1115 	void *in;
1116 	int err;
1117 
1118 	in = kzalloc(inlen, GFP_KERNEL);
1119 	if (!in)
1120 		return -ENOMEM;
1121 
1122 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1123 
1124 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1125 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1126 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1127 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1128 
1129 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1130 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1131 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1132 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1133 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1134 	kfree(in);
1135 	if (!err)
1136 		mvq->fw_state = state;
1137 
1138 	return err;
1139 }
1140 
1141 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1142 {
1143 	u16 idx = mvq->index;
1144 	int err;
1145 
1146 	if (!mvq->num_ent)
1147 		return 0;
1148 
1149 	if (mvq->initialized)
1150 		return 0;
1151 
1152 	err = cq_create(ndev, idx, mvq->num_ent);
1153 	if (err)
1154 		return err;
1155 
1156 	err = qp_create(ndev, mvq, &mvq->fwqp);
1157 	if (err)
1158 		goto err_fwqp;
1159 
1160 	err = qp_create(ndev, mvq, &mvq->vqqp);
1161 	if (err)
1162 		goto err_vqqp;
1163 
1164 	err = connect_qps(ndev, mvq);
1165 	if (err)
1166 		goto err_connect;
1167 
1168 	err = create_virtqueue(ndev, mvq);
1169 	if (err)
1170 		goto err_connect;
1171 
1172 	if (mvq->ready) {
1173 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1174 		if (err) {
1175 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1176 				       idx, err);
1177 			goto err_connect;
1178 		}
1179 	}
1180 
1181 	mvq->initialized = true;
1182 	return 0;
1183 
1184 err_connect:
1185 	qp_destroy(ndev, &mvq->vqqp);
1186 err_vqqp:
1187 	qp_destroy(ndev, &mvq->fwqp);
1188 err_fwqp:
1189 	cq_destroy(ndev, idx);
1190 	return err;
1191 }
1192 
1193 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1194 {
1195 	struct mlx5_virtq_attr attr;
1196 
1197 	if (!mvq->initialized)
1198 		return;
1199 
1200 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1201 		return;
1202 
1203 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1204 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1205 
1206 	if (query_virtqueue(ndev, mvq, &attr)) {
1207 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1208 		return;
1209 	}
1210 	mvq->avail_idx = attr.available_index;
1211 	mvq->used_idx = attr.used_index;
1212 }
1213 
1214 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1215 {
1216 	int i;
1217 
1218 	for (i = 0; i < MLX5_MAX_SUPPORTED_VQS; i++)
1219 		suspend_vq(ndev, &ndev->vqs[i]);
1220 }
1221 
1222 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1223 {
1224 	if (!mvq->initialized)
1225 		return;
1226 
1227 	suspend_vq(ndev, mvq);
1228 	destroy_virtqueue(ndev, mvq);
1229 	qp_destroy(ndev, &mvq->vqqp);
1230 	qp_destroy(ndev, &mvq->fwqp);
1231 	cq_destroy(ndev, mvq->index);
1232 	mvq->initialized = false;
1233 }
1234 
1235 static int create_rqt(struct mlx5_vdpa_net *ndev)
1236 {
1237 	__be32 *list;
1238 	int max_rqt;
1239 	void *rqtc;
1240 	int inlen;
1241 	void *in;
1242 	int i, j;
1243 	int err;
1244 
1245 	max_rqt = min_t(int, MLX5_MAX_SUPPORTED_VQS / 2,
1246 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1247 	if (max_rqt < 1)
1248 		return -EOPNOTSUPP;
1249 
1250 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1251 	in = kzalloc(inlen, GFP_KERNEL);
1252 	if (!in)
1253 		return -ENOMEM;
1254 
1255 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1256 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1257 
1258 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1259 	MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt);
1260 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1261 	for (i = 0, j = 0; j < max_rqt; j++) {
1262 		if (!ndev->vqs[j].initialized)
1263 			continue;
1264 
1265 		if (!vq_is_tx(ndev->vqs[j].index)) {
1266 			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
1267 			i++;
1268 		}
1269 	}
1270 	MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
1271 
1272 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1273 	kfree(in);
1274 	if (err)
1275 		return err;
1276 
1277 	return 0;
1278 }
1279 
1280 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1281 
1282 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1283 {
1284 	__be32 *list;
1285 	int max_rqt;
1286 	void *rqtc;
1287 	int inlen;
1288 	void *in;
1289 	int i, j;
1290 	int err;
1291 
1292 	max_rqt = min_t(int, ndev->cur_num_vqs / 2,
1293 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1294 	if (max_rqt < 1)
1295 		return -EOPNOTSUPP;
1296 
1297 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1298 	in = kzalloc(inlen, GFP_KERNEL);
1299 	if (!in)
1300 		return -ENOMEM;
1301 
1302 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1303 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1304 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1305 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1306 
1307 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1308 	for (i = 0, j = 0; j < num; j++) {
1309 		if (!ndev->vqs[j].initialized)
1310 			continue;
1311 
1312 		if (!vq_is_tx(ndev->vqs[j].index)) {
1313 			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
1314 			i++;
1315 		}
1316 	}
1317 	MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
1318 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1319 	kfree(in);
1320 	if (err)
1321 		return err;
1322 
1323 	return 0;
1324 }
1325 
1326 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1327 {
1328 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1329 }
1330 
1331 static int create_tir(struct mlx5_vdpa_net *ndev)
1332 {
1333 #define HASH_IP_L4PORTS                                                                            \
1334 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1335 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1336 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1337 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1338 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1339 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1340 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1341 	void *rss_key;
1342 	void *outer;
1343 	void *tirc;
1344 	void *in;
1345 	int err;
1346 
1347 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1348 	if (!in)
1349 		return -ENOMEM;
1350 
1351 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1352 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1353 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1354 
1355 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1356 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1357 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1358 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1359 
1360 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1361 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1362 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1363 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1364 
1365 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1366 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1367 
1368 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1369 	kfree(in);
1370 	return err;
1371 }
1372 
1373 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1374 {
1375 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1376 }
1377 
1378 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1379 {
1380 	struct mlx5_flow_destination dest[2] = {};
1381 	struct mlx5_flow_table_attr ft_attr = {};
1382 	struct mlx5_flow_act flow_act = {};
1383 	struct mlx5_flow_namespace *ns;
1384 	int err;
1385 
1386 	/* for now, one entry, match all, forward to tir */
1387 	ft_attr.max_fte = 1;
1388 	ft_attr.autogroup.max_num_groups = 1;
1389 
1390 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1391 	if (!ns) {
1392 		mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n");
1393 		return -EOPNOTSUPP;
1394 	}
1395 
1396 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1397 	if (IS_ERR(ndev->rxft))
1398 		return PTR_ERR(ndev->rxft);
1399 
1400 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1401 	if (IS_ERR(ndev->rx_counter)) {
1402 		err = PTR_ERR(ndev->rx_counter);
1403 		goto err_fc;
1404 	}
1405 
1406 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1407 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1408 	dest[0].tir_num = ndev->res.tirn;
1409 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1410 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1411 	ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2);
1412 	if (IS_ERR(ndev->rx_rule)) {
1413 		err = PTR_ERR(ndev->rx_rule);
1414 		ndev->rx_rule = NULL;
1415 		goto err_rule;
1416 	}
1417 
1418 	return 0;
1419 
1420 err_rule:
1421 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1422 err_fc:
1423 	mlx5_destroy_flow_table(ndev->rxft);
1424 	return err;
1425 }
1426 
1427 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1428 {
1429 	if (!ndev->rx_rule)
1430 		return;
1431 
1432 	mlx5_del_flow_rules(ndev->rx_rule);
1433 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1434 	mlx5_destroy_flow_table(ndev->rxft);
1435 
1436 	ndev->rx_rule = NULL;
1437 }
1438 
1439 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1440 {
1441 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1442 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1443 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1444 	struct mlx5_core_dev *pfmdev;
1445 	size_t read;
1446 	u8 mac[ETH_ALEN];
1447 
1448 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1449 	switch (cmd) {
1450 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1451 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1452 		if (read != ETH_ALEN)
1453 			break;
1454 
1455 		if (!memcmp(ndev->config.mac, mac, 6)) {
1456 			status = VIRTIO_NET_OK;
1457 			break;
1458 		}
1459 
1460 		if (!is_zero_ether_addr(ndev->config.mac)) {
1461 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1462 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1463 					       ndev->config.mac);
1464 				break;
1465 			}
1466 		}
1467 
1468 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1469 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1470 				       mac);
1471 			break;
1472 		}
1473 
1474 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1475 		status = VIRTIO_NET_OK;
1476 		break;
1477 
1478 	default:
1479 		break;
1480 	}
1481 
1482 	return status;
1483 }
1484 
1485 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1486 {
1487 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1488 	int cur_qps = ndev->cur_num_vqs / 2;
1489 	int err;
1490 	int i;
1491 
1492 	if (cur_qps > newqps) {
1493 		err = modify_rqt(ndev, 2 * newqps);
1494 		if (err)
1495 			return err;
1496 
1497 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1498 			teardown_vq(ndev, &ndev->vqs[i]);
1499 
1500 		ndev->cur_num_vqs = 2 * newqps;
1501 	} else {
1502 		ndev->cur_num_vqs = 2 * newqps;
1503 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1504 			err = setup_vq(ndev, &ndev->vqs[i]);
1505 			if (err)
1506 				goto clean_added;
1507 		}
1508 		err = modify_rqt(ndev, 2 * newqps);
1509 		if (err)
1510 			goto clean_added;
1511 	}
1512 	return 0;
1513 
1514 clean_added:
1515 	for (--i; i >= cur_qps; --i)
1516 		teardown_vq(ndev, &ndev->vqs[i]);
1517 
1518 	return err;
1519 }
1520 
1521 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1522 {
1523 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1524 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1525 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1526 	struct virtio_net_ctrl_mq mq;
1527 	size_t read;
1528 	u16 newqps;
1529 
1530 	switch (cmd) {
1531 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1532 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1533 		if (read != sizeof(mq))
1534 			break;
1535 
1536 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1537 		if (ndev->cur_num_vqs == 2 * newqps) {
1538 			status = VIRTIO_NET_OK;
1539 			break;
1540 		}
1541 
1542 		if (newqps & (newqps - 1))
1543 			break;
1544 
1545 		if (!change_num_qps(mvdev, newqps))
1546 			status = VIRTIO_NET_OK;
1547 
1548 		break;
1549 	default:
1550 		break;
1551 	}
1552 
1553 	return status;
1554 }
1555 
1556 static void mlx5_cvq_kick_handler(struct work_struct *work)
1557 {
1558 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1559 	struct virtio_net_ctrl_hdr ctrl;
1560 	struct mlx5_ctrl_wq_ent *wqent;
1561 	struct mlx5_vdpa_dev *mvdev;
1562 	struct mlx5_control_vq *cvq;
1563 	struct mlx5_vdpa_net *ndev;
1564 	size_t read, write;
1565 	int err;
1566 
1567 	wqent = container_of(work, struct mlx5_ctrl_wq_ent, work);
1568 	mvdev = wqent->mvdev;
1569 	ndev = to_mlx5_vdpa_ndev(mvdev);
1570 	cvq = &mvdev->cvq;
1571 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1572 		goto out;
1573 
1574 	if (!cvq->ready)
1575 		goto out;
1576 
1577 	while (true) {
1578 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
1579 					   GFP_ATOMIC);
1580 		if (err <= 0)
1581 			break;
1582 
1583 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
1584 		if (read != sizeof(ctrl))
1585 			break;
1586 
1587 		switch (ctrl.class) {
1588 		case VIRTIO_NET_CTRL_MAC:
1589 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
1590 			break;
1591 		case VIRTIO_NET_CTRL_MQ:
1592 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
1593 			break;
1594 
1595 		default:
1596 			break;
1597 		}
1598 
1599 		/* Make sure data is written before advancing index */
1600 		smp_wmb();
1601 
1602 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
1603 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
1604 		vringh_kiov_cleanup(&cvq->riov);
1605 		vringh_kiov_cleanup(&cvq->wiov);
1606 
1607 		if (vringh_need_notify_iotlb(&cvq->vring))
1608 			vringh_notify(&cvq->vring);
1609 	}
1610 out:
1611 	kfree(wqent);
1612 }
1613 
1614 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1615 {
1616 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1617 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1618 	struct mlx5_vdpa_virtqueue *mvq;
1619 	struct mlx5_ctrl_wq_ent *wqent;
1620 
1621 	if (!is_index_valid(mvdev, idx))
1622 		return;
1623 
1624 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
1625 		if (!mvdev->cvq.ready)
1626 			return;
1627 
1628 		wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
1629 		if (!wqent)
1630 			return;
1631 
1632 		wqent->mvdev = mvdev;
1633 		INIT_WORK(&wqent->work, mlx5_cvq_kick_handler);
1634 		queue_work(mvdev->wq, &wqent->work);
1635 		return;
1636 	}
1637 
1638 	mvq = &ndev->vqs[idx];
1639 	if (unlikely(!mvq->ready))
1640 		return;
1641 
1642 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1643 }
1644 
1645 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1646 				    u64 driver_area, u64 device_area)
1647 {
1648 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1649 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1650 	struct mlx5_vdpa_virtqueue *mvq;
1651 
1652 	if (!is_index_valid(mvdev, idx))
1653 		return -EINVAL;
1654 
1655 	if (is_ctrl_vq_idx(mvdev, idx)) {
1656 		mvdev->cvq.desc_addr = desc_area;
1657 		mvdev->cvq.device_addr = device_area;
1658 		mvdev->cvq.driver_addr = driver_area;
1659 		return 0;
1660 	}
1661 
1662 	mvq = &ndev->vqs[idx];
1663 	mvq->desc_addr = desc_area;
1664 	mvq->device_addr = device_area;
1665 	mvq->driver_addr = driver_area;
1666 	return 0;
1667 }
1668 
1669 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1670 {
1671 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1672 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1673 	struct mlx5_vdpa_virtqueue *mvq;
1674 
1675 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
1676 		return;
1677 
1678 	mvq = &ndev->vqs[idx];
1679 	mvq->num_ent = num;
1680 }
1681 
1682 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1683 {
1684 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1685 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1686 
1687 	ndev->event_cbs[idx] = *cb;
1688 }
1689 
1690 static void mlx5_cvq_notify(struct vringh *vring)
1691 {
1692 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
1693 
1694 	if (!cvq->event_cb.callback)
1695 		return;
1696 
1697 	cvq->event_cb.callback(cvq->event_cb.private);
1698 }
1699 
1700 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
1701 {
1702 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1703 
1704 	cvq->ready = ready;
1705 	if (!ready)
1706 		return;
1707 
1708 	cvq->vring.notify = mlx5_cvq_notify;
1709 }
1710 
1711 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1712 {
1713 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1714 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1715 	struct mlx5_vdpa_virtqueue *mvq;
1716 
1717 	if (!is_index_valid(mvdev, idx))
1718 		return;
1719 
1720 	if (is_ctrl_vq_idx(mvdev, idx)) {
1721 		set_cvq_ready(mvdev, ready);
1722 		return;
1723 	}
1724 
1725 	mvq = &ndev->vqs[idx];
1726 	if (!ready)
1727 		suspend_vq(ndev, mvq);
1728 
1729 	mvq->ready = ready;
1730 }
1731 
1732 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1733 {
1734 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1735 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1736 
1737 	if (!is_index_valid(mvdev, idx))
1738 		return false;
1739 
1740 	if (is_ctrl_vq_idx(mvdev, idx))
1741 		return mvdev->cvq.ready;
1742 
1743 	return ndev->vqs[idx].ready;
1744 }
1745 
1746 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1747 				  const struct vdpa_vq_state *state)
1748 {
1749 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1750 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1751 	struct mlx5_vdpa_virtqueue *mvq;
1752 
1753 	if (!is_index_valid(mvdev, idx))
1754 		return -EINVAL;
1755 
1756 	if (is_ctrl_vq_idx(mvdev, idx)) {
1757 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
1758 		return 0;
1759 	}
1760 
1761 	mvq = &ndev->vqs[idx];
1762 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1763 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1764 		return -EINVAL;
1765 	}
1766 
1767 	mvq->used_idx = state->split.avail_index;
1768 	mvq->avail_idx = state->split.avail_index;
1769 	return 0;
1770 }
1771 
1772 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1773 {
1774 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1775 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1776 	struct mlx5_vdpa_virtqueue *mvq;
1777 	struct mlx5_virtq_attr attr;
1778 	int err;
1779 
1780 	if (!is_index_valid(mvdev, idx))
1781 		return -EINVAL;
1782 
1783 	if (is_ctrl_vq_idx(mvdev, idx)) {
1784 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
1785 		return 0;
1786 	}
1787 
1788 	mvq = &ndev->vqs[idx];
1789 	/* If the virtq object was destroyed, use the value saved at
1790 	 * the last minute of suspend_vq. This caters for userspace
1791 	 * that cares about emulating the index after vq is stopped.
1792 	 */
1793 	if (!mvq->initialized) {
1794 		/* Firmware returns a wrong value for the available index.
1795 		 * Since both values should be identical, we take the value of
1796 		 * used_idx which is reported correctly.
1797 		 */
1798 		state->split.avail_index = mvq->used_idx;
1799 		return 0;
1800 	}
1801 
1802 	err = query_virtqueue(ndev, mvq, &attr);
1803 	if (err) {
1804 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1805 		return err;
1806 	}
1807 	state->split.avail_index = attr.used_index;
1808 	return 0;
1809 }
1810 
1811 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1812 {
1813 	return PAGE_SIZE;
1814 }
1815 
/* Feature bits as tested against the 16-bit device feature mask */
enum {
	MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
};
1821 
1822 static u64 mlx_to_vritio_features(u16 dev_features)
1823 {
1824 	u64 result = 0;
1825 
1826 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1827 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1828 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1829 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1830 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1831 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1832 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1833 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1834 
1835 	return result;
1836 }
1837 
1838 static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
1839 {
1840 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1841 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1842 	u16 dev_features;
1843 
1844 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
1845 	ndev->mvdev.mlx_features |= mlx_to_vritio_features(dev_features);
1846 	if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
1847 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1848 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1849 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
1850 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
1851 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MQ);
1852 
1853 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1854 	return ndev->mvdev.mlx_features;
1855 }
1856 
1857 static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1858 {
1859 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1860 		return -EOPNOTSUPP;
1861 
1862 	return 0;
1863 }
1864 
1865 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
1866 {
1867 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1868 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1869 	int err;
1870 	int i;
1871 
1872 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) {
1873 		err = setup_vq(ndev, &ndev->vqs[i]);
1874 		if (err)
1875 			goto err_vq;
1876 	}
1877 
1878 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
1879 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
1880 					MLX5_CVQ_MAX_ENT, false,
1881 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
1882 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
1883 					(struct vring_used *)(uintptr_t)cvq->device_addr);
1884 		if (err)
1885 			goto err_vq;
1886 	}
1887 
1888 	return 0;
1889 
1890 err_vq:
1891 	for (--i; i >= 0; i--)
1892 		teardown_vq(ndev, &ndev->vqs[i]);
1893 
1894 	return err;
1895 }
1896 
1897 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
1898 {
1899 	struct mlx5_vdpa_virtqueue *mvq;
1900 	int i;
1901 
1902 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
1903 		mvq = &ndev->vqs[i];
1904 		if (!mvq->initialized)
1905 			continue;
1906 
1907 		teardown_vq(ndev, mvq);
1908 	}
1909 }
1910 
1911 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
1912 {
1913 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
1914 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
1915 			/* MQ supported. CVQ index is right above the last data virtqueue's */
1916 			mvdev->max_idx = mvdev->max_vqs;
1917 		} else {
1918 			/* Only CVQ supportted. data virtqueues occupy indices 0 and 1.
1919 			 * CVQ gets index 2
1920 			 */
1921 			mvdev->max_idx = 2;
1922 		}
1923 	} else {
1924 		/* Two data virtqueues only: one for rx and one for tx */
1925 		mvdev->max_idx = 1;
1926 	}
1927 }
1928 
1929 static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
1930 {
1931 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1932 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1933 	int err;
1934 
1935 	print_features(mvdev, features, true);
1936 
1937 	err = verify_min_features(mvdev, features);
1938 	if (err)
1939 		return err;
1940 
1941 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
1942 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
1943 	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
1944 	update_cvq_info(mvdev);
1945 	return err;
1946 }
1947 
/* Config change callbacks are not implemented; @cb is ignored and a warning
 * is logged.
 */
static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
{
	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

	mlx5_vdpa_warn(mvdev, "set config callback not supported\n");
}
1953 
1954 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
1955 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
1956 {
1957 	return MLX5_VDPA_MAX_VQ_ENTRIES;
1958 }
1959 
1960 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
1961 {
1962 	return VIRTIO_ID_NET;
1963 }
1964 
1965 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
1966 {
1967 	return PCI_VENDOR_ID_MELLANOX;
1968 }
1969 
1970 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
1971 {
1972 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1973 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1974 
1975 	print_status(mvdev, ndev->mvdev.status, false);
1976 	return ndev->mvdev.status;
1977 }
1978 
1979 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1980 {
1981 	struct mlx5_vq_restore_info *ri = &mvq->ri;
1982 	struct mlx5_virtq_attr attr = {};
1983 	int err;
1984 
1985 	if (mvq->initialized) {
1986 		err = query_virtqueue(ndev, mvq, &attr);
1987 		if (err)
1988 			return err;
1989 	}
1990 
1991 	ri->avail_index = attr.available_index;
1992 	ri->used_index = attr.used_index;
1993 	ri->ready = mvq->ready;
1994 	ri->num_ent = mvq->num_ent;
1995 	ri->desc_addr = mvq->desc_addr;
1996 	ri->device_addr = mvq->device_addr;
1997 	ri->driver_addr = mvq->driver_addr;
1998 	ri->restore = true;
1999 	return 0;
2000 }
2001 
2002 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2003 {
2004 	int i;
2005 
2006 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2007 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2008 		save_channel_info(ndev, &ndev->vqs[i]);
2009 	}
2010 	return 0;
2011 }
2012 
2013 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2014 {
2015 	int i;
2016 
2017 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2018 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2019 }
2020 
2021 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2022 {
2023 	struct mlx5_vdpa_virtqueue *mvq;
2024 	struct mlx5_vq_restore_info *ri;
2025 	int i;
2026 
2027 	mlx5_clear_vqs(ndev);
2028 	init_mvqs(ndev);
2029 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2030 		mvq = &ndev->vqs[i];
2031 		ri = &mvq->ri;
2032 		if (!ri->restore)
2033 			continue;
2034 
2035 		mvq->avail_idx = ri->avail_index;
2036 		mvq->used_idx = ri->used_index;
2037 		mvq->ready = ri->ready;
2038 		mvq->num_ent = ri->num_ent;
2039 		mvq->desc_addr = ri->desc_addr;
2040 		mvq->device_addr = ri->device_addr;
2041 		mvq->driver_addr = ri->driver_addr;
2042 	}
2043 }
2044 
2045 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
2046 {
2047 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2048 	int err;
2049 
2050 	suspend_vqs(ndev);
2051 	err = save_channels_info(ndev);
2052 	if (err)
2053 		goto err_mr;
2054 
2055 	teardown_driver(ndev);
2056 	mlx5_vdpa_destroy_mr(mvdev);
2057 	err = mlx5_vdpa_create_mr(mvdev, iotlb);
2058 	if (err)
2059 		goto err_mr;
2060 
2061 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2062 		return 0;
2063 
2064 	restore_channels_info(ndev);
2065 	err = setup_driver(mvdev);
2066 	if (err)
2067 		goto err_setup;
2068 
2069 	return 0;
2070 
2071 err_setup:
2072 	mlx5_vdpa_destroy_mr(mvdev);
2073 err_mr:
2074 	return err;
2075 }
2076 
2077 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2078 {
2079 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2080 	int err;
2081 
2082 	mutex_lock(&ndev->reslock);
2083 	if (ndev->setup) {
2084 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2085 		err = 0;
2086 		goto out;
2087 	}
2088 	err = setup_virtqueues(mvdev);
2089 	if (err) {
2090 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2091 		goto out;
2092 	}
2093 
2094 	err = create_rqt(ndev);
2095 	if (err) {
2096 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2097 		goto err_rqt;
2098 	}
2099 
2100 	err = create_tir(ndev);
2101 	if (err) {
2102 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2103 		goto err_tir;
2104 	}
2105 
2106 	err = add_fwd_to_tir(ndev);
2107 	if (err) {
2108 		mlx5_vdpa_warn(mvdev, "add_fwd_to_tir\n");
2109 		goto err_fwd;
2110 	}
2111 	ndev->setup = true;
2112 	mutex_unlock(&ndev->reslock);
2113 
2114 	return 0;
2115 
2116 err_fwd:
2117 	destroy_tir(ndev);
2118 err_tir:
2119 	destroy_rqt(ndev);
2120 err_rqt:
2121 	teardown_virtqueues(ndev);
2122 out:
2123 	mutex_unlock(&ndev->reslock);
2124 	return err;
2125 }
2126 
2127 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2128 {
2129 	mutex_lock(&ndev->reslock);
2130 	if (!ndev->setup)
2131 		goto out;
2132 
2133 	remove_fwd_to_tir(ndev);
2134 	destroy_tir(ndev);
2135 	destroy_rqt(ndev);
2136 	teardown_virtqueues(ndev);
2137 	ndev->setup = false;
2138 out:
2139 	mutex_unlock(&ndev->reslock);
2140 }
2141 
2142 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2143 {
2144 	int i;
2145 
2146 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2147 		ndev->vqs[i].ready = false;
2148 }
2149 
2150 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2151 {
2152 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2153 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2154 	int err;
2155 
2156 	print_status(mvdev, status, true);
2157 
2158 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2159 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2160 			err = setup_driver(mvdev);
2161 			if (err) {
2162 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2163 				goto err_setup;
2164 			}
2165 		} else {
2166 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2167 			return;
2168 		}
2169 	}
2170 
2171 	ndev->mvdev.status = status;
2172 	return;
2173 
2174 err_setup:
2175 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2176 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2177 }
2178 
2179 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2180 {
2181 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2182 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2183 
2184 	print_status(mvdev, 0, true);
2185 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2186 	teardown_driver(ndev);
2187 	clear_vqs_ready(ndev);
2188 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2189 	ndev->mvdev.status = 0;
2190 	ndev->mvdev.mlx_features = 0;
2191 	memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs));
2192 	ndev->mvdev.actual_features = 0;
2193 	++mvdev->generation;
2194 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2195 		if (mlx5_vdpa_create_mr(mvdev, NULL))
2196 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2197 	}
2198 
2199 	return 0;
2200 }
2201 
2202 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
2203 {
2204 	return sizeof(struct virtio_net_config);
2205 }
2206 
2207 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2208 				 unsigned int len)
2209 {
2210 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2211 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2212 
2213 	if (offset + len <= sizeof(struct virtio_net_config))
2214 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2215 }
2216 
/*
 * .set_config hook. Writing the config space is not supported; the request
 * is ignored without any indication to the caller.
 */
static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
				 unsigned int len)
{
	/* intentionally empty: not supported */
}
2222 
2223 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2224 {
2225 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2226 
2227 	return mvdev->generation;
2228 }
2229 
2230 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
2231 {
2232 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2233 	bool change_map;
2234 	int err;
2235 
2236 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
2237 	if (err) {
2238 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2239 		return err;
2240 	}
2241 
2242 	if (change_map)
2243 		return mlx5_vdpa_change_map(mvdev, iotlb);
2244 
2245 	return 0;
2246 }
2247 
2248 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2249 {
2250 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2251 	struct mlx5_core_dev *pfmdev;
2252 	struct mlx5_vdpa_net *ndev;
2253 
2254 	ndev = to_mlx5_vdpa_ndev(mvdev);
2255 
2256 	free_resources(ndev);
2257 	mlx5_vdpa_destroy_mr(mvdev);
2258 	if (!is_zero_ether_addr(ndev->config.mac)) {
2259 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2260 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2261 	}
2262 	mlx5_vdpa_free_resources(&ndev->mvdev);
2263 	mutex_destroy(&ndev->reslock);
2264 }
2265 
2266 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2267 {
2268 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2269 	struct vdpa_notification_area ret = {};
2270 	struct mlx5_vdpa_net *ndev;
2271 	phys_addr_t addr;
2272 
2273 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2274 		return ret;
2275 
2276 	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
2277 	 * notification to avoid the risk of mapping pages that contain BAR of more
2278 	 * than one SF
2279 	 */
2280 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2281 		return ret;
2282 
2283 	ndev = to_mlx5_vdpa_ndev(mvdev);
2284 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2285 	ret.addr = addr;
2286 	ret.size = PAGE_SIZE;
2287 	return ret;
2288 }
2289 
2290 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
2291 {
2292 	return -EOPNOTSUPP;
2293 }
2294 
2295 static const struct vdpa_config_ops mlx5_vdpa_ops = {
2296 	.set_vq_address = mlx5_vdpa_set_vq_address,
2297 	.set_vq_num = mlx5_vdpa_set_vq_num,
2298 	.kick_vq = mlx5_vdpa_kick_vq,
2299 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
2300 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
2301 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
2302 	.set_vq_state = mlx5_vdpa_set_vq_state,
2303 	.get_vq_state = mlx5_vdpa_get_vq_state,
2304 	.get_vq_notification = mlx5_get_vq_notification,
2305 	.get_vq_irq = mlx5_get_vq_irq,
2306 	.get_vq_align = mlx5_vdpa_get_vq_align,
2307 	.get_features = mlx5_vdpa_get_features,
2308 	.set_features = mlx5_vdpa_set_features,
2309 	.set_config_cb = mlx5_vdpa_set_config_cb,
2310 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
2311 	.get_device_id = mlx5_vdpa_get_device_id,
2312 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
2313 	.get_status = mlx5_vdpa_get_status,
2314 	.set_status = mlx5_vdpa_set_status,
2315 	.reset = mlx5_vdpa_reset,
2316 	.get_config_size = mlx5_vdpa_get_config_size,
2317 	.get_config = mlx5_vdpa_get_config,
2318 	.set_config = mlx5_vdpa_set_config,
2319 	.get_generation = mlx5_vdpa_get_generation,
2320 	.set_map = mlx5_vdpa_set_map,
2321 	.free = mlx5_vdpa_free,
2322 };
2323 
2324 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
2325 {
2326 	u16 hw_mtu;
2327 	int err;
2328 
2329 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
2330 	if (err)
2331 		return err;
2332 
2333 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
2334 	return 0;
2335 }
2336 
2337 static int alloc_resources(struct mlx5_vdpa_net *ndev)
2338 {
2339 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2340 	int err;
2341 
2342 	if (res->valid) {
2343 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
2344 		return -EEXIST;
2345 	}
2346 
2347 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
2348 	if (err)
2349 		return err;
2350 
2351 	err = create_tis(ndev);
2352 	if (err)
2353 		goto err_tis;
2354 
2355 	res->valid = true;
2356 
2357 	return 0;
2358 
2359 err_tis:
2360 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2361 	return err;
2362 }
2363 
2364 static void free_resources(struct mlx5_vdpa_net *ndev)
2365 {
2366 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2367 
2368 	if (!res->valid)
2369 		return;
2370 
2371 	destroy_tis(ndev);
2372 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2373 	res->valid = false;
2374 }
2375 
2376 static void init_mvqs(struct mlx5_vdpa_net *ndev)
2377 {
2378 	struct mlx5_vdpa_virtqueue *mvq;
2379 	int i;
2380 
2381 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
2382 		mvq = &ndev->vqs[i];
2383 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2384 		mvq->index = i;
2385 		mvq->ndev = ndev;
2386 		mvq->fwqp.fw = true;
2387 	}
2388 	for (; i < ndev->mvdev.max_vqs; i++) {
2389 		mvq = &ndev->vqs[i];
2390 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2391 		mvq->index = i;
2392 		mvq->ndev = ndev;
2393 	}
2394 }
2395 
2396 struct mlx5_vdpa_mgmtdev {
2397 	struct vdpa_mgmt_dev mgtdev;
2398 	struct mlx5_adev *madev;
2399 	struct mlx5_vdpa_net *ndev;
2400 };
2401 
2402 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
2403 {
2404 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2405 	struct virtio_net_config *config;
2406 	struct mlx5_core_dev *pfmdev;
2407 	struct mlx5_vdpa_dev *mvdev;
2408 	struct mlx5_vdpa_net *ndev;
2409 	struct mlx5_core_dev *mdev;
2410 	u32 max_vqs;
2411 	int err;
2412 
2413 	if (mgtdev->ndev)
2414 		return -ENOSPC;
2415 
2416 	mdev = mgtdev->madev->mdev;
2417 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
2418 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
2419 		dev_warn(mdev->device, "missing support for split virtqueues\n");
2420 		return -EOPNOTSUPP;
2421 	}
2422 
2423 	/* we save one virtqueue for control virtqueue should we require it */
2424 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
2425 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
2426 
2427 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2428 				 name, false);
2429 	if (IS_ERR(ndev))
2430 		return PTR_ERR(ndev);
2431 
2432 	ndev->mvdev.max_vqs = max_vqs;
2433 	mvdev = &ndev->mvdev;
2434 	mvdev->mdev = mdev;
2435 	init_mvqs(ndev);
2436 	mutex_init(&ndev->reslock);
2437 	config = &ndev->config;
2438 	err = query_mtu(mdev, &ndev->mtu);
2439 	if (err)
2440 		goto err_mtu;
2441 
2442 	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2443 	if (err)
2444 		goto err_mtu;
2445 
2446 	if (!is_zero_ether_addr(config->mac)) {
2447 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2448 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2449 		if (err)
2450 			goto err_mtu;
2451 
2452 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2453 	}
2454 
2455 	config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs));
2456 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
2457 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2458 	if (err)
2459 		goto err_mpfs;
2460 
2461 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2462 		err = mlx5_vdpa_create_mr(mvdev, NULL);
2463 		if (err)
2464 			goto err_res;
2465 	}
2466 
2467 	err = alloc_resources(ndev);
2468 	if (err)
2469 		goto err_mr;
2470 
2471 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_ctrl_wq");
2472 	if (!mvdev->wq) {
2473 		err = -ENOMEM;
2474 		goto err_res2;
2475 	}
2476 
2477 	ndev->cur_num_vqs = 2 * mlx5_vdpa_max_qps(max_vqs);
2478 	mvdev->vdev.mdev = &mgtdev->mgtdev;
2479 	err = _vdpa_register_device(&mvdev->vdev, ndev->cur_num_vqs + 1);
2480 	if (err)
2481 		goto err_reg;
2482 
2483 	mgtdev->ndev = ndev;
2484 	return 0;
2485 
2486 err_reg:
2487 	destroy_workqueue(mvdev->wq);
2488 err_res2:
2489 	free_resources(ndev);
2490 err_mr:
2491 	mlx5_vdpa_destroy_mr(mvdev);
2492 err_res:
2493 	mlx5_vdpa_free_resources(&ndev->mvdev);
2494 err_mpfs:
2495 	if (!is_zero_ether_addr(config->mac))
2496 		mlx5_mpfs_del_mac(pfmdev, config->mac);
2497 err_mtu:
2498 	mutex_destroy(&ndev->reslock);
2499 	put_device(&mvdev->vdev.dev);
2500 	return err;
2501 }
2502 
2503 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
2504 {
2505 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2506 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
2507 
2508 	destroy_workqueue(mvdev->wq);
2509 	_vdpa_unregister_device(dev);
2510 	mgtdev->ndev = NULL;
2511 }
2512 
2513 static const struct vdpa_mgmtdev_ops mdev_ops = {
2514 	.dev_add = mlx5_vdpa_dev_add,
2515 	.dev_del = mlx5_vdpa_dev_del,
2516 };
2517 
2518 static struct virtio_device_id id_table[] = {
2519 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2520 	{ 0 },
2521 };
2522 
2523 static int mlx5v_probe(struct auxiliary_device *adev,
2524 		       const struct auxiliary_device_id *id)
2525 
2526 {
2527 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
2528 	struct mlx5_core_dev *mdev = madev->mdev;
2529 	struct mlx5_vdpa_mgmtdev *mgtdev;
2530 	int err;
2531 
2532 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
2533 	if (!mgtdev)
2534 		return -ENOMEM;
2535 
2536 	mgtdev->mgtdev.ops = &mdev_ops;
2537 	mgtdev->mgtdev.device = mdev->device;
2538 	mgtdev->mgtdev.id_table = id_table;
2539 	mgtdev->madev = madev;
2540 
2541 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
2542 	if (err)
2543 		goto reg_err;
2544 
2545 	dev_set_drvdata(&adev->dev, mgtdev);
2546 
2547 	return 0;
2548 
2549 reg_err:
2550 	kfree(mgtdev);
2551 	return err;
2552 }
2553 
2554 static void mlx5v_remove(struct auxiliary_device *adev)
2555 {
2556 	struct mlx5_vdpa_mgmtdev *mgtdev;
2557 
2558 	mgtdev = dev_get_drvdata(&adev->dev);
2559 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
2560 	kfree(mgtdev);
2561 }
2562 
2563 static const struct auxiliary_device_id mlx5v_id_table[] = {
2564 	{ .name = MLX5_ADEV_NAME ".vnet", },
2565 	{},
2566 };
2567 
2568 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2569 
2570 static struct auxiliary_driver mlx5v_driver = {
2571 	.name = "vnet",
2572 	.probe = mlx5v_probe,
2573 	.remove = mlx5v_remove,
2574 	.id_table = mlx5v_id_table,
2575 };
2576 
2577 module_auxiliary_driver(mlx5v_driver);
2578