xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision eed183ab)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <linux/virtio_config.h>
10 #include <linux/auxiliary_bus.h>
11 #include <linux/mlx5/cq.h>
12 #include <linux/mlx5/qp.h>
13 #include <linux/mlx5/device.h>
14 #include <linux/mlx5/driver.h>
15 #include <linux/mlx5/vport.h>
16 #include <linux/mlx5/fs.h>
17 #include <linux/mlx5/mlx5_ifc_vdpa.h>
18 #include <linux/mlx5/mpfs.h>
19 #include "mlx5_vdpa.h"
20 
21 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
22 MODULE_DESCRIPTION("Mellanox VDPA driver");
23 MODULE_LICENSE("Dual BSD/GPL");
24 
25 #define to_mlx5_vdpa_ndev(__mvdev)                                             \
26 	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
27 #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
28 
29 #define VALID_FEATURES_MASK                                                                        \
30 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
31 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
32 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
33 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
34 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
35 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
36 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
37 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
38 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
39 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
40 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
41 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
42 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
43 
44 #define VALID_STATUS_MASK                                                                          \
45 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
46 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
47 
48 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
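/* Usage sketch (hypothetical call site, not taken from this excerpt):
 * MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ) evaluates to 1 when the multiqueue
 * feature bit has been negotiated into actual_features, and to 0 otherwise.
 */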
49 
50 struct mlx5_vdpa_net_resources {
51 	u32 tisn;
52 	u32 tdn;
53 	u32 tirn;
54 	u32 rqtn;
55 	bool valid;
56 };
57 
58 struct mlx5_vdpa_cq_buf {
59 	struct mlx5_frag_buf_ctrl fbc;
60 	struct mlx5_frag_buf frag_buf;
61 	int cqe_size;
62 	int nent;
63 };
64 
65 struct mlx5_vdpa_cq {
66 	struct mlx5_core_cq mcq;
67 	struct mlx5_vdpa_cq_buf buf;
68 	struct mlx5_db db;
69 	int cqe;
70 };
71 
72 struct mlx5_vdpa_umem {
73 	struct mlx5_frag_buf_ctrl fbc;
74 	struct mlx5_frag_buf frag_buf;
75 	int size;
76 	u32 id;
77 };
78 
79 struct mlx5_vdpa_qp {
80 	struct mlx5_core_qp mqp;
81 	struct mlx5_frag_buf frag_buf;
82 	struct mlx5_db db;
83 	u16 head;
84 	bool fw;
85 };
86 
87 struct mlx5_vq_restore_info {
88 	u32 num_ent;
89 	u64 desc_addr;
90 	u64 device_addr;
91 	u64 driver_addr;
92 	u16 avail_index;
93 	u16 used_index;
94 	bool ready;
95 	bool restore;
96 };
97 
98 struct mlx5_vdpa_virtqueue {
99 	bool ready;
100 	u64 desc_addr;
101 	u64 device_addr;
102 	u64 driver_addr;
103 	u32 num_ent;
104 
105 	/* Resources for implementing the notification channel from the device
106 	 * to the driver. fwqp is the firmware end of an RC connection; the
107	 * other end is vqqp used by the driver. cq is where completions are
108 	 * reported.
109 	 */
110 	struct mlx5_vdpa_cq cq;
111 	struct mlx5_vdpa_qp fwqp;
112 	struct mlx5_vdpa_qp vqqp;
113 
114	/* umem resources are required for the virtqueue operation. Their use
115 	 * is internal and they must be provided by the driver.
116 	 */
117 	struct mlx5_vdpa_umem umem1;
118 	struct mlx5_vdpa_umem umem2;
119 	struct mlx5_vdpa_umem umem3;
120 
121 	bool initialized;
122 	int index;
123 	u32 virtq_id;
124 	struct mlx5_vdpa_net *ndev;
125 	u16 avail_idx;
126 	u16 used_idx;
127 	int fw_state;
128 
129 	/* keep last in the struct */
130 	struct mlx5_vq_restore_info ri;
131 };
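/* Rough picture of the per-virtqueue resources above: the hardware virtqueue
 * object (virtq_id) reports its events through cq; fwqp and vqqp are the two
 * ends of a loopback RC connection that carries those notifications; and
 * umem1..umem3 are device-internal buffers whose sizes are derived from device
 * capabilities (see set_umem_size() below). ri snapshots the queue state,
 * presumably so it can be restored when the resources are destroyed and
 * recreated on a memory map change.
 */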
132 
133 /* We will remove this limitation once mlx5_vdpa_alloc_resources()
134  * provides for driver space allocation
135  */
136 #define MLX5_MAX_SUPPORTED_VQS 16
137 
138 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
139 {
140 	if (unlikely(idx > mvdev->max_idx))
141 		return false;
142 
143 	return true;
144 }
145 
146 struct mlx5_vdpa_net {
147 	struct mlx5_vdpa_dev mvdev;
148 	struct mlx5_vdpa_net_resources res;
149 	struct virtio_net_config config;
150 	struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
151 	struct vdpa_callback event_cbs[MLX5_MAX_SUPPORTED_VQS + 1];
152 
153 	/* Serialize vq resources creation and destruction. This is required
154	 * since the memory map might change and we need to destroy and create
155	 * resources while the driver is operational.
156 	 */
157 	struct mutex reslock;
158 	struct mlx5_flow_table *rxft;
159 	struct mlx5_fc *rx_counter;
160 	struct mlx5_flow_handle *rx_rule;
161 	bool setup;
162 	u16 mtu;
163 	u32 cur_num_vqs;
164 };
165 
166 static void free_resources(struct mlx5_vdpa_net *ndev);
167 static void init_mvqs(struct mlx5_vdpa_net *ndev);
168 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
169 static void teardown_driver(struct mlx5_vdpa_net *ndev);
170 
171 static bool mlx5_vdpa_debug;
172 
173 #define MLX5_CVQ_MAX_ENT 16
174 
175 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
176 	do {                                                                                       \
177 		if (features & BIT_ULL(_feature))                                                  \
178 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
179 	} while (0)
180 
181 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
182 	do {                                                                                       \
183 		if (status & (_status))                                                            \
184 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
185 	} while (0)
186 
187 /* TODO: cross-endian support */
188 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
189 {
190 	return virtio_legacy_is_little_endian() ||
191 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
192 }
193 
194 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
195 {
196 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
197 }
198 
199 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
200 {
201 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
202 }
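/* Example of how these helpers are used (informal): 16-bit values read from
 * the control virtqueue, e.g. virtio_net_ctrl_mq::virtqueue_pairs in
 * handle_ctrl_mq() below, go through mlx5vdpa16_to_cpu(), which treats them as
 * little-endian when VIRTIO_F_VERSION_1 has been negotiated or when legacy
 * little-endian applies.
 */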
203 
204 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
205 {
206 	return max_vqs / 2;
207 }
208 
209 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
210 {
211 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
212 		return 2;
213 
214 	return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs);
215 }
216 
217 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
218 {
219 	return idx == ctrl_vq_idx(mvdev);
220 }
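/* Worked example (informal): with VIRTIO_NET_F_MQ negotiated and max_vqs == 16,
 * data virtqueues occupy indices 0..15 and the control virtqueue is index 16;
 * without MQ only indices 0 and 1 carry data and the control virtqueue sits at
 * index 2, matching the virtio-net queue layout.
 */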
221 
222 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
223 {
224 	if (status & ~VALID_STATUS_MASK)
225 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
226 			       status & ~VALID_STATUS_MASK);
227 
228 	if (!mlx5_vdpa_debug)
229 		return;
230 
231 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
232 	if (set && !status) {
233 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
234 		return;
235 	}
236 
237 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
238 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
239 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
240 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
241 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
242 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
243 }
244 
245 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
246 {
247 	if (features & ~VALID_FEATURES_MASK)
248 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
249 			       features & ~VALID_FEATURES_MASK);
250 
251 	if (!mlx5_vdpa_debug)
252 		return;
253 
254 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
255 	if (!features)
256 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
257 
258 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
278 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
279 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
280 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
281 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
282 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
283 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
284 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
285 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
286 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
287 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
288 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
289 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
290 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
291 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
292 }
293 
294 static int create_tis(struct mlx5_vdpa_net *ndev)
295 {
296 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
297 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
298 	void *tisc;
299 	int err;
300 
301 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
302 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
303 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
304 	if (err)
305 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
306 
307 	return err;
308 }
309 
310 static void destroy_tis(struct mlx5_vdpa_net *ndev)
311 {
312 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
313 }
314 
315 #define MLX5_VDPA_CQE_SIZE 64
316 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
317 
318 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
319 {
320 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
321 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
322 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
323 	int err;
324 
325 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
326 				       ndev->mvdev.mdev->priv.numa_node);
327 	if (err)
328 		return err;
329 
330 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
331 
332 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
333 	buf->nent = nent;
334 
335 	return 0;
336 }
337 
338 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
339 {
340 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
341 
342 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
343 					ndev->mvdev.mdev->priv.numa_node);
344 }
345 
346 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
347 {
348 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
349 }
350 
351 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
352 {
353 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
354 }
355 
356 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
357 {
358 	struct mlx5_cqe64 *cqe64;
359 	void *cqe;
360 	int i;
361 
362 	for (i = 0; i < buf->nent; i++) {
363 		cqe = get_cqe(vcq, i);
364 		cqe64 = cqe;
365 		cqe64->op_own = MLX5_CQE_INVALID << 4;
366 	}
367 }
368 
369 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
370 {
371 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
372 
373 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
374 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
375 		return cqe64;
376 
377 	return NULL;
378 }
379 
380 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
381 {
382 	vqp->head += n;
383 	vqp->db.db[0] = cpu_to_be32(vqp->head);
384 }
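/* rx_post() advances the driver-side receive counter and publishes it
 * (big-endian) through the QP's doorbell record; presumably this is how the
 * device learns that n more receive slots are available on vqqp.
 */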
385 
386 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
387 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
388 {
389 	struct mlx5_vdpa_qp *vqp;
390 	__be64 *pas;
391 	void *qpc;
392 
393 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
394 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
395 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
396 	if (vqp->fw) {
397 		/* Firmware QP is allocated by the driver for the firmware's
398		 * use, so we can skip part of the params as they will be chosen by the firmware
399 		 */
400 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
401 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
402 		MLX5_SET(qpc, qpc, no_sq, 1);
403 		return;
404 	}
405 
406 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
407 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
408 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
409 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
410 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
411 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
412 	MLX5_SET(qpc, qpc, no_sq, 1);
413 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
414 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
415 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
416 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
417 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
418 }
419 
420 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
421 {
422 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
423 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
424 					ndev->mvdev.mdev->priv.numa_node);
425 }
426 
427 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
428 {
429 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
430 }
431 
432 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
433 		     struct mlx5_vdpa_qp *vqp)
434 {
435 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
436 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
437 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
438 	void *qpc;
439 	void *in;
440 	int err;
441 
442 	if (!vqp->fw) {
443 		vqp = &mvq->vqqp;
444 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
445 		if (err)
446 			return err;
447 
448 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
449 		if (err)
450 			goto err_db;
451 		inlen += vqp->frag_buf.npages * sizeof(__be64);
452 	}
453 
454 	in = kzalloc(inlen, GFP_KERNEL);
455 	if (!in) {
456 		err = -ENOMEM;
457 		goto err_kzalloc;
458 	}
459 
460 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
461 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
462 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
463 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
464 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
465 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
466 	if (!vqp->fw)
467 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
468 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
469 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
470 	kfree(in);
471 	if (err)
472 		goto err_kzalloc;
473 
474 	vqp->mqp.uid = ndev->mvdev.res.uid;
475 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
476 
477 	if (!vqp->fw)
478 		rx_post(vqp, mvq->num_ent);
479 
480 	return 0;
481 
482 err_kzalloc:
483 	if (!vqp->fw)
484 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
485 err_db:
486 	if (!vqp->fw)
487 		rq_buf_free(ndev, vqp);
488 
489 	return err;
490 }
491 
492 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
493 {
494 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
495 
496 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
497 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
498 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
499 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
500 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
501 	if (!vqp->fw) {
502 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
503 		rq_buf_free(ndev, vqp);
504 	}
505 }
506 
507 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
508 {
509 	return get_sw_cqe(cq, cq->mcq.cons_index);
510 }
511 
512 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
513 {
514 	struct mlx5_cqe64 *cqe64;
515 
516 	cqe64 = next_cqe_sw(vcq);
517 	if (!cqe64)
518 		return -EAGAIN;
519 
520 	vcq->mcq.cons_index++;
521 	return 0;
522 }
523 
524 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
525 {
526 	struct mlx5_vdpa_net *ndev = mvq->ndev;
527 	struct vdpa_callback *event_cb;
528 
529 	event_cb = &ndev->event_cbs[mvq->index];
530 	mlx5_cq_set_ci(&mvq->cq.mcq);
531 
532	/* make sure the CQ consumer update is visible to the hardware before
533	 * updating the RX doorbell record.
534 	 */
535 	dma_wmb();
536 	rx_post(&mvq->vqqp, num);
537 	if (event_cb->callback)
538 		event_cb->callback(event_cb->private);
539 }
540 
541 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
542 {
543 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
544 	struct mlx5_vdpa_net *ndev = mvq->ndev;
545 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
546 	int num = 0;
547 
548 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
549 		num++;
550 		if (num > mvq->num_ent / 2) {
551 			/* If completions keep coming while we poll, we want to
552 			 * let the hardware know that we consumed them by
553 			 * updating the doorbell record.  We also let vdpa core
554			 * know about this so it passes it on to the virtio driver
555			 * in the guest.
556 			 */
557 			mlx5_vdpa_handle_completions(mvq, num);
558 			num = 0;
559 		}
560 	}
561 
562 	if (num)
563 		mlx5_vdpa_handle_completions(mvq, num);
564 
565 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
566 }
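/* Completion flow in mlx5_vdpa_cq_comp(), roughly: CQEs are polled one at a
 * time; after every num_ent/2 completions the CQ consumer index is committed,
 * the receive QP is replenished via rx_post() and the vdpa callback fires
 * (all in mlx5_vdpa_handle_completions()); once the CQ is drained it is
 * re-armed so the next completion raises another event.
 */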
567 
568 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
569 {
570 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
571 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
572 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
573 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
574 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
575 	__be64 *pas;
576 	int inlen;
577 	void *cqc;
578 	void *in;
579 	int err;
580 	int eqn;
581 
582 	err = mlx5_db_alloc(mdev, &vcq->db);
583 	if (err)
584 		return err;
585 
586 	vcq->mcq.set_ci_db = vcq->db.db;
587 	vcq->mcq.arm_db = vcq->db.db + 1;
588 	vcq->mcq.cqe_sz = 64;
589 
590 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
591 	if (err)
592 		goto err_db;
593 
594 	cq_frag_buf_init(vcq, &vcq->buf);
595 
596 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
597 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
598 	in = kzalloc(inlen, GFP_KERNEL);
599 	if (!in) {
600 		err = -ENOMEM;
601 		goto err_vzalloc;
602 	}
603 
604 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
605 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
606 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
607 
608 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
609 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
610 
611	/* Use vector 0 by default. Consider adding code to choose the least used
612 	 * vector.
613 	 */
614 	err = mlx5_vector2eqn(mdev, 0, &eqn);
615 	if (err)
616 		goto err_vec;
617 
618 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
619 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
620 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
621 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
622 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
623 
624 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
625 	if (err)
626 		goto err_vec;
627 
628 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
629 	vcq->cqe = num_ent;
630 	vcq->mcq.set_ci_db = vcq->db.db;
631 	vcq->mcq.arm_db = vcq->db.db + 1;
632 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
633 	kfree(in);
634 	return 0;
635 
636 err_vec:
637 	kfree(in);
638 err_vzalloc:
639 	cq_frag_buf_free(ndev, &vcq->buf);
640 err_db:
641 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
642 	return err;
643 }
644 
645 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
646 {
647 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
648 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
649 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
650 
651 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
652 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
653 		return;
654 	}
655 	cq_frag_buf_free(ndev, &vcq->buf);
656 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
657 }
658 
659 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
660 			  struct mlx5_vdpa_umem **umemp)
661 {
662 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
663 	int p_a;
664 	int p_b;
665 
666 	switch (num) {
667 	case 1:
668 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
669 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
670 		*umemp = &mvq->umem1;
671 		break;
672 	case 2:
673 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
674 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
675 		*umemp = &mvq->umem2;
676 		break;
677 	case 3:
678 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
679 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
680 		*umemp = &mvq->umem3;
681 		break;
682 	}
683 	(*umemp)->size = p_a * mvq->num_ent + p_b;
684 }
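/* The umem size is a linear function of the queue depth:
 * size = umem_N_buffer_param_a * num_ent + umem_N_buffer_param_b.
 * For example (hypothetical capability values), param_a = 128, param_b = 4096
 * and a 256-entry queue give 128 * 256 + 4096 = 36864 bytes for that umem.
 */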
685 
686 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
687 {
688 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
689 }
690 
691 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
692 {
693 	int inlen;
694 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
695 	void *um;
696 	void *in;
697 	int err;
698 	__be64 *pas;
699 	struct mlx5_vdpa_umem *umem;
700 
701 	set_umem_size(ndev, mvq, num, &umem);
702 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
703 	if (err)
704 		return err;
705 
706 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
707 
708 	in = kzalloc(inlen, GFP_KERNEL);
709 	if (!in) {
710 		err = -ENOMEM;
711 		goto err_in;
712 	}
713 
714 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
715 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
716 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
717 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
718 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
719 
720 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
721 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
722 
723 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
724 	if (err) {
725 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
726 		goto err_cmd;
727 	}
728 
729 	kfree(in);
730 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
731 
732 	return 0;
733 
734 err_cmd:
735 	kfree(in);
736 err_in:
737 	umem_frag_buf_free(ndev, umem);
738 	return err;
739 }
740 
741 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
742 {
743 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
744 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
745 	struct mlx5_vdpa_umem *umem;
746 
747 	switch (num) {
748 	case 1:
749 		umem = &mvq->umem1;
750 		break;
751 	case 2:
752 		umem = &mvq->umem2;
753 		break;
754 	case 3:
755 		umem = &mvq->umem3;
756 		break;
757 	}
758 
759 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
760 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
761 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
762 		return;
763 
764 	umem_frag_buf_free(ndev, umem);
765 }
766 
767 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
768 {
769 	int num;
770 	int err;
771 
772 	for (num = 1; num <= 3; num++) {
773 		err = create_umem(ndev, mvq, num);
774 		if (err)
775 			goto err_umem;
776 	}
777 	return 0;
778 
779 err_umem:
780 	for (num--; num > 0; num--)
781 		umem_destroy(ndev, mvq, num);
782 
783 	return err;
784 }
785 
786 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
787 {
788 	int num;
789 
790 	for (num = 3; num > 0; num--)
791 		umem_destroy(ndev, mvq, num);
792 }
793 
794 static int get_queue_type(struct mlx5_vdpa_net *ndev)
795 {
796 	u32 type_mask;
797 
798 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
799 
800 	/* prefer split queue */
801 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
802 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
803 
804 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
805 
806 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
807 }
808 
809 static bool vq_is_tx(u16 idx)
810 {
811 	return idx % 2;
812 }
813 
814 static u16 get_features_12_3(u64 features)
815 {
816 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
817 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
818 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
819 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
820 }
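/* get_features_12_3() packs a subset of the negotiated virtio feature bits into
 * the layout expected by queue_feature_bit_mask_12_3 (presumably covering
 * device feature bits 3..12): HOST_TSO4 lands in bit 9, HOST_TSO6 in bit 8,
 * CSUM in bit 7 and GUEST_CSUM in bit 6.
 */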
821 
822 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
823 {
824 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
825 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
826 	void *obj_context;
827 	void *cmd_hdr;
828 	void *vq_ctx;
829 	void *in;
830 	int err;
831 
832 	err = umems_create(ndev, mvq);
833 	if (err)
834 		return err;
835 
836 	in = kzalloc(inlen, GFP_KERNEL);
837 	if (!in) {
838 		err = -ENOMEM;
839 		goto err_alloc;
840 	}
841 
842 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
843 
844 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
845 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
846 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
847 
848 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
849 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
850 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
851 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
852 		 get_features_12_3(ndev->mvdev.actual_features));
853 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
854 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
855 
856 	if (vq_is_tx(mvq->index))
857 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
858 
859 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
860 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
861 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
862 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
863 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
864 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
865 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
866 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
867 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
868 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key);
869 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
870 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
871 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
872 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
873 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
874 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
875 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
876 	if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type))
877 		MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1);
878 
879 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
880 	if (err)
881 		goto err_cmd;
882 
883 	kfree(in);
884 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
885 
886 	return 0;
887 
888 err_cmd:
889 	kfree(in);
890 err_alloc:
891 	umems_destroy(ndev, mvq);
892 	return err;
893 }
894 
895 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
896 {
897 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
898 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
899 
900 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
901 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
902 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
903 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
904 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
905 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
906 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
907 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
908 		return;
909 	}
910 	umems_destroy(ndev, mvq);
911 }
912 
913 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
914 {
915 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
916 }
917 
918 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
919 {
920 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
921 }
922 
923 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
924 			int *outlen, u32 qpn, u32 rqpn)
925 {
926 	void *qpc;
927 	void *pp;
928 
929 	switch (cmd) {
930 	case MLX5_CMD_OP_2RST_QP:
931 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
932 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
933 		*in = kzalloc(*inlen, GFP_KERNEL);
934 		*out = kzalloc(*outlen, GFP_KERNEL);
935 		if (!*in || !*out)
936 			goto outerr;
937 
938 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
939 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
940 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
941 		break;
942 	case MLX5_CMD_OP_RST2INIT_QP:
943 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
944 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
945 		*in = kzalloc(*inlen, GFP_KERNEL);
946 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
947 		if (!*in || !*out)
948 			goto outerr;
949 
950 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
951 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
952 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
953 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
954 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
955 		MLX5_SET(qpc, qpc, rwe, 1);
956 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
957 		MLX5_SET(ads, pp, vhca_port_num, 1);
958 		break;
959 	case MLX5_CMD_OP_INIT2RTR_QP:
960 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
961 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
962 		*in = kzalloc(*inlen, GFP_KERNEL);
963 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
964 		if (!*in || !*out)
965 			goto outerr;
966 
967 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
968 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
969 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
970 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
971 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
972 		MLX5_SET(qpc, qpc, log_msg_max, 30);
973 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
974 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
975 		MLX5_SET(ads, pp, fl, 1);
976 		break;
977 	case MLX5_CMD_OP_RTR2RTS_QP:
978 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
979 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
980 		*in = kzalloc(*inlen, GFP_KERNEL);
981 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
982 		if (!*in || !*out)
983 			goto outerr;
984 
985 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
986 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
987 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
988 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
989 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
990 		MLX5_SET(ads, pp, ack_timeout, 14);
991 		MLX5_SET(qpc, qpc, retry_count, 7);
992 		MLX5_SET(qpc, qpc, rnr_retry, 7);
993 		break;
994 	default:
995 		goto outerr_nullify;
996 	}
997 
998 	return;
999 
1000 outerr:
1001 	kfree(*in);
1002 	kfree(*out);
1003 outerr_nullify:
1004 	*in = NULL;
1005 	*out = NULL;
1006 }
1007 
1008 static void free_inout(void *in, void *out)
1009 {
1010 	kfree(in);
1011 	kfree(out);
1012 }
1013 
1014 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1015  * firmware. The fw argument indicates whether the QP in question is the one
1016  * used by the firmware.
1017  */
1018 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1019 {
1020 	int outlen;
1021 	int inlen;
1022 	void *out;
1023 	void *in;
1024 	int err;
1025 
1026 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1027 	if (!in || !out)
1028 		return -ENOMEM;
1029 
1030 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1031 	free_inout(in, out);
1032 	return err;
1033 }
1034 
1035 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1036 {
1037 	int err;
1038 
1039 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1040 	if (err)
1041 		return err;
1042 
1043 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1044 	if (err)
1045 		return err;
1046 
1047 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1048 	if (err)
1049 		return err;
1050 
1051 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1052 	if (err)
1053 		return err;
1054 
1055 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1056 	if (err)
1057 		return err;
1058 
1059 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1060 	if (err)
1061 		return err;
1062 
1063 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1064 }
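/* connect_qps() walks both QPs through the usual RC state machine:
 * RESET -> INIT -> RTR for the firmware QP and the driver QP alike, with each
 * side's remote_qpn pointing at the other, and finally RTR -> RTS for the
 * firmware QP only, presumably because only the firmware end transmits on this
 * notification channel.
 */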
1065 
1066 struct mlx5_virtq_attr {
1067 	u8 state;
1068 	u16 available_index;
1069 	u16 used_index;
1070 };
1071 
1072 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1073 			   struct mlx5_virtq_attr *attr)
1074 {
1075 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1076 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1077 	void *out;
1078 	void *obj_context;
1079 	void *cmd_hdr;
1080 	int err;
1081 
1082 	out = kzalloc(outlen, GFP_KERNEL);
1083 	if (!out)
1084 		return -ENOMEM;
1085 
1086 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1087 
1088 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1089 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1090 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1091 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1092 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1093 	if (err)
1094 		goto err_cmd;
1095 
1096 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1097 	memset(attr, 0, sizeof(*attr));
1098 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1099 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1100 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1101 	kfree(out);
1102 	return 0;
1103 
1104 err_cmd:
1105 	kfree(out);
1106 	return err;
1107 }
1108 
1109 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1110 {
1111 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1112 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1113 	void *obj_context;
1114 	void *cmd_hdr;
1115 	void *in;
1116 	int err;
1117 
1118 	in = kzalloc(inlen, GFP_KERNEL);
1119 	if (!in)
1120 		return -ENOMEM;
1121 
1122 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1123 
1124 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1125 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1126 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1127 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1128 
1129 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1130 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1131 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1132 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1133 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1134 	kfree(in);
1135 	if (!err)
1136 		mvq->fw_state = state;
1137 
1138 	return err;
1139 }
1140 
1141 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1142 {
1143 	u16 idx = mvq->index;
1144 	int err;
1145 
1146 	if (!mvq->num_ent)
1147 		return 0;
1148 
1149 	if (mvq->initialized)
1150 		return 0;
1151 
1152 	err = cq_create(ndev, idx, mvq->num_ent);
1153 	if (err)
1154 		return err;
1155 
1156 	err = qp_create(ndev, mvq, &mvq->fwqp);
1157 	if (err)
1158 		goto err_fwqp;
1159 
1160 	err = qp_create(ndev, mvq, &mvq->vqqp);
1161 	if (err)
1162 		goto err_vqqp;
1163 
1164 	err = connect_qps(ndev, mvq);
1165 	if (err)
1166 		goto err_connect;
1167 
1168 	err = create_virtqueue(ndev, mvq);
1169 	if (err)
1170 		goto err_connect;
1171 
1172 	if (mvq->ready) {
1173 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1174 		if (err) {
1175 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1176 				       idx, err);
1177 			goto err_connect;
1178 		}
1179 	}
1180 
1181 	mvq->initialized = true;
1182 	return 0;
1183 
1184 err_connect:
1185 	qp_destroy(ndev, &mvq->vqqp);
1186 err_vqqp:
1187 	qp_destroy(ndev, &mvq->fwqp);
1188 err_fwqp:
1189 	cq_destroy(ndev, idx);
1190 	return err;
1191 }
1192 
1193 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1194 {
1195 	struct mlx5_virtq_attr attr;
1196 
1197 	if (!mvq->initialized)
1198 		return;
1199 
1200 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1201 		return;
1202 
1203 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1204 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1205 
1206 	if (query_virtqueue(ndev, mvq, &attr)) {
1207 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1208 		return;
1209 	}
1210 	mvq->avail_idx = attr.available_index;
1211 	mvq->used_idx = attr.used_index;
1212 }
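/* suspend_vq() moves the hardware queue object to the SUSPEND state and then
 * snapshots hw_available_index/hw_used_index, so that mlx5_vdpa_get_vq_state()
 * can still report a meaningful index after the object has been destroyed.
 */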
1213 
1214 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1215 {
1216 	int i;
1217 
1218 	for (i = 0; i < MLX5_MAX_SUPPORTED_VQS; i++)
1219 		suspend_vq(ndev, &ndev->vqs[i]);
1220 }
1221 
1222 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1223 {
1224 	if (!mvq->initialized)
1225 		return;
1226 
1227 	suspend_vq(ndev, mvq);
1228 	destroy_virtqueue(ndev, mvq);
1229 	qp_destroy(ndev, &mvq->vqqp);
1230 	qp_destroy(ndev, &mvq->fwqp);
1231 	cq_destroy(ndev, mvq->index);
1232 	mvq->initialized = false;
1233 }
1234 
1235 static int create_rqt(struct mlx5_vdpa_net *ndev)
1236 {
1237 	__be32 *list;
1238 	int max_rqt;
1239 	void *rqtc;
1240 	int inlen;
1241 	void *in;
1242 	int i, j;
1243 	int err;
1244 
1245 	max_rqt = min_t(int, MLX5_MAX_SUPPORTED_VQS / 2,
1246 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1247 	if (max_rqt < 1)
1248 		return -EOPNOTSUPP;
1249 
1250 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1251 	in = kzalloc(inlen, GFP_KERNEL);
1252 	if (!in)
1253 		return -ENOMEM;
1254 
1255 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1256 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1257 
1258 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1259 	MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt);
1260 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1261 	for (i = 0, j = 0; j < max_rqt; j++) {
1262 		if (!ndev->vqs[j].initialized)
1263 			continue;
1264 
1265 		if (!vq_is_tx(ndev->vqs[j].index)) {
1266 			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
1267 			i++;
1268 		}
1269 	}
1270 	MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
1271 
1272 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1273 	kfree(in);
1274 	if (err)
1275 		return err;
1276 
1277 	return 0;
1278 }
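/* create_rqt() builds an RQT (receive queue table) whose entries are the
 * virtqueue object IDs of the initialized RX queues (the even indices). The
 * table size is capped by both MLX5_MAX_SUPPORTED_VQS / 2 and the device's
 * log_max_rqt_size capability; the TIR created below spreads received traffic
 * across these entries.
 */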
1279 
1280 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1281 
1282 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1283 {
1284 	__be32 *list;
1285 	int max_rqt;
1286 	void *rqtc;
1287 	int inlen;
1288 	void *in;
1289 	int i, j;
1290 	int err;
1291 
1292 	max_rqt = min_t(int, ndev->cur_num_vqs / 2,
1293 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1294 	if (max_rqt < 1)
1295 		return -EOPNOTSUPP;
1296 
1297 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1298 	in = kzalloc(inlen, GFP_KERNEL);
1299 	if (!in)
1300 		return -ENOMEM;
1301 
1302 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1303 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1304 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1305 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1306 
1307 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1308 	for (i = 0, j = 0; j < num; j++) {
1309 		if (!ndev->vqs[j].initialized)
1310 			continue;
1311 
1312 		if (!vq_is_tx(ndev->vqs[j].index)) {
1313 			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
1314 			i++;
1315 		}
1316 	}
1317 	MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
1318 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1319 	kfree(in);
1320 	if (err)
1321 		return err;
1322 
1323 	return 0;
1324 }
1325 
1326 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1327 {
1328 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1329 }
1330 
1331 static int create_tir(struct mlx5_vdpa_net *ndev)
1332 {
1333 #define HASH_IP_L4PORTS                                                                            \
1334 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1335 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1336 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1337 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1338 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1339 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1340 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1341 	void *rss_key;
1342 	void *outer;
1343 	void *tirc;
1344 	void *in;
1345 	int err;
1346 
1347 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1348 	if (!in)
1349 		return -ENOMEM;
1350 
1351 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1352 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1353 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1354 
1355 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1356 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1357 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1358 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1359 
1360 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1361 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1362 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1363 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1364 
1365 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1366 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1367 
1368 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1369 	kfree(in);
1370 	return err;
1371 }
1372 
1373 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1374 {
1375 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1376 }
1377 
1378 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1379 {
1380 	struct mlx5_flow_destination dest[2] = {};
1381 	struct mlx5_flow_table_attr ft_attr = {};
1382 	struct mlx5_flow_act flow_act = {};
1383 	struct mlx5_flow_namespace *ns;
1384 	int err;
1385 
1386 	/* for now, one entry, match all, forward to tir */
1387 	ft_attr.max_fte = 1;
1388 	ft_attr.autogroup.max_num_groups = 1;
1389 
1390 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1391 	if (!ns) {
1392 		mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n");
1393 		return -EOPNOTSUPP;
1394 	}
1395 
1396 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1397 	if (IS_ERR(ndev->rxft))
1398 		return PTR_ERR(ndev->rxft);
1399 
1400 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1401 	if (IS_ERR(ndev->rx_counter)) {
1402 		err = PTR_ERR(ndev->rx_counter);
1403 		goto err_fc;
1404 	}
1405 
1406 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1407 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1408 	dest[0].tir_num = ndev->res.tirn;
1409 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1410 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1411 	ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2);
1412 	if (IS_ERR(ndev->rx_rule)) {
1413 		err = PTR_ERR(ndev->rx_rule);
1414 		ndev->rx_rule = NULL;
1415 		goto err_rule;
1416 	}
1417 
1418 	return 0;
1419 
1420 err_rule:
1421 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1422 err_fc:
1423 	mlx5_destroy_flow_table(ndev->rxft);
1424 	return err;
1425 }
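/* Receive steering here is deliberately minimal: one auto-grouped flow table in
 * the bypass namespace holding a single match-all rule (NULL spec) that
 * forwards every packet to the TIR and increments a flow counter; there is no
 * per-MAC or per-VLAN classification in this path.
 */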
1426 
1427 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1428 {
1429 	if (!ndev->rx_rule)
1430 		return;
1431 
1432 	mlx5_del_flow_rules(ndev->rx_rule);
1433 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1434 	mlx5_destroy_flow_table(ndev->rxft);
1435 
1436 	ndev->rx_rule = NULL;
1437 }
1438 
1439 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1440 {
1441 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1442 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1443 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1444 	struct mlx5_core_dev *pfmdev;
1445 	size_t read;
1446 	u8 mac[ETH_ALEN];
1447 
1448 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1449 	switch (cmd) {
1450 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1451 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1452 		if (read != ETH_ALEN)
1453 			break;
1454 
1455 		if (!memcmp(ndev->config.mac, mac, 6)) {
1456 			status = VIRTIO_NET_OK;
1457 			break;
1458 		}
1459 
1460 		if (!is_zero_ether_addr(ndev->config.mac)) {
1461 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1462 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1463 					       ndev->config.mac);
1464 				break;
1465 			}
1466 		}
1467 
1468 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1469 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1470 				       mac);
1471 			break;
1472 		}
1473 
1474 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1475 		status = VIRTIO_NET_OK;
1476 		break;
1477 
1478 	default:
1479 		break;
1480 	}
1481 
1482 	return status;
1483 }
1484 
1485 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1486 {
1487 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1488 	int cur_qps = ndev->cur_num_vqs / 2;
1489 	int err;
1490 	int i;
1491 
1492 	if (cur_qps > newqps) {
1493 		err = modify_rqt(ndev, 2 * newqps);
1494 		if (err)
1495 			return err;
1496 
1497 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1498 			teardown_vq(ndev, &ndev->vqs[i]);
1499 
1500 		ndev->cur_num_vqs = 2 * newqps;
1501 	} else {
1502 		ndev->cur_num_vqs = 2 * newqps;
1503 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1504 			err = setup_vq(ndev, &ndev->vqs[i]);
1505 			if (err)
1506 				goto clean_added;
1507 		}
1508 		err = modify_rqt(ndev, 2 * newqps);
1509 		if (err)
1510 			goto clean_added;
1511 	}
1512 	return 0;
1513 
1514 clean_added:
1515 	for (--i; i >= cur_qps; --i)
1516 		teardown_vq(ndev, &ndev->vqs[i]);
1517 
1518 	return err;
1519 }
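/* change_num_qps(), in outline: when shrinking, the RQT is narrowed first and
 * the now-unused virtqueues are torn down afterwards; when growing, the new
 * virtqueues are set up first and the RQT is widened last, so the table never
 * references a queue that has been torn down or not yet created. On failure the
 * newly added queues are rolled back.
 */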
1520 
1521 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1522 {
1523 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1524 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1525 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1526 	struct virtio_net_ctrl_mq mq;
1527 	size_t read;
1528 	u16 newqps;
1529 
1530 	switch (cmd) {
1531 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1532 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1533 		if (read != sizeof(mq))
1534 			break;
1535 
1536 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1537 		if (ndev->cur_num_vqs == 2 * newqps) {
1538 			status = VIRTIO_NET_OK;
1539 			break;
1540 		}
1541 
1542 		if (newqps & (newqps - 1))
1543 			break;
1544 
1545 		if (!change_num_qps(mvdev, newqps))
1546 			status = VIRTIO_NET_OK;
1547 
1548 		break;
1549 	default:
1550 		break;
1551 	}
1552 
1553 	return status;
1554 }
1555 
1556 static void mlx5_cvq_kick_handler(struct work_struct *work)
1557 {
1558 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1559 	struct virtio_net_ctrl_hdr ctrl;
1560 	struct mlx5_ctrl_wq_ent *wqent;
1561 	struct mlx5_vdpa_dev *mvdev;
1562 	struct mlx5_control_vq *cvq;
1563 	struct mlx5_vdpa_net *ndev;
1564 	size_t read, write;
1565 	int err;
1566 
1567 	wqent = container_of(work, struct mlx5_ctrl_wq_ent, work);
1568 	mvdev = wqent->mvdev;
1569 	ndev = to_mlx5_vdpa_ndev(mvdev);
1570 	cvq = &mvdev->cvq;
1571 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1572 		goto out;
1573 
1574 	if (!cvq->ready)
1575 		goto out;
1576 
1577 	while (true) {
1578 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
1579 					   GFP_ATOMIC);
1580 		if (err <= 0)
1581 			break;
1582 
1583 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
1584 		if (read != sizeof(ctrl))
1585 			break;
1586 
1587 		switch (ctrl.class) {
1588 		case VIRTIO_NET_CTRL_MAC:
1589 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
1590 			break;
1591 		case VIRTIO_NET_CTRL_MQ:
1592 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
1593 			break;
1594 
1595 		default:
1596 			break;
1597 		}
1598 
1599 		/* Make sure data is written before advancing index */
1600 		smp_wmb();
1601 
1602 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
1603 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
1604 		vringh_kiov_cleanup(&cvq->riov);
1605 		vringh_kiov_cleanup(&cvq->wiov);
1606 
1607 		if (vringh_need_notify_iotlb(&cvq->vring))
1608 			vringh_notify(&cvq->vring);
1609 	}
1610 out:
1611 	kfree(wqent);
1612 }
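/* The control virtqueue is not offloaded to hardware; it is emulated here in
 * software with the vringh iotlb helpers: each descriptor chain is pulled, the
 * virtio_net_ctrl_hdr is parsed, MAC and MQ commands are dispatched to their
 * handlers, and the one-byte status is pushed back and completed on the used
 * ring.
 */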
1613 
1614 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1615 {
1616 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1617 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1618 	struct mlx5_vdpa_virtqueue *mvq;
1619 	struct mlx5_ctrl_wq_ent *wqent;
1620 
1621 	if (!is_index_valid(mvdev, idx))
1622 		return;
1623 
1624 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
1625 		if (!mvdev->cvq.ready)
1626 			return;
1627 
1628 		wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
1629 		if (!wqent)
1630 			return;
1631 
1632 		wqent->mvdev = mvdev;
1633 		INIT_WORK(&wqent->work, mlx5_cvq_kick_handler);
1634 		queue_work(mvdev->wq, &wqent->work);
1635 		return;
1636 	}
1637 
1638 	mvq = &ndev->vqs[idx];
1639 	if (unlikely(!mvq->ready))
1640 		return;
1641 
1642 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1643 }
1644 
1645 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1646 				    u64 driver_area, u64 device_area)
1647 {
1648 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1649 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1650 	struct mlx5_vdpa_virtqueue *mvq;
1651 
1652 	if (!is_index_valid(mvdev, idx))
1653 		return -EINVAL;
1654 
1655 	if (is_ctrl_vq_idx(mvdev, idx)) {
1656 		mvdev->cvq.desc_addr = desc_area;
1657 		mvdev->cvq.device_addr = device_area;
1658 		mvdev->cvq.driver_addr = driver_area;
1659 		return 0;
1660 	}
1661 
1662 	mvq = &ndev->vqs[idx];
1663 	mvq->desc_addr = desc_area;
1664 	mvq->device_addr = device_area;
1665 	mvq->driver_addr = driver_area;
1666 	return 0;
1667 }
1668 
1669 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1670 {
1671 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1672 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1673 	struct mlx5_vdpa_virtqueue *mvq;
1674 
1675 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
1676 		return;
1677 
1678 	mvq = &ndev->vqs[idx];
1679 	mvq->num_ent = num;
1680 }
1681 
1682 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1683 {
1684 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1685 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1686 
1687 	ndev->event_cbs[idx] = *cb;
1688 }
1689 
1690 static void mlx5_cvq_notify(struct vringh *vring)
1691 {
1692 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
1693 
1694 	if (!cvq->event_cb.callback)
1695 		return;
1696 
1697 	cvq->event_cb.callback(cvq->event_cb.private);
1698 }
1699 
1700 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
1701 {
1702 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1703 
1704 	cvq->ready = ready;
1705 	if (!ready)
1706 		return;
1707 
1708 	cvq->vring.notify = mlx5_cvq_notify;
1709 }
1710 
1711 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1712 {
1713 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1714 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1715 	struct mlx5_vdpa_virtqueue *mvq;
1716 
1717 	if (!mvdev->actual_features)
1718 		return;
1719 
1720 	if (!is_index_valid(mvdev, idx))
1721 		return;
1722 
1723 	if (is_ctrl_vq_idx(mvdev, idx)) {
1724 		set_cvq_ready(mvdev, ready);
1725 		return;
1726 	}
1727 
1728 	mvq = &ndev->vqs[idx];
1729 	if (!ready)
1730 		suspend_vq(ndev, mvq);
1731 
1732 	mvq->ready = ready;
1733 }
1734 
1735 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1736 {
1737 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1738 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1739 
1740 	if (!is_index_valid(mvdev, idx))
1741 		return false;
1742 
1743 	if (is_ctrl_vq_idx(mvdev, idx))
1744 		return mvdev->cvq.ready;
1745 
1746 	return ndev->vqs[idx].ready;
1747 }
1748 
1749 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1750 				  const struct vdpa_vq_state *state)
1751 {
1752 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1753 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1754 	struct mlx5_vdpa_virtqueue *mvq;
1755 
1756 	if (!is_index_valid(mvdev, idx))
1757 		return -EINVAL;
1758 
1759 	if (is_ctrl_vq_idx(mvdev, idx)) {
1760 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
1761 		return 0;
1762 	}
1763 
1764 	mvq = &ndev->vqs[idx];
1765 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1766 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1767 		return -EINVAL;
1768 	}
1769 
1770 	mvq->used_idx = state->split.avail_index;
1771 	mvq->avail_idx = state->split.avail_index;
1772 	return 0;
1773 }
1774 
1775 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1776 {
1777 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1778 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1779 	struct mlx5_vdpa_virtqueue *mvq;
1780 	struct mlx5_virtq_attr attr;
1781 	int err;
1782 
1783 	if (!is_index_valid(mvdev, idx))
1784 		return -EINVAL;
1785 
1786 	if (is_ctrl_vq_idx(mvdev, idx)) {
1787 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
1788 		return 0;
1789 	}
1790 
1791 	mvq = &ndev->vqs[idx];
1792 	/* If the virtq object was destroyed, use the value saved at the
1793 	 * end of suspend_vq. This caters for userspace that wants to
1794 	 * read the index after the VQ has been stopped.
1795 	 */
1796 	if (!mvq->initialized) {
1797 		/* Firmware returns a wrong value for the available index.
1798 		 * Since both values should be identical, we take the value of
1799 		 * used_idx which is reported correctly.
1800 		 */
1801 		state->split.avail_index = mvq->used_idx;
1802 		return 0;
1803 	}
1804 
1805 	err = query_virtqueue(ndev, mvq, &attr);
1806 	if (err) {
1807 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1808 		return err;
1809 	}
1810 	state->split.avail_index = attr.used_index;
1811 	return 0;
1812 }
1813 
1814 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1815 {
1816 	return PAGE_SIZE;
1817 }
1818 
1819 enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
1820 	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
1821 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
1822 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
1823 };
1824 
1825 static u64 mlx_to_virtio_features(u16 dev_features)
1826 {
1827 	u64 result = 0;
1828 
1829 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1830 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1831 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1832 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1833 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1834 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1835 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1836 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1837 
1838 	return result;
1839 }
1840 
1841 static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
1842 {
1843 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1844 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1845 	u16 dev_features;
1846 
1847 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
1848 	ndev->mvdev.mlx_features |= mlx_to_virtio_features(dev_features);
1849 	if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
1850 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1851 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1852 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
1853 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
1854 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MQ);
1855 
1856 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1857 	return ndev->mvdev.mlx_features;
1858 }
1859 
1860 static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1861 {
1862 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1863 		return -EOPNOTSUPP;
1864 
1865 	return 0;
1866 }
1867 
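/* Create the hardware virtqueue objects for all data VQs and, if
 * VIRTIO_NET_F_CTRL_VQ was negotiated, initialize the vringh IOTLB ring used
 * to emulate the control VQ in software. On failure, tear down the data VQs
 * created so far.
 */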
1868 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
1869 {
1870 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1871 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1872 	int err;
1873 	int i;
1874 
1875 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) {
1876 		err = setup_vq(ndev, &ndev->vqs[i]);
1877 		if (err)
1878 			goto err_vq;
1879 	}
1880 
1881 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
1882 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
1883 					MLX5_CVQ_MAX_ENT, false,
1884 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
1885 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
1886 					(struct vring_used *)(uintptr_t)cvq->device_addr);
1887 		if (err)
1888 			goto err_vq;
1889 	}
1890 
1891 	return 0;
1892 
1893 err_vq:
1894 	for (--i; i >= 0; i--)
1895 		teardown_vq(ndev, &ndev->vqs[i]);
1896 
1897 	return err;
1898 }
1899 
1900 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
1901 {
1902 	struct mlx5_vdpa_virtqueue *mvq;
1903 	int i;
1904 
1905 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
1906 		mvq = &ndev->vqs[i];
1907 		if (!mvq->initialized)
1908 			continue;
1909 
1910 		teardown_vq(ndev, mvq);
1911 	}
1912 }
1913 
1914 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
1915 {
1916 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
1917 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
1918 			/* MQ supported. CVQ index is right above the last data virtqueue's */
1919 			mvdev->max_idx = mvdev->max_vqs;
1920 		} else {
1921 			/* Only CVQ supported. Data virtqueues occupy indices 0 and 1;
1922 			 * CVQ gets index 2.
1923 			 */
1924 			mvdev->max_idx = 2;
1925 		}
1926 	} else {
1927 		/* Two data virtqueues only: one for rx and one for tx */
1928 		mvdev->max_idx = 1;
1929 	}
1930 }
1931 
1932 static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
1933 {
1934 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1935 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1936 	int err;
1937 
1938 	print_features(mvdev, features, true);
1939 
1940 	err = verify_min_features(mvdev, features);
1941 	if (err)
1942 		return err;
1943 
1944 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
1945 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
1946 	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
1947 	update_cvq_info(mvdev);
1948 	return err;
1949 }
1950 
1951 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
1952 {
1953 	/* not implemented */
1954 	mlx5_vdpa_warn(to_mvdev(vdev), "set config callback not supported\n");
1955 }
1956 
1957 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
1958 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
1959 {
1960 	return MLX5_VDPA_MAX_VQ_ENTRIES;
1961 }
1962 
1963 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
1964 {
1965 	return VIRTIO_ID_NET;
1966 }
1967 
1968 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
1969 {
1970 	return PCI_VENDOR_ID_MELLANOX;
1971 }
1972 
1973 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
1974 {
1975 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1976 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1977 
1978 	print_status(mvdev, ndev->mvdev.status, false);
1979 	return ndev->mvdev.status;
1980 }
1981 
1982 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1983 {
1984 	struct mlx5_vq_restore_info *ri = &mvq->ri;
1985 	struct mlx5_virtq_attr attr = {};
1986 	int err;
1987 
1988 	if (mvq->initialized) {
1989 		err = query_virtqueue(ndev, mvq, &attr);
1990 		if (err)
1991 			return err;
1992 	}
1993 
1994 	ri->avail_index = attr.available_index;
1995 	ri->used_index = attr.used_index;
1996 	ri->ready = mvq->ready;
1997 	ri->num_ent = mvq->num_ent;
1998 	ri->desc_addr = mvq->desc_addr;
1999 	ri->device_addr = mvq->device_addr;
2000 	ri->driver_addr = mvq->driver_addr;
2001 	ri->restore = true;
2002 	return 0;
2003 }
2004 
2005 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2006 {
2007 	int i;
2008 
2009 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2010 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2011 		save_channel_info(ndev, &ndev->vqs[i]);
2012 	}
2013 	return 0;
2014 }
2015 
2016 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2017 {
2018 	int i;
2019 
2020 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2021 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2022 }
2023 
2024 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2025 {
2026 	struct mlx5_vdpa_virtqueue *mvq;
2027 	struct mlx5_vq_restore_info *ri;
2028 	int i;
2029 
2030 	mlx5_clear_vqs(ndev);
2031 	init_mvqs(ndev);
2032 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2033 		mvq = &ndev->vqs[i];
2034 		ri = &mvq->ri;
2035 		if (!ri->restore)
2036 			continue;
2037 
2038 		mvq->avail_idx = ri->avail_index;
2039 		mvq->used_idx = ri->used_index;
2040 		mvq->ready = ri->ready;
2041 		mvq->num_ent = ri->num_ent;
2042 		mvq->desc_addr = ri->desc_addr;
2043 		mvq->device_addr = ri->device_addr;
2044 		mvq->driver_addr = ri->driver_addr;
2045 	}
2046 }
2047 
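/* The memory map backing the device changed: suspend the data VQs, save their
 * state, tear down the driver resources and the old MR, build a new MR from
 * the new iotlb and, if the device is DRIVER_OK, restore the saved VQ state
 * and re-create the driver resources.
 */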
2048 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
2049 {
2050 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2051 	int err;
2052 
2053 	suspend_vqs(ndev);
2054 	err = save_channels_info(ndev);
2055 	if (err)
2056 		goto err_mr;
2057 
2058 	teardown_driver(ndev);
2059 	mlx5_vdpa_destroy_mr(mvdev);
2060 	err = mlx5_vdpa_create_mr(mvdev, iotlb);
2061 	if (err)
2062 		goto err_mr;
2063 
2064 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2065 		return 0;
2066 
2067 	restore_channels_info(ndev);
2068 	err = setup_driver(mvdev);
2069 	if (err)
2070 		goto err_setup;
2071 
2072 	return 0;
2073 
2074 err_setup:
2075 	mlx5_vdpa_destroy_mr(mvdev);
2076 err_mr:
2077 	return err;
2078 }
2079 
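/* Build the datapath under reslock: hardware virtqueue objects first, then the
 * RQT, the TIR and finally the flow rule that steers traffic into the TIR.
 */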
2080 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2081 {
2082 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2083 	int err;
2084 
2085 	mutex_lock(&ndev->reslock);
2086 	if (ndev->setup) {
2087 		mlx5_vdpa_warn(mvdev, "setup_driver called but driver is already set up\n");
2088 		err = 0;
2089 		goto out;
2090 	}
2091 	err = setup_virtqueues(mvdev);
2092 	if (err) {
2093 		mlx5_vdpa_warn(mvdev, "setup_virtqueues failed\n");
2094 		goto out;
2095 	}
2096 
2097 	err = create_rqt(ndev);
2098 	if (err) {
2099 		mlx5_vdpa_warn(mvdev, "create_rqt failed\n");
2100 		goto err_rqt;
2101 	}
2102 
2103 	err = create_tir(ndev);
2104 	if (err) {
2105 		mlx5_vdpa_warn(mvdev, "create_tir failed\n");
2106 		goto err_tir;
2107 	}
2108 
2109 	err = add_fwd_to_tir(ndev);
2110 	if (err) {
2111 		mlx5_vdpa_warn(mvdev, "add_fwd_to_tir failed\n");
2112 		goto err_fwd;
2113 	}
2114 	ndev->setup = true;
2115 	mutex_unlock(&ndev->reslock);
2116 
2117 	return 0;
2118 
2119 err_fwd:
2120 	destroy_tir(ndev);
2121 err_tir:
2122 	destroy_rqt(ndev);
2123 err_rqt:
2124 	teardown_virtqueues(ndev);
2125 out:
2126 	mutex_unlock(&ndev->reslock);
2127 	return err;
2128 }
2129 
2130 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2131 {
2132 	mutex_lock(&ndev->reslock);
2133 	if (!ndev->setup)
2134 		goto out;
2135 
2136 	remove_fwd_to_tir(ndev);
2137 	destroy_tir(ndev);
2138 	destroy_rqt(ndev);
2139 	teardown_virtqueues(ndev);
2140 	ndev->setup = false;
2141 out:
2142 	mutex_unlock(&ndev->reslock);
2143 }
2144 
2145 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2146 {
2147 	int i;
2148 
2149 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2150 		ndev->vqs[i].ready = false;
2151 
2152 	ndev->mvdev.cvq.ready = false;
2153 }
2154 
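/* Only the DRIVER_OK transition is acted upon: setting it builds the datapath
 * via setup_driver(); clearing it through this callback is unexpected and only
 * triggers a warning (a full reset goes through mlx5_vdpa_reset()).
 */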
2155 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2156 {
2157 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2158 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2159 	int err;
2160 
2161 	print_status(mvdev, status, true);
2162 
2163 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2164 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2165 			err = setup_driver(mvdev);
2166 			if (err) {
2167 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2168 				goto err_setup;
2169 			}
2170 		} else {
2171 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2172 			return;
2173 		}
2174 	}
2175 
2176 	ndev->mvdev.status = status;
2177 	return;
2178 
2179 err_setup:
2180 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2181 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2182 }
2183 
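/* Full device reset: tear down the datapath, clear the VQ ready state, destroy
 * the MR and all negotiated state, and bump the config generation. A default
 * MR is re-created right away when the firmware supports uid 0 umems.
 */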
2184 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2185 {
2186 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2187 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2188 
2189 	print_status(mvdev, 0, true);
2190 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2191 	teardown_driver(ndev);
2192 	clear_vqs_ready(ndev);
2193 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2194 	ndev->mvdev.status = 0;
2195 	ndev->mvdev.mlx_features = 0;
2196 	memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs));
2197 	ndev->mvdev.actual_features = 0;
2198 	++mvdev->generation;
2199 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2200 		if (mlx5_vdpa_create_mr(mvdev, NULL))
2201 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2202 	}
2203 
2204 	return 0;
2205 }
2206 
2207 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
2208 {
2209 	return sizeof(struct virtio_net_config);
2210 }
2211 
2212 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2213 				 unsigned int len)
2214 {
2215 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2216 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2217 
2218 	if (offset + len <= sizeof(struct virtio_net_config))
2219 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2220 }
2221 
2222 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
2223 				 unsigned int len)
2224 {
2225 	/* not supported */
2226 }
2227 
2228 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2229 {
2230 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2231 
2232 	return mvdev->generation;
2233 }
2234 
2235 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
2236 {
2237 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2238 	bool change_map;
2239 	int err;
2240 
2241 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
2242 	if (err) {
2243 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2244 		return err;
2245 	}
2246 
2247 	if (change_map)
2248 		return mlx5_vdpa_change_map(mvdev, iotlb);
2249 
2250 	return 0;
2251 }
2252 
2253 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2254 {
2255 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2256 	struct mlx5_core_dev *pfmdev;
2257 	struct mlx5_vdpa_net *ndev;
2258 
2259 	ndev = to_mlx5_vdpa_ndev(mvdev);
2260 
2261 	free_resources(ndev);
2262 	mlx5_vdpa_destroy_mr(mvdev);
2263 	if (!is_zero_ether_addr(ndev->config.mac)) {
2264 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2265 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2266 	}
2267 	mlx5_vdpa_free_resources(&ndev->mvdev);
2268 	mutex_destroy(&ndev->reslock);
2269 }
2270 
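/* Expose the device's kick (doorbell) page so the guest can notify the
 * hardware directly, when it is safe to map that page.
 */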
2271 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2272 {
2273 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2274 	struct vdpa_notification_area ret = {};
2275 	struct mlx5_vdpa_net *ndev;
2276 	phys_addr_t addr;
2277 
2278 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2279 		return ret;
2280 
2281 	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
2282 	 * notification, to avoid the risk of mapping a page that contains the
2283 	 * BARs of more than one SF.
2284 	 */
2285 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2286 		return ret;
2287 
2288 	ndev = to_mlx5_vdpa_ndev(mvdev);
2289 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2290 	ret.addr = addr;
2291 	ret.size = PAGE_SIZE;
2292 	return ret;
2293 }
2294 
2295 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
2296 {
2297 	return -EOPNOTSUPP;
2298 }
2299 
2300 static const struct vdpa_config_ops mlx5_vdpa_ops = {
2301 	.set_vq_address = mlx5_vdpa_set_vq_address,
2302 	.set_vq_num = mlx5_vdpa_set_vq_num,
2303 	.kick_vq = mlx5_vdpa_kick_vq,
2304 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
2305 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
2306 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
2307 	.set_vq_state = mlx5_vdpa_set_vq_state,
2308 	.get_vq_state = mlx5_vdpa_get_vq_state,
2309 	.get_vq_notification = mlx5_get_vq_notification,
2310 	.get_vq_irq = mlx5_get_vq_irq,
2311 	.get_vq_align = mlx5_vdpa_get_vq_align,
2312 	.get_features = mlx5_vdpa_get_features,
2313 	.set_features = mlx5_vdpa_set_features,
2314 	.set_config_cb = mlx5_vdpa_set_config_cb,
2315 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
2316 	.get_device_id = mlx5_vdpa_get_device_id,
2317 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
2318 	.get_status = mlx5_vdpa_get_status,
2319 	.set_status = mlx5_vdpa_set_status,
2320 	.reset = mlx5_vdpa_reset,
2321 	.get_config_size = mlx5_vdpa_get_config_size,
2322 	.get_config = mlx5_vdpa_get_config,
2323 	.set_config = mlx5_vdpa_set_config,
2324 	.get_generation = mlx5_vdpa_get_generation,
2325 	.set_map = mlx5_vdpa_set_map,
2326 	.free = mlx5_vdpa_free,
2327 };
2328 
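/* Report the MTU as seen by virtio: the NIC port MTU minus the Ethernet
 * framing overhead accounted for by MLX5V_ETH_HARD_MTU.
 */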
2329 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
2330 {
2331 	u16 hw_mtu;
2332 	int err;
2333 
2334 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
2335 	if (err)
2336 		return err;
2337 
2338 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
2339 	return 0;
2340 }
2341 
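/* Allocate long-lived resources for the net device: a transport domain and a
 * TIS. res->valid guards against double allocation and double free.
 */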
2342 static int alloc_resources(struct mlx5_vdpa_net *ndev)
2343 {
2344 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2345 	int err;
2346 
2347 	if (res->valid) {
2348 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
2349 		return -EEXIST;
2350 	}
2351 
2352 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
2353 	if (err)
2354 		return err;
2355 
2356 	err = create_tis(ndev);
2357 	if (err)
2358 		goto err_tis;
2359 
2360 	res->valid = true;
2361 
2362 	return 0;
2363 
2364 err_tis:
2365 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2366 	return err;
2367 }
2368 
2369 static void free_resources(struct mlx5_vdpa_net *ndev)
2370 {
2371 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2372 
2373 	if (!res->valid)
2374 		return;
2375 
2376 	destroy_tis(ndev);
2377 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2378 	res->valid = false;
2379 }
2380 
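/* Reset the per-VQ bookkeeping (everything up to the restore info) and set the
 * index and owner back-pointers. Entries covered by the first loop also mark
 * their firmware-side QP (fwqp.fw = true).
 */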
2381 static void init_mvqs(struct mlx5_vdpa_net *ndev)
2382 {
2383 	struct mlx5_vdpa_virtqueue *mvq;
2384 	int i;
2385 
2386 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
2387 		mvq = &ndev->vqs[i];
2388 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2389 		mvq->index = i;
2390 		mvq->ndev = ndev;
2391 		mvq->fwqp.fw = true;
2392 	}
2393 	for (; i < ndev->mvdev.max_vqs; i++) {
2394 		mvq = &ndev->vqs[i];
2395 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2396 		mvq->index = i;
2397 		mvq->ndev = ndev;
2398 	}
2399 }
2400 
2401 struct mlx5_vdpa_mgmtdev {
2402 	struct vdpa_mgmt_dev mgtdev;
2403 	struct mlx5_adev *madev;
2404 	struct mlx5_vdpa_net *ndev;
2405 };
2406 
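/* vdpa management op: create a single net vdpa device on top of the mlx5 core
 * device. Verifies split-virtqueue support, allocates the device, adds a
 * non-zero MAC to the MPFS table, sets up core resources and the control-VQ
 * workqueue, and finally registers the device with the vdpa bus.
 */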
2407 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
2408 {
2409 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2410 	struct virtio_net_config *config;
2411 	struct mlx5_core_dev *pfmdev;
2412 	struct mlx5_vdpa_dev *mvdev;
2413 	struct mlx5_vdpa_net *ndev;
2414 	struct mlx5_core_dev *mdev;
2415 	u32 max_vqs;
2416 	int err;
2417 
2418 	if (mgtdev->ndev)
2419 		return -ENOSPC;
2420 
2421 	mdev = mgtdev->madev->mdev;
2422 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
2423 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
2424 		dev_warn(mdev->device, "missing support for split virtqueues\n");
2425 		return -EOPNOTSUPP;
2426 	}
2427 
2428 	/* Reserve one virtqueue for the control virtqueue, should we require it */
2429 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
2430 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
2431 
2432 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2433 				 name, false);
2434 	if (IS_ERR(ndev))
2435 		return PTR_ERR(ndev);
2436 
2437 	ndev->mvdev.max_vqs = max_vqs;
2438 	mvdev = &ndev->mvdev;
2439 	mvdev->mdev = mdev;
2440 	init_mvqs(ndev);
2441 	mutex_init(&ndev->reslock);
2442 	config = &ndev->config;
2443 	err = query_mtu(mdev, &ndev->mtu);
2444 	if (err)
2445 		goto err_mtu;
2446 
2447 	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2448 	if (err)
2449 		goto err_mtu;
2450 
2451 	if (!is_zero_ether_addr(config->mac)) {
2452 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2453 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2454 		if (err)
2455 			goto err_mtu;
2456 
2457 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2458 	}
2459 
2460 	config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs));
2461 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
2462 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2463 	if (err)
2464 		goto err_mpfs;
2465 
2466 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2467 		err = mlx5_vdpa_create_mr(mvdev, NULL);
2468 		if (err)
2469 			goto err_res;
2470 	}
2471 
2472 	err = alloc_resources(ndev);
2473 	if (err)
2474 		goto err_mr;
2475 
2476 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_ctrl_wq");
2477 	if (!mvdev->wq) {
2478 		err = -ENOMEM;
2479 		goto err_res2;
2480 	}
2481 
2482 	ndev->cur_num_vqs = 2 * mlx5_vdpa_max_qps(max_vqs);
2483 	mvdev->vdev.mdev = &mgtdev->mgtdev;
2484 	err = _vdpa_register_device(&mvdev->vdev, ndev->cur_num_vqs + 1);
2485 	if (err)
2486 		goto err_reg;
2487 
2488 	mgtdev->ndev = ndev;
2489 	return 0;
2490 
2491 err_reg:
2492 	destroy_workqueue(mvdev->wq);
2493 err_res2:
2494 	free_resources(ndev);
2495 err_mr:
2496 	mlx5_vdpa_destroy_mr(mvdev);
2497 err_res:
2498 	mlx5_vdpa_free_resources(&ndev->mvdev);
2499 err_mpfs:
2500 	if (!is_zero_ether_addr(config->mac))
2501 		mlx5_mpfs_del_mac(pfmdev, config->mac);
2502 err_mtu:
2503 	mutex_destroy(&ndev->reslock);
2504 	put_device(&mvdev->vdev.dev);
2505 	return err;
2506 }
2507 
2508 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
2509 {
2510 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2511 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
2512 
2513 	destroy_workqueue(mvdev->wq);
2514 	_vdpa_unregister_device(dev);
2515 	mgtdev->ndev = NULL;
2516 }
2517 
2518 static const struct vdpa_mgmtdev_ops mdev_ops = {
2519 	.dev_add = mlx5_vdpa_dev_add,
2520 	.dev_del = mlx5_vdpa_dev_del,
2521 };
2522 
2523 static struct virtio_device_id id_table[] = {
2524 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2525 	{ 0 },
2526 };
2527 
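/* Auxiliary bus probe: allocate the management device wrapper and register it
 * with the vdpa core so that devices can later be created via
 * mlx5_vdpa_dev_add().
 */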
2528 static int mlx5v_probe(struct auxiliary_device *adev,
2529 		       const struct auxiliary_device_id *id)
2530 
2531 {
2532 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
2533 	struct mlx5_core_dev *mdev = madev->mdev;
2534 	struct mlx5_vdpa_mgmtdev *mgtdev;
2535 	int err;
2536 
2537 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
2538 	if (!mgtdev)
2539 		return -ENOMEM;
2540 
2541 	mgtdev->mgtdev.ops = &mdev_ops;
2542 	mgtdev->mgtdev.device = mdev->device;
2543 	mgtdev->mgtdev.id_table = id_table;
2544 	mgtdev->madev = madev;
2545 
2546 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
2547 	if (err)
2548 		goto reg_err;
2549 
2550 	dev_set_drvdata(&adev->dev, mgtdev);
2551 
2552 	return 0;
2553 
2554 reg_err:
2555 	kfree(mgtdev);
2556 	return err;
2557 }
2558 
2559 static void mlx5v_remove(struct auxiliary_device *adev)
2560 {
2561 	struct mlx5_vdpa_mgmtdev *mgtdev;
2562 
2563 	mgtdev = dev_get_drvdata(&adev->dev);
2564 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
2565 	kfree(mgtdev);
2566 }
2567 
2568 static const struct auxiliary_device_id mlx5v_id_table[] = {
2569 	{ .name = MLX5_ADEV_NAME ".vnet", },
2570 	{},
2571 };
2572 
2573 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2574 
2575 static struct auxiliary_driver mlx5v_driver = {
2576 	.name = "vnet",
2577 	.probe = mlx5v_probe,
2578 	.remove = mlx5v_remove,
2579 	.id_table = mlx5v_id_table,
2580 };
2581 
2582 module_auxiliary_driver(mlx5v_driver);
2583