xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision e802ca75)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <linux/virtio_config.h>
11 #include <linux/auxiliary_bus.h>
12 #include <linux/mlx5/cq.h>
13 #include <linux/mlx5/qp.h>
14 #include <linux/mlx5/device.h>
15 #include <linux/mlx5/driver.h>
16 #include <linux/mlx5/vport.h>
17 #include <linux/mlx5/fs.h>
18 #include <linux/mlx5/mlx5_ifc_vdpa.h>
19 #include <linux/mlx5/mpfs.h>
20 #include "mlx5_vdpa.h"
21 
22 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
23 MODULE_DESCRIPTION("Mellanox VDPA driver");
24 MODULE_LICENSE("Dual BSD/GPL");
25 
26 #define to_mlx5_vdpa_ndev(__mvdev)                                             \
27 	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
28 #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
29 
30 #define VALID_FEATURES_MASK                                                                        \
31 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
32 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
33 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
34 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
35 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
36 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
37 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
38 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
39 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
40 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
41 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
42 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
43 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
44 
45 #define VALID_STATUS_MASK                                                                          \
46 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
47 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
48 
49 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
50 
51 struct mlx5_vdpa_net_resources {
52 	u32 tisn;
53 	u32 tdn;
54 	u32 tirn;
55 	u32 rqtn;
56 	bool valid;
57 };
58 
59 struct mlx5_vdpa_cq_buf {
60 	struct mlx5_frag_buf_ctrl fbc;
61 	struct mlx5_frag_buf frag_buf;
62 	int cqe_size;
63 	int nent;
64 };
65 
66 struct mlx5_vdpa_cq {
67 	struct mlx5_core_cq mcq;
68 	struct mlx5_vdpa_cq_buf buf;
69 	struct mlx5_db db;
70 	int cqe;
71 };
72 
73 struct mlx5_vdpa_umem {
74 	struct mlx5_frag_buf_ctrl fbc;
75 	struct mlx5_frag_buf frag_buf;
76 	int size;
77 	u32 id;
78 };
79 
80 struct mlx5_vdpa_qp {
81 	struct mlx5_core_qp mqp;
82 	struct mlx5_frag_buf frag_buf;
83 	struct mlx5_db db;
84 	u16 head;
85 	bool fw;
86 };
87 
88 struct mlx5_vq_restore_info {
89 	u32 num_ent;
90 	u64 desc_addr;
91 	u64 device_addr;
92 	u64 driver_addr;
93 	u16 avail_index;
94 	u16 used_index;
95 	bool ready;
96 	bool restore;
97 };
98 
99 struct mlx5_vdpa_virtqueue {
100 	bool ready;
101 	u64 desc_addr;
102 	u64 device_addr;
103 	u64 driver_addr;
104 	u32 num_ent;
105 
106 	/* Resources for implementing the notification channel from the device
107 	 * to the driver. fwqp is the firmware end of an RC connection; the
108 	 * other end is vqqp, used by the driver. cq is where completions are
109 	 * reported.
110 	 */
111 	struct mlx5_vdpa_cq cq;
112 	struct mlx5_vdpa_qp fwqp;
113 	struct mlx5_vdpa_qp vqqp;
114 
115 	/* umem resources are required for the virtqueue operation. Their use
116 	 * is internal and they must be provided by the driver.
117 	 */
118 	struct mlx5_vdpa_umem umem1;
119 	struct mlx5_vdpa_umem umem2;
120 	struct mlx5_vdpa_umem umem3;
121 
122 	bool initialized;
123 	int index;
124 	u32 virtq_id;
125 	struct mlx5_vdpa_net *ndev;
126 	u16 avail_idx;
127 	u16 used_idx;
128 	int fw_state;
129 
130 	/* keep last in the struct */
131 	struct mlx5_vq_restore_info ri;
132 };
133 
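/* A virtqueue index is valid only within the range implied by the negotiated
 * features: two data VQs when VIRTIO_NET_F_MQ is not negotiated, plus one
 * control VQ when VIRTIO_NET_F_CTRL_VQ is negotiated; with MQ the index may
 * go up to max_idx.
 */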
134 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
135 {
136 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
137 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
138 			return idx < 2;
139 		else
140 			return idx < 3;
141 	}
142 
143 	return idx <= mvdev->max_idx;
144 }
145 
146 struct mlx5_vdpa_net {
147 	struct mlx5_vdpa_dev mvdev;
148 	struct mlx5_vdpa_net_resources res;
149 	struct virtio_net_config config;
150 	struct mlx5_vdpa_virtqueue *vqs;
151 	struct vdpa_callback *event_cbs;
152 
153 	/* Serialize vq resource creation and destruction. This is required
154 	 * since the memory map might change and we need to destroy and create
155 	 * resources while the driver is operational.
156 	 */
157 	struct mutex reslock;
158 	struct mlx5_flow_table *rxft;
159 	struct mlx5_fc *rx_counter;
160 	struct mlx5_flow_handle *rx_rule_ucast;
161 	struct mlx5_flow_handle *rx_rule_mcast;
162 	bool setup;
163 	u32 cur_num_vqs;
164 	struct notifier_block nb;
165 	struct vdpa_callback config_cb;
166 	struct mlx5_vdpa_wq_ent cvq_ent;
167 };
168 
169 static void free_resources(struct mlx5_vdpa_net *ndev);
170 static void init_mvqs(struct mlx5_vdpa_net *ndev);
171 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
172 static void teardown_driver(struct mlx5_vdpa_net *ndev);
173 
174 static bool mlx5_vdpa_debug;
175 
176 #define MLX5_CVQ_MAX_ENT 16
177 
178 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
179 	do {                                                                                       \
180 		if (features & BIT_ULL(_feature))                                                  \
181 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
182 	} while (0)
183 
184 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
185 	do {                                                                                       \
186 		if (status & (_status))                                                            \
187 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
188 	} while (0)
189 
190 /* TODO: cross-endian support */
191 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
192 {
193 	return virtio_legacy_is_little_endian() ||
194 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
195 }
196 
197 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
198 {
199 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
200 }
201 
202 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
203 {
204 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
205 }
206 
207 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
208 {
209 	return max_vqs / 2;
210 }
211 
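/* The control VQ occupies the index right after the data VQs: index 2 when
 * VIRTIO_NET_F_MQ is not negotiated, otherwise twice the maximum number of
 * queue pairs.
 */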
212 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
213 {
214 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
215 		return 2;
216 
217 	return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs);
218 }
219 
220 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
221 {
222 	return idx == ctrl_vq_idx(mvdev);
223 }
224 
225 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
226 {
227 	if (status & ~VALID_STATUS_MASK)
228 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
229 			       status & ~VALID_STATUS_MASK);
230 
231 	if (!mlx5_vdpa_debug)
232 		return;
233 
234 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
235 	if (set && !status) {
236 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
237 		return;
238 	}
239 
240 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
241 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
242 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
243 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
244 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
245 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
246 }
247 
248 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
249 {
250 	if (features & ~VALID_FEATURES_MASK)
251 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
252 			       features & ~VALID_FEATURES_MASK);
253 
254 	if (!mlx5_vdpa_debug)
255 		return;
256 
257 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
258 	if (!features)
259 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
260 
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
278 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
279 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
280 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
281 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
282 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
283 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
284 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
285 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
286 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
287 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
288 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
289 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
290 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
291 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
292 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
293 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
294 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
295 }
296 
297 static int create_tis(struct mlx5_vdpa_net *ndev)
298 {
299 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
300 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
301 	void *tisc;
302 	int err;
303 
304 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
305 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
306 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
307 	if (err)
308 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
309 
310 	return err;
311 }
312 
313 static void destroy_tis(struct mlx5_vdpa_net *ndev)
314 {
315 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
316 }
317 
318 #define MLX5_VDPA_CQE_SIZE 64
319 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
320 
321 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
322 {
323 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
324 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
325 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
326 	int err;
327 
328 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
329 				       ndev->mvdev.mdev->priv.numa_node);
330 	if (err)
331 		return err;
332 
333 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
334 
335 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
336 	buf->nent = nent;
337 
338 	return 0;
339 }
340 
341 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
342 {
343 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
344 
345 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
346 					ndev->mvdev.mdev->priv.numa_node);
347 }
348 
349 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
350 {
351 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
352 }
353 
354 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
355 {
356 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
357 }
358 
359 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
360 {
361 	struct mlx5_cqe64 *cqe64;
362 	void *cqe;
363 	int i;
364 
365 	for (i = 0; i < buf->nent; i++) {
366 		cqe = get_cqe(vcq, i);
367 		cqe64 = cqe;
368 		cqe64->op_own = MLX5_CQE_INVALID << 4;
369 	}
370 }
371 
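/* Return CQE n if it is owned by software (written by hardware and not yet
 * consumed), otherwise NULL.
 */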
372 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
373 {
374 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
375 
376 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
377 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
378 		return cqe64;
379 
380 	return NULL;
381 }
382 
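/* Make n receive WQEs available to the hardware by advancing the head and
 * updating the RQ doorbell record.
 */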
383 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
384 {
385 	vqp->head += n;
386 	vqp->db.db[0] = cpu_to_be32(vqp->head);
387 }
388 
389 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
390 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
391 {
392 	struct mlx5_vdpa_qp *vqp;
393 	__be64 *pas;
394 	void *qpc;
395 
396 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
397 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
398 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
399 	if (vqp->fw) {
400 		/* Firmware QP is allocated by the driver for the firmware's
401 		 * use, so we can skip some of the parameters; they will be chosen by the firmware.
402 		 */
403 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
404 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
405 		MLX5_SET(qpc, qpc, no_sq, 1);
406 		return;
407 	}
408 
409 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
410 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
411 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
412 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
413 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
414 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
415 	MLX5_SET(qpc, qpc, no_sq, 1);
416 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
417 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
418 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
419 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
420 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
421 }
422 
423 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
424 {
425 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
426 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
427 					ndev->mvdev.mdev->priv.numa_node);
428 }
429 
430 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
431 {
432 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
433 }
434 
435 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
436 		     struct mlx5_vdpa_qp *vqp)
437 {
438 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
439 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
440 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
441 	void *qpc;
442 	void *in;
443 	int err;
444 
445 	if (!vqp->fw) {
446 		vqp = &mvq->vqqp;
447 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
448 		if (err)
449 			return err;
450 
451 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
452 		if (err)
453 			goto err_db;
454 		inlen += vqp->frag_buf.npages * sizeof(__be64);
455 	}
456 
457 	in = kzalloc(inlen, GFP_KERNEL);
458 	if (!in) {
459 		err = -ENOMEM;
460 		goto err_kzalloc;
461 	}
462 
463 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
464 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
465 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
466 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
467 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
468 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
469 	if (!vqp->fw)
470 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
471 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
472 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
473 	kfree(in);
474 	if (err)
475 		goto err_kzalloc;
476 
477 	vqp->mqp.uid = ndev->mvdev.res.uid;
478 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
479 
480 	if (!vqp->fw)
481 		rx_post(vqp, mvq->num_ent);
482 
483 	return 0;
484 
485 err_kzalloc:
486 	if (!vqp->fw)
487 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
488 err_db:
489 	if (!vqp->fw)
490 		rq_buf_free(ndev, vqp);
491 
492 	return err;
493 }
494 
495 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
496 {
497 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
498 
499 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
500 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
501 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
502 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
503 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
504 	if (!vqp->fw) {
505 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
506 		rq_buf_free(ndev, vqp);
507 	}
508 }
509 
510 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
511 {
512 	return get_sw_cqe(cq, cq->mcq.cons_index);
513 }
514 
515 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
516 {
517 	struct mlx5_cqe64 *cqe64;
518 
519 	cqe64 = next_cqe_sw(vcq);
520 	if (!cqe64)
521 		return -EAGAIN;
522 
523 	vcq->mcq.cons_index++;
524 	return 0;
525 }
526 
527 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
528 {
529 	struct mlx5_vdpa_net *ndev = mvq->ndev;
530 	struct vdpa_callback *event_cb;
531 
532 	event_cb = &ndev->event_cbs[mvq->index];
533 	mlx5_cq_set_ci(&mvq->cq.mcq);
534 
535 	/* Make sure the CQ consumer update is visible to the hardware before updating
536 	 * RX doorbell record.
537 	 */
538 	dma_wmb();
539 	rx_post(&mvq->vqqp, num);
540 	if (event_cb->callback)
541 		event_cb->callback(event_cb->private);
542 }
543 
544 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
545 {
546 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
547 	struct mlx5_vdpa_net *ndev = mvq->ndev;
548 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
549 	int num = 0;
550 
551 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
552 		num++;
553 		if (num > mvq->num_ent / 2) {
554 			/* If completions keep coming while we poll, we want to
555 			 * let the hardware know that we consumed them by
556 			 * know about this so it passes it on to the virtio driver
557 			 * in the guest.
558 			 * on the guest.
559 			 */
560 			mlx5_vdpa_handle_completions(mvq, num);
561 			num = 0;
562 		}
563 	}
564 
565 	if (num)
566 		mlx5_vdpa_handle_completions(mvq, num);
567 
568 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
569 }
570 
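/* Create the completion queue used for notifications of virtqueue idx. The
 * CQ is backed by a fragmented buffer, attached to EQ vector 0 and armed
 * before returning.
 */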
571 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
572 {
573 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
574 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
575 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
576 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
577 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
578 	__be64 *pas;
579 	int inlen;
580 	void *cqc;
581 	void *in;
582 	int err;
583 	int eqn;
584 
585 	err = mlx5_db_alloc(mdev, &vcq->db);
586 	if (err)
587 		return err;
588 
589 	vcq->mcq.set_ci_db = vcq->db.db;
590 	vcq->mcq.arm_db = vcq->db.db + 1;
591 	vcq->mcq.cqe_sz = 64;
592 
593 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
594 	if (err)
595 		goto err_db;
596 
597 	cq_frag_buf_init(vcq, &vcq->buf);
598 
599 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
600 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
601 	in = kzalloc(inlen, GFP_KERNEL);
602 	if (!in) {
603 		err = -ENOMEM;
604 		goto err_vzalloc;
605 	}
606 
607 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
608 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
609 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
610 
611 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
612 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
613 
614 	/* Use vector 0 by default. Consider adding code to choose the least used
615 	 * vector.
616 	 */
617 	err = mlx5_vector2eqn(mdev, 0, &eqn);
618 	if (err)
619 		goto err_vec;
620 
621 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
622 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
623 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
624 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
625 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
626 
627 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
628 	if (err)
629 		goto err_vec;
630 
631 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
632 	vcq->cqe = num_ent;
633 	vcq->mcq.set_ci_db = vcq->db.db;
634 	vcq->mcq.arm_db = vcq->db.db + 1;
635 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
636 	kfree(in);
637 	return 0;
638 
639 err_vec:
640 	kfree(in);
641 err_vzalloc:
642 	cq_frag_buf_free(ndev, &vcq->buf);
643 err_db:
644 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
645 	return err;
646 }
647 
648 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
649 {
650 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
651 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
652 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
653 
654 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
655 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
656 		return;
657 	}
658 	cq_frag_buf_free(ndev, &vcq->buf);
659 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
660 }
661 
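/* Compute the size of umem buffer 'num' (1..3) from the device capabilities:
 * size = buffer_param_a * queue_size + buffer_param_b.
 */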
662 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
663 			  struct mlx5_vdpa_umem **umemp)
664 {
665 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
666 	int p_a;
667 	int p_b;
668 
669 	switch (num) {
670 	case 1:
671 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
672 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
673 		*umemp = &mvq->umem1;
674 		break;
675 	case 2:
676 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
677 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
678 		*umemp = &mvq->umem2;
679 		break;
680 	case 3:
681 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
682 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
683 		*umemp = &mvq->umem3;
684 		break;
685 	}
686 	(*umemp)->size = p_a * mvq->num_ent + p_b;
687 }
688 
689 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
690 {
691 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
692 }
693 
694 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
695 {
696 	int inlen;
697 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
698 	void *um;
699 	void *in;
700 	int err;
701 	__be64 *pas;
702 	struct mlx5_vdpa_umem *umem;
703 
704 	set_umem_size(ndev, mvq, num, &umem);
705 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
706 	if (err)
707 		return err;
708 
709 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
710 
711 	in = kzalloc(inlen, GFP_KERNEL);
712 	if (!in) {
713 		err = -ENOMEM;
714 		goto err_in;
715 	}
716 
717 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
718 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
719 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
720 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
721 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
722 
723 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
724 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
725 
726 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
727 	if (err) {
728 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
729 		goto err_cmd;
730 	}
731 
732 	kfree(in);
733 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
734 
735 	return 0;
736 
737 err_cmd:
738 	kfree(in);
739 err_in:
740 	umem_frag_buf_free(ndev, umem);
741 	return err;
742 }
743 
744 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
745 {
746 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
747 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
748 	struct mlx5_vdpa_umem *umem;
749 
750 	switch (num) {
751 	case 1:
752 		umem = &mvq->umem1;
753 		break;
754 	case 2:
755 		umem = &mvq->umem2;
756 		break;
757 	case 3:
758 		umem = &mvq->umem3;
759 		break;
760 	}
761 
762 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
763 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
764 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
765 		return;
766 
767 	umem_frag_buf_free(ndev, umem);
768 }
769 
770 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
771 {
772 	int num;
773 	int err;
774 
775 	for (num = 1; num <= 3; num++) {
776 		err = create_umem(ndev, mvq, num);
777 		if (err)
778 			goto err_umem;
779 	}
780 	return 0;
781 
782 err_umem:
783 	for (num--; num > 0; num--)
784 		umem_destroy(ndev, mvq, num);
785 
786 	return err;
787 }
788 
789 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
790 {
791 	int num;
792 
793 	for (num = 3; num > 0; num--)
794 		umem_destroy(ndev, mvq, num);
795 }
796 
797 static int get_queue_type(struct mlx5_vdpa_net *ndev)
798 {
799 	u32 type_mask;
800 
801 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
802 
803 	/* prefer split queue */
804 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
805 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
806 
807 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
808 
809 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
810 }
811 
812 static bool vq_is_tx(u16 idx)
813 {
814 	return idx % 2;
815 }
816 
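/* Translate the relevant negotiated virtio-net feature bits into the layout
 * of the virtqueue object's queue_feature_bit_mask_12_3 field.
 */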
817 static u16 get_features_12_3(u64 features)
818 {
819 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
820 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
821 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
822 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
823 }
824 
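/* Create the firmware virtqueue object, wiring it to the event QP, the three
 * umems and the memory key covering the guest mappings.
 */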
825 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
826 {
827 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
828 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
829 	void *obj_context;
830 	void *cmd_hdr;
831 	void *vq_ctx;
832 	void *in;
833 	int err;
834 
835 	err = umems_create(ndev, mvq);
836 	if (err)
837 		return err;
838 
839 	in = kzalloc(inlen, GFP_KERNEL);
840 	if (!in) {
841 		err = -ENOMEM;
842 		goto err_alloc;
843 	}
844 
845 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
846 
847 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
848 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
849 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
850 
851 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
852 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
853 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
854 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
855 		 get_features_12_3(ndev->mvdev.actual_features));
856 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
857 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
858 
859 	if (vq_is_tx(mvq->index))
860 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
861 
862 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
863 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
864 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
865 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
866 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
867 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
868 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
869 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
870 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
871 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
872 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
873 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
874 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
875 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
876 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
877 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
878 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
879 
880 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
881 	if (err)
882 		goto err_cmd;
883 
884 	kfree(in);
885 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
886 
887 	return 0;
888 
889 err_cmd:
890 	kfree(in);
891 err_alloc:
892 	umems_destroy(ndev, mvq);
893 	return err;
894 }
895 
896 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
897 {
898 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
899 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
900 
901 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
902 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
903 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
904 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
905 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
906 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
907 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
908 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
909 		return;
910 	}
911 	umems_destroy(ndev, mvq);
912 }
913 
914 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
915 {
916 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
917 }
918 
919 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
920 {
921 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
922 }
923 
924 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
925 			int *outlen, u32 qpn, u32 rqpn)
926 {
927 	void *qpc;
928 	void *pp;
929 
930 	switch (cmd) {
931 	case MLX5_CMD_OP_2RST_QP:
932 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
933 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
934 		*in = kzalloc(*inlen, GFP_KERNEL);
935 		*out = kzalloc(*outlen, GFP_KERNEL);
936 		if (!*in || !*out)
937 			goto outerr;
938 
939 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
940 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
941 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
942 		break;
943 	case MLX5_CMD_OP_RST2INIT_QP:
944 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
945 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
946 		*in = kzalloc(*inlen, GFP_KERNEL);
947 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
948 		if (!*in || !*out)
949 			goto outerr;
950 
951 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
952 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
953 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
954 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
955 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
956 		MLX5_SET(qpc, qpc, rwe, 1);
957 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
958 		MLX5_SET(ads, pp, vhca_port_num, 1);
959 		break;
960 	case MLX5_CMD_OP_INIT2RTR_QP:
961 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
962 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
963 		*in = kzalloc(*inlen, GFP_KERNEL);
964 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
965 		if (!*in || !*out)
966 			goto outerr;
967 
968 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
969 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
970 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
971 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
972 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
973 		MLX5_SET(qpc, qpc, log_msg_max, 30);
974 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
975 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
976 		MLX5_SET(ads, pp, fl, 1);
977 		break;
978 	case MLX5_CMD_OP_RTR2RTS_QP:
979 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
980 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
981 		*in = kzalloc(*inlen, GFP_KERNEL);
982 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
983 		if (!*in || !*out)
984 			goto outerr;
985 
986 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
987 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
988 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
989 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
990 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
991 		MLX5_SET(ads, pp, ack_timeout, 14);
992 		MLX5_SET(qpc, qpc, retry_count, 7);
993 		MLX5_SET(qpc, qpc, rnr_retry, 7);
994 		break;
995 	default:
996 		goto outerr_nullify;
997 	}
998 
999 	return;
1000 
1001 outerr:
1002 	kfree(*in);
1003 	kfree(*out);
1004 outerr_nullify:
1005 	*in = NULL;
1006 	*out = NULL;
1007 }
1008 
1009 static void free_inout(void *in, void *out)
1010 {
1011 	kfree(in);
1012 	kfree(out);
1013 }
1014 
1015 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1016  * firmware. The fw argument indicates whether the QP in question is the one used
1017  * by firmware.
1018  */
1019 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1020 {
1021 	int outlen;
1022 	int inlen;
1023 	void *out;
1024 	void *in;
1025 	int err;
1026 
1027 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1028 	if (!in || !out)
1029 		return -ENOMEM;
1030 
1031 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1032 	free_inout(in, out);
1033 	return err;
1034 }
1035 
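/* Walk both QPs of the virtqueue through RST -> INIT -> RTR and finally move
 * the firmware QP to RTS so the two ends of the RC connection can exchange
 * notifications.
 */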
1036 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1037 {
1038 	int err;
1039 
1040 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1041 	if (err)
1042 		return err;
1043 
1044 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1045 	if (err)
1046 		return err;
1047 
1048 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1049 	if (err)
1050 		return err;
1051 
1052 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1053 	if (err)
1054 		return err;
1055 
1056 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1057 	if (err)
1058 		return err;
1059 
1060 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1061 	if (err)
1062 		return err;
1063 
1064 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1065 }
1066 
1067 struct mlx5_virtq_attr {
1068 	u8 state;
1069 	u16 available_index;
1070 	u16 used_index;
1071 };
1072 
1073 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1074 			   struct mlx5_virtq_attr *attr)
1075 {
1076 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1077 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1078 	void *out;
1079 	void *obj_context;
1080 	void *cmd_hdr;
1081 	int err;
1082 
1083 	out = kzalloc(outlen, GFP_KERNEL);
1084 	if (!out)
1085 		return -ENOMEM;
1086 
1087 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1088 
1089 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1090 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1091 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1092 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1093 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1094 	if (err)
1095 		goto err_cmd;
1096 
1097 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1098 	memset(attr, 0, sizeof(*attr));
1099 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1100 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1101 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1102 	kfree(out);
1103 	return 0;
1104 
1105 err_cmd:
1106 	kfree(out);
1107 	return err;
1108 }
1109 
1110 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1111 {
1112 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1113 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1114 	void *obj_context;
1115 	void *cmd_hdr;
1116 	void *in;
1117 	int err;
1118 
1119 	in = kzalloc(inlen, GFP_KERNEL);
1120 	if (!in)
1121 		return -ENOMEM;
1122 
1123 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1124 
1125 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1126 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1127 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1128 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1129 
1130 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1131 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1132 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1133 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1134 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1135 	kfree(in);
1136 	if (!err)
1137 		mvq->fw_state = state;
1138 
1139 	return err;
1140 }
1141 
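/* Create all hardware resources needed by a data virtqueue: the completion
 * queue, the firmware/driver QP pair and the virtqueue object itself, and
 * move it to the ready state if the driver already marked it ready.
 */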
1142 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1143 {
1144 	u16 idx = mvq->index;
1145 	int err;
1146 
1147 	if (!mvq->num_ent)
1148 		return 0;
1149 
1150 	if (mvq->initialized)
1151 		return 0;
1152 
1153 	err = cq_create(ndev, idx, mvq->num_ent);
1154 	if (err)
1155 		return err;
1156 
1157 	err = qp_create(ndev, mvq, &mvq->fwqp);
1158 	if (err)
1159 		goto err_fwqp;
1160 
1161 	err = qp_create(ndev, mvq, &mvq->vqqp);
1162 	if (err)
1163 		goto err_vqqp;
1164 
1165 	err = connect_qps(ndev, mvq);
1166 	if (err)
1167 		goto err_connect;
1168 
1169 	err = create_virtqueue(ndev, mvq);
1170 	if (err)
1171 		goto err_connect;
1172 
1173 	if (mvq->ready) {
1174 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1175 		if (err) {
1176 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1177 				       idx, err);
1178 			goto err_connect;
1179 		}
1180 	}
1181 
1182 	mvq->initialized = true;
1183 	return 0;
1184 
1185 err_connect:
1186 	qp_destroy(ndev, &mvq->vqqp);
1187 err_vqqp:
1188 	qp_destroy(ndev, &mvq->fwqp);
1189 err_fwqp:
1190 	cq_destroy(ndev, idx);
1191 	return err;
1192 }
1193 
1194 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1195 {
1196 	struct mlx5_virtq_attr attr;
1197 
1198 	if (!mvq->initialized)
1199 		return;
1200 
1201 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1202 		return;
1203 
1204 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1205 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1206 
1207 	if (query_virtqueue(ndev, mvq, &attr)) {
1208 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1209 		return;
1210 	}
1211 	mvq->avail_idx = attr.available_index;
1212 	mvq->used_idx = attr.used_index;
1213 }
1214 
1215 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1216 {
1217 	int i;
1218 
1219 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1220 		suspend_vq(ndev, &ndev->vqs[i]);
1221 }
1222 
1223 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1224 {
1225 	if (!mvq->initialized)
1226 		return;
1227 
1228 	suspend_vq(ndev, mvq);
1229 	destroy_virtqueue(ndev, mvq);
1230 	qp_destroy(ndev, &mvq->vqqp);
1231 	qp_destroy(ndev, &mvq->fwqp);
1232 	cq_destroy(ndev, mvq->index);
1233 	mvq->initialized = false;
1234 }
1235 
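/* Create the RQ table that spreads received traffic across the RX virtqueues.
 * Without VIRTIO_NET_F_MQ a single queue pair is used, otherwise the table
 * covers the currently active queue pairs.
 */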
1236 static int create_rqt(struct mlx5_vdpa_net *ndev)
1237 {
1238 	__be32 *list;
1239 	int max_rqt;
1240 	void *rqtc;
1241 	int inlen;
1242 	void *in;
1243 	int i, j;
1244 	int err;
1245 	int num;
1246 
1247 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
1248 		num = 1;
1249 	else
1250 		num = ndev->cur_num_vqs / 2;
1251 
1252 	max_rqt = min_t(int, roundup_pow_of_two(num),
1253 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1254 	if (max_rqt < 1)
1255 		return -EOPNOTSUPP;
1256 
1257 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1258 	in = kzalloc(inlen, GFP_KERNEL);
1259 	if (!in)
1260 		return -ENOMEM;
1261 
1262 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1263 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1264 
1265 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1266 	MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt);
1267 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1268 	for (i = 0, j = 0; i < max_rqt; i++, j += 2)
1269 		list[i] = cpu_to_be32(ndev->vqs[j % (2 * num)].virtq_id);
1270 
1271 	MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt);
1272 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1273 	kfree(in);
1274 	if (err)
1275 		return err;
1276 
1277 	return 0;
1278 }
1279 
1280 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1281 
1282 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1283 {
1284 	__be32 *list;
1285 	int max_rqt;
1286 	void *rqtc;
1287 	int inlen;
1288 	void *in;
1289 	int i, j;
1290 	int err;
1291 
1292 	max_rqt = min_t(int, roundup_pow_of_two(ndev->cur_num_vqs / 2),
1293 			1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1294 	if (max_rqt < 1)
1295 		return -EOPNOTSUPP;
1296 
1297 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
1298 	in = kzalloc(inlen, GFP_KERNEL);
1299 	if (!in)
1300 		return -ENOMEM;
1301 
1302 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1303 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1304 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1305 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1306 
1307 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1308 	for (i = 0, j = 0; i < max_rqt; i++, j += 2)
1309 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1310 
1311 	MLX5_SET(rqtc, rqtc, rqt_actual_size, max_rqt);
1312 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1313 	kfree(in);
1314 	if (err)
1315 		return err;
1316 
1317 	return 0;
1318 }
1319 
1320 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1321 {
1322 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1323 }
1324 
1325 static int create_tir(struct mlx5_vdpa_net *ndev)
1326 {
1327 #define HASH_IP_L4PORTS                                                                            \
1328 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1329 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1330 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1331 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1332 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1333 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1334 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1335 	void *rss_key;
1336 	void *outer;
1337 	void *tirc;
1338 	void *in;
1339 	int err;
1340 
1341 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1342 	if (!in)
1343 		return -ENOMEM;
1344 
1345 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1346 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1347 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1348 
1349 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1350 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1351 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1352 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1353 
1354 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1355 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1356 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1357 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1358 
1359 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1360 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1361 
1362 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1363 	kfree(in);
1364 	return err;
1365 }
1366 
1367 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1368 {
1369 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1370 }
1371 
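/* Install the RX steering rules: a unicast rule matching the device MAC
 * (which also counts packets) and a multicast rule, both forwarding to the
 * TIR.
 */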
1372 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1373 {
1374 	struct mlx5_flow_destination dest[2] = {};
1375 	struct mlx5_flow_table_attr ft_attr = {};
1376 	struct mlx5_flow_act flow_act = {};
1377 	struct mlx5_flow_namespace *ns;
1378 	struct mlx5_flow_spec *spec;
1379 	void *headers_c;
1380 	void *headers_v;
1381 	u8 *dmac_c;
1382 	u8 *dmac_v;
1383 	int err;
1384 
1385 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1386 	if (!spec)
1387 		return -ENOMEM;
1388 
1389 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1390 	ft_attr.max_fte = 2;
1391 	ft_attr.autogroup.max_num_groups = 2;
1392 
1393 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1394 	if (!ns) {
1395 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1396 		err = -EOPNOTSUPP;
1397 		goto err_ns;
1398 	}
1399 
1400 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1401 	if (IS_ERR(ndev->rxft)) {
1402 		err = PTR_ERR(ndev->rxft);
1403 		goto err_ns;
1404 	}
1405 
1406 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1407 	if (IS_ERR(ndev->rx_counter)) {
1408 		err = PTR_ERR(ndev->rx_counter);
1409 		goto err_fc;
1410 	}
1411 
1412 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1413 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1414 	memset(dmac_c, 0xff, ETH_ALEN);
1415 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1416 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1417 	ether_addr_copy(dmac_v, ndev->config.mac);
1418 
1419 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1420 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1421 	dest[0].tir_num = ndev->res.tirn;
1422 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1423 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1424 	ndev->rx_rule_ucast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 2);
1425 
1426 	if (IS_ERR(ndev->rx_rule_ucast)) {
1427 		err = PTR_ERR(ndev->rx_rule_ucast);
1428 		ndev->rx_rule_ucast = NULL;
1429 		goto err_rule_ucast;
1430 	}
1431 
1432 	memset(dmac_c, 0, ETH_ALEN);
1433 	memset(dmac_v, 0, ETH_ALEN);
1434 	dmac_c[0] = 1;
1435 	dmac_v[0] = 1;
1436 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1437 	ndev->rx_rule_mcast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 1);
1438 	if (IS_ERR(ndev->rx_rule_mcast)) {
1439 		err = PTR_ERR(ndev->rx_rule_mcast);
1440 		ndev->rx_rule_mcast = NULL;
1441 		goto err_rule_mcast;
1442 	}
1443 
1444 	kvfree(spec);
1445 	return 0;
1446 
1447 err_rule_mcast:
1448 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1449 	ndev->rx_rule_ucast = NULL;
1450 err_rule_ucast:
1451 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1452 err_fc:
1453 	mlx5_destroy_flow_table(ndev->rxft);
1454 err_ns:
1455 	kvfree(spec);
1456 	return err;
1457 }
1458 
1459 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1460 {
1461 	if (!ndev->rx_rule_ucast)
1462 		return;
1463 
1464 	mlx5_del_flow_rules(ndev->rx_rule_mcast);
1465 	ndev->rx_rule_mcast = NULL;
1466 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1467 	ndev->rx_rule_ucast = NULL;
1468 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1469 	mlx5_destroy_flow_table(ndev->rxft);
1470 }
1471 
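/* Handle VIRTIO_NET_CTRL_MAC commands. For MAC_ADDR_SET the new address is
 * programmed into the MPFS table and the RX steering rules are recreated; on
 * failure the original address is restored.
 */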
1472 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1473 {
1474 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1475 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1476 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1477 	struct mlx5_core_dev *pfmdev;
1478 	size_t read;
1479 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1480 
1481 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1482 	switch (cmd) {
1483 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1484 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1485 		if (read != ETH_ALEN)
1486 			break;
1487 
1488 		if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
1489 			status = VIRTIO_NET_OK;
1490 			break;
1491 		}
1492 
1493 		if (is_zero_ether_addr(mac))
1494 			break;
1495 
1496 		if (!is_zero_ether_addr(ndev->config.mac)) {
1497 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1498 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1499 					       ndev->config.mac);
1500 				break;
1501 			}
1502 		}
1503 
1504 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1505 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1506 				       mac);
1507 			break;
1508 		}
1509 
1510 		/* Back up the original MAC address so that, if adding the forward
1511 		 * rules fails, we can restore it.
1512 		 */
1513 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
1514 
1515 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1516 
1517 		/* We need to recreate the flow table entry so that packets can be forwarded again.
1518 		 */
1519 		remove_fwd_to_tir(ndev);
1520 
1521 		if (add_fwd_to_tir(ndev)) {
1522 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
1523 
1524 			/* Although this path is rarely taken, we still need to double check */
1525 			if (is_zero_ether_addr(mac_back)) {
1526 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
1527 				break;
1528 			}
1529 
1530 			/* Try to restore the original MAC address to the MPFS table, and try to restore
1531 			 * the forward rule entry.
1532 			 */
1533 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1534 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
1535 					       ndev->config.mac);
1536 			}
1537 
1538 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
1539 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
1540 					       mac_back);
1541 			}
1542 
1543 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
1544 
1545 			if (add_fwd_to_tir(ndev))
1546 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
1547 
1548 			break;
1549 		}
1550 
1551 		status = VIRTIO_NET_OK;
1552 		break;
1553 
1554 	default:
1555 		break;
1556 	}
1557 
1558 	return status;
1559 }
1560 
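/* Change the number of active queue pairs. When shrinking, update the RQT
 * first and then tear down the excess VQs; when growing, set up the new VQs
 * first and then update the RQT. Roll back on failure.
 */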
1561 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1562 {
1563 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1564 	int cur_qps = ndev->cur_num_vqs / 2;
1565 	int err;
1566 	int i;
1567 
1568 	if (cur_qps > newqps) {
1569 		err = modify_rqt(ndev, 2 * newqps);
1570 		if (err)
1571 			return err;
1572 
1573 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1574 			teardown_vq(ndev, &ndev->vqs[i]);
1575 
1576 		ndev->cur_num_vqs = 2 * newqps;
1577 	} else {
1578 		ndev->cur_num_vqs = 2 * newqps;
1579 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1580 			err = setup_vq(ndev, &ndev->vqs[i]);
1581 			if (err)
1582 				goto clean_added;
1583 		}
1584 		err = modify_rqt(ndev, 2 * newqps);
1585 		if (err)
1586 			goto clean_added;
1587 	}
1588 	return 0;
1589 
1590 clean_added:
1591 	for (--i; i >= 2 * cur_qps; --i)
1592 		teardown_vq(ndev, &ndev->vqs[i]);
1593 
1594 	ndev->cur_num_vqs = 2 * cur_qps;
1595 
1596 	return err;
1597 }
1598 
1599 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1600 {
1601 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1602 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1603 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1604 	struct virtio_net_ctrl_mq mq;
1605 	size_t read;
1606 	u16 newqps;
1607 
1608 	switch (cmd) {
1609 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1610 		/* This mq feature check aligns with pre-existing userspace
1611 		 * implementation.
1612 		 *
1613 		 * Without it, an untrusted driver could fake a multiqueue config
1614 		 * request down to a non-mq device, which may cause the kernel to
1615 		 * panic due to uninitialized resources for extra vqs. Even with
1616 		 * a well behaved guest driver, changing the number of vqs on a
1617 		 * non-mq device is not expected to be allowed.
1618 		 */
1619 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
1620 			break;
1621 
1622 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1623 		if (read != sizeof(mq))
1624 			break;
1625 
1626 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1627 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1628 		    newqps > mlx5_vdpa_max_qps(mvdev->max_vqs))
1629 			break;
1630 
1631 		if (ndev->cur_num_vqs == 2 * newqps) {
1632 			status = VIRTIO_NET_OK;
1633 			break;
1634 		}
1635 
1636 		if (!change_num_qps(mvdev, newqps))
1637 			status = VIRTIO_NET_OK;
1638 
1639 		break;
1640 	default:
1641 		break;
1642 	}
1643 
1644 	return status;
1645 }
1646 
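/* Work handler for control virtqueue kicks: fetch one control command from
 * the ring, dispatch it, push back the status and re-queue the work so any
 * remaining descriptors are handled later.
 */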
1647 static void mlx5_cvq_kick_handler(struct work_struct *work)
1648 {
1649 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1650 	struct virtio_net_ctrl_hdr ctrl;
1651 	struct mlx5_vdpa_wq_ent *wqent;
1652 	struct mlx5_vdpa_dev *mvdev;
1653 	struct mlx5_control_vq *cvq;
1654 	struct mlx5_vdpa_net *ndev;
1655 	size_t read, write;
1656 	int err;
1657 
1658 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
1659 	mvdev = wqent->mvdev;
1660 	ndev = to_mlx5_vdpa_ndev(mvdev);
1661 	cvq = &mvdev->cvq;
1662 
1663 	mutex_lock(&ndev->reslock);
1664 
1665 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1666 		goto out;
1667 
1668 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1669 		goto out;
1670 
1671 	if (!cvq->ready)
1672 		goto out;
1673 
1674 	while (true) {
1675 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
1676 					   GFP_ATOMIC);
1677 		if (err <= 0)
1678 			break;
1679 
1680 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
1681 		if (read != sizeof(ctrl))
1682 			break;
1683 
1684 		switch (ctrl.class) {
1685 		case VIRTIO_NET_CTRL_MAC:
1686 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
1687 			break;
1688 		case VIRTIO_NET_CTRL_MQ:
1689 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
1690 			break;
1691 
1692 		default:
1693 			break;
1694 		}
1695 
1696 		/* Make sure data is written before advancing index */
1697 		smp_wmb();
1698 
1699 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
1700 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
1701 		vringh_kiov_cleanup(&cvq->riov);
1702 		vringh_kiov_cleanup(&cvq->wiov);
1703 
1704 		if (vringh_need_notify_iotlb(&cvq->vring))
1705 			vringh_notify(&cvq->vring);
1706 
1707 		queue_work(mvdev->wq, &wqent->work);
1708 		break;
1709 	}
1710 
1711 out:
1712 	mutex_unlock(&ndev->reslock);
1713 }
1714 
1715 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1716 {
1717 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1718 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1719 	struct mlx5_vdpa_virtqueue *mvq;
1720 
1721 	if (!is_index_valid(mvdev, idx))
1722 		return;
1723 
1724 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
1725 		if (!mvdev->wq || !mvdev->cvq.ready)
1726 			return;
1727 
1728 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
1729 		return;
1730 	}
1731 
1732 	mvq = &ndev->vqs[idx];
1733 	if (unlikely(!mvq->ready))
1734 		return;
1735 
1736 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1737 }
1738 
1739 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1740 				    u64 driver_area, u64 device_area)
1741 {
1742 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1743 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1744 	struct mlx5_vdpa_virtqueue *mvq;
1745 
1746 	if (!is_index_valid(mvdev, idx))
1747 		return -EINVAL;
1748 
1749 	if (is_ctrl_vq_idx(mvdev, idx)) {
1750 		mvdev->cvq.desc_addr = desc_area;
1751 		mvdev->cvq.device_addr = device_area;
1752 		mvdev->cvq.driver_addr = driver_area;
1753 		return 0;
1754 	}
1755 
1756 	mvq = &ndev->vqs[idx];
1757 	mvq->desc_addr = desc_area;
1758 	mvq->device_addr = device_area;
1759 	mvq->driver_addr = driver_area;
1760 	return 0;
1761 }
1762 
1763 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1764 {
1765 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1766 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1767 	struct mlx5_vdpa_virtqueue *mvq;
1768 
1769 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
1770 		return;
1771 
1772 	mvq = &ndev->vqs[idx];
1773 	mvq->num_ent = num;
1774 }
1775 
1776 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1777 {
1778 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1779 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1780 
1781 	ndev->event_cbs[idx] = *cb;
1782 }
1783 
1784 static void mlx5_cvq_notify(struct vringh *vring)
1785 {
1786 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
1787 
1788 	if (!cvq->event_cb.callback)
1789 		return;
1790 
1791 	cvq->event_cb.callback(cvq->event_cb.private);
1792 }
1793 
1794 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
1795 {
1796 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1797 
1798 	cvq->ready = ready;
1799 	if (!ready)
1800 		return;
1801 
1802 	cvq->vring.notify = mlx5_cvq_notify;
1803 }
1804 
1805 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1806 {
1807 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1808 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1809 	struct mlx5_vdpa_virtqueue *mvq;
1810 
1811 	if (!mvdev->actual_features)
1812 		return;
1813 
1814 	if (!is_index_valid(mvdev, idx))
1815 		return;
1816 
1817 	if (is_ctrl_vq_idx(mvdev, idx)) {
1818 		set_cvq_ready(mvdev, ready);
1819 		return;
1820 	}
1821 
1822 	mvq = &ndev->vqs[idx];
1823 	if (!ready)
1824 		suspend_vq(ndev, mvq);
1825 
1826 	mvq->ready = ready;
1827 }
1828 
1829 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1830 {
1831 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1832 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1833 
1834 	if (!is_index_valid(mvdev, idx))
1835 		return false;
1836 
1837 	if (is_ctrl_vq_idx(mvdev, idx))
1838 		return mvdev->cvq.ready;
1839 
1840 	return ndev->vqs[idx].ready;
1841 }
1842 
1843 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1844 				  const struct vdpa_vq_state *state)
1845 {
1846 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1847 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1848 	struct mlx5_vdpa_virtqueue *mvq;
1849 
1850 	if (!is_index_valid(mvdev, idx))
1851 		return -EINVAL;
1852 
1853 	if (is_ctrl_vq_idx(mvdev, idx)) {
1854 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
1855 		return 0;
1856 	}
1857 
1858 	mvq = &ndev->vqs[idx];
1859 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1860 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1861 		return -EINVAL;
1862 	}
1863 
1864 	mvq->used_idx = state->split.avail_index;
1865 	mvq->avail_idx = state->split.avail_index;
1866 	return 0;
1867 }
1868 
1869 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1870 {
1871 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1872 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1873 	struct mlx5_vdpa_virtqueue *mvq;
1874 	struct mlx5_virtq_attr attr;
1875 	int err;
1876 
1877 	if (!is_index_valid(mvdev, idx))
1878 		return -EINVAL;
1879 
1880 	if (is_ctrl_vq_idx(mvdev, idx)) {
1881 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
1882 		return 0;
1883 	}
1884 
1885 	mvq = &ndev->vqs[idx];
1886 	/* If the virtqueue object was destroyed, use the value saved by
1887 	 * suspend_vq() just before it was torn down. This caters to
1888 	 * userspace that needs the index after the vq has been stopped.
1889 	 */
1890 	if (!mvq->initialized) {
1891 		/* Firmware returns a wrong value for the available index.
1892 		 * Since both values should be identical, we take the value of
1893 		 * used_idx which is reported correctly.
1894 		 */
1895 		state->split.avail_index = mvq->used_idx;
1896 		return 0;
1897 	}
1898 
1899 	err = query_virtqueue(ndev, mvq, &attr);
1900 	if (err) {
1901 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1902 		return err;
1903 	}
1904 	state->split.avail_index = attr.used_index;
1905 	return 0;
1906 }
1907 
1908 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1909 {
1910 	return PAGE_SIZE;
1911 }
1912 
1913 enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
1914 	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
1915 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
1916 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
1917 };
1918 
1919 static u64 mlx_to_virtio_features(u16 dev_features)
1920 {
1921 	u64 result = 0;
1922 
1923 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1924 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1925 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1926 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1927 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1928 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1929 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1930 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1931 
1932 	return result;
1933 }
1934 
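/* Build the device feature set: offloads reported by the VDPA emulation
 * capability, plus features this driver always provides or emulates
 * (ACCESS_PLATFORM, CTRL_VQ, CTRL_MAC_ADDR, MQ, STATUS and MTU).
 */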
1935 static u64 get_supported_features(struct mlx5_core_dev *mdev)
1936 {
1937 	u64 mlx_vdpa_features = 0;
1938 	u16 dev_features;
1939 
1940 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
1941 	mlx_vdpa_features |= mlx_to_virtio_features(dev_features);
1942 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
1943 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1944 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1945 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
1946 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
1947 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
1948 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
1949 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
1950 
1951 	return mlx_vdpa_features;
1952 }
1953 
1954 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
1955 {
1956 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1957 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1958 
1959 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1960 	return ndev->mvdev.mlx_features;
1961 }
1962 
1963 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1964 {
1965 	/* Minimum features to expect */
1966 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1967 		return -EOPNOTSUPP;
1968 
1969 	/* Double check the feature combination sent down by the driver.
1970 	 * Fail feature sets that are missing a feature they depend on.
1971 	 *
1972 	 * Per the VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
1973 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
1974 	 * By failing invalid feature sets sent down by untrusted drivers,
1975 	 * the assumptions made in is_index_valid() and is_ctrl_vq_idx()
1976 	 * are guaranteed to hold.
1977 	 */
1978 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
1979 	    BIT_ULL(VIRTIO_NET_F_MQ))
1980 		return -EINVAL;
1981 
1982 	return 0;
1983 }
1984 
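/* Create hardware virtqueue objects for all data queues and, if CTRL_VQ was
 * negotiated, initialize the vringh instance backing the software control
 * virtqueue. On error, already created virtqueues are torn down.
 */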
1985 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
1986 {
1987 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1988 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1989 	int err;
1990 	int i;
1991 
1992 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) {
1993 		err = setup_vq(ndev, &ndev->vqs[i]);
1994 		if (err)
1995 			goto err_vq;
1996 	}
1997 
1998 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
1999 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
2000 					MLX5_CVQ_MAX_ENT, false,
2001 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
2002 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
2003 					(struct vring_used *)(uintptr_t)cvq->device_addr);
2004 		if (err)
2005 			goto err_vq;
2006 	}
2007 
2008 	return 0;
2009 
2010 err_vq:
2011 	for (--i; i >= 0; i--)
2012 		teardown_vq(ndev, &ndev->vqs[i]);
2013 
2014 	return err;
2015 }
2016 
2017 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2018 {
2019 	struct mlx5_vdpa_virtqueue *mvq;
2020 	int i;
2021 
2022 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
2023 		mvq = &ndev->vqs[i];
2024 		if (!mvq->initialized)
2025 			continue;
2026 
2027 		teardown_vq(ndev, mvq);
2028 	}
2029 }
2030 
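/* Recompute the highest valid virtqueue index for the negotiated features.
 * Resulting layouts:
 *   MQ + CTRL_VQ: data vqs 0..max_vqs - 1, CVQ at index max_vqs
 *   CTRL_VQ only: data vqs 0 and 1, CVQ at index 2
 *   neither:      data vqs 0 and 1 only
 */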
2031 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2032 {
2033 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2034 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2035 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2036 			mvdev->max_idx = mvdev->max_vqs;
2037 		} else {
2038 			/* Only CVQ supportted. data virtqueues occupy indices 0 and 1.
2039 			 * CVQ gets index 2
2040 			 */
2041 			mvdev->max_idx = 2;
2042 		}
2043 	} else {
2044 		/* Two data virtqueues only: one for rx and one for tx */
2045 		mvdev->max_idx = 1;
2046 	}
2047 }
2048 
2049 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2050 {
2051 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2052 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2053 	int err;
2054 
2055 	print_features(mvdev, features, true);
2056 
2057 	err = verify_driver_features(mvdev, features);
2058 	if (err)
2059 		return err;
2060 
2061 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2062 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2063 		ndev->cur_num_vqs = 2 * mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2064 	else
2065 		ndev->cur_num_vqs = 2;
2066 
2067 	update_cvq_info(mvdev);
2068 	return err;
2069 }
2070 
2071 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2072 {
2073 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2074 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2075 
2076 	ndev->config_cb = *cb;
2077 }
2078 
2079 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2080 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2081 {
2082 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2083 }
2084 
2085 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2086 {
2087 	return VIRTIO_ID_NET;
2088 }
2089 
2090 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2091 {
2092 	return PCI_VENDOR_ID_MELLANOX;
2093 }
2094 
2095 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2096 {
2097 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2098 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2099 
2100 	print_status(mvdev, ndev->mvdev.status, false);
2101 	return ndev->mvdev.status;
2102 }
2103 
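/* State save/restore used across a memory map change: the current indices,
 * size, addresses and ready state of each virtqueue are recorded in struct
 * mlx5_vq_restore_info so the virtqueue objects can be recreated later by
 * restore_channels_info().
 */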
2104 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2105 {
2106 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2107 	struct mlx5_virtq_attr attr = {};
2108 	int err;
2109 
2110 	if (mvq->initialized) {
2111 		err = query_virtqueue(ndev, mvq, &attr);
2112 		if (err)
2113 			return err;
2114 	}
2115 
2116 	ri->avail_index = attr.available_index;
2117 	ri->used_index = attr.used_index;
2118 	ri->ready = mvq->ready;
2119 	ri->num_ent = mvq->num_ent;
2120 	ri->desc_addr = mvq->desc_addr;
2121 	ri->device_addr = mvq->device_addr;
2122 	ri->driver_addr = mvq->driver_addr;
2123 	ri->restore = true;
2124 	return 0;
2125 }
2126 
2127 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2128 {
2129 	int i;
2130 
2131 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2132 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2133 		save_channel_info(ndev, &ndev->vqs[i]);
2134 	}
2135 	return 0;
2136 }
2137 
2138 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2139 {
2140 	int i;
2141 
2142 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2143 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2144 }
2145 
2146 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2147 {
2148 	struct mlx5_vdpa_virtqueue *mvq;
2149 	struct mlx5_vq_restore_info *ri;
2150 	int i;
2151 
2152 	mlx5_clear_vqs(ndev);
2153 	init_mvqs(ndev);
2154 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2155 		mvq = &ndev->vqs[i];
2156 		ri = &mvq->ri;
2157 		if (!ri->restore)
2158 			continue;
2159 
2160 		mvq->avail_idx = ri->avail_index;
2161 		mvq->used_idx = ri->used_index;
2162 		mvq->ready = ri->ready;
2163 		mvq->num_ent = ri->num_ent;
2164 		mvq->desc_addr = ri->desc_addr;
2165 		mvq->device_addr = ri->device_addr;
2166 		mvq->driver_addr = ri->driver_addr;
2167 	}
2168 }
2169 
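/* Handle a change of the IOTLB mapping while the device may be live: suspend
 * the virtqueues, save their state, tear down the driver objects, rebuild the
 * memory key for the new mapping and, if DRIVER_OK is set, restore the
 * virtqueues from the saved state.
 */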
2170 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
2171 {
2172 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2173 	int err;
2174 
2175 	suspend_vqs(ndev);
2176 	err = save_channels_info(ndev);
2177 	if (err)
2178 		goto err_mr;
2179 
2180 	teardown_driver(ndev);
2181 	mlx5_vdpa_destroy_mr(mvdev);
2182 	err = mlx5_vdpa_create_mr(mvdev, iotlb);
2183 	if (err)
2184 		goto err_mr;
2185 
2186 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2187 		goto err_mr;
2188 
2189 	restore_channels_info(ndev);
2190 	err = setup_driver(mvdev);
2191 	if (err)
2192 		goto err_setup;
2193 
2194 	return 0;
2195 
2196 err_setup:
2197 	mlx5_vdpa_destroy_mr(mvdev);
2198 err_mr:
2199 	return err;
2200 }
2201 
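/* Create the datapath objects (virtqueues, RQT, TIR and the TIR forwarding
 * rule) once the driver is ready; teardown_driver() undoes them in reverse
 * order.
 */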
2202 /* reslock must be held for this function */
2203 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2204 {
2205 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2206 	int err;
2207 
2208 	WARN_ON(!mutex_is_locked(&ndev->reslock));
2209 
2210 	if (ndev->setup) {
2211 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2212 		err = 0;
2213 		goto out;
2214 	}
2215 	err = setup_virtqueues(mvdev);
2216 	if (err) {
2217 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2218 		goto out;
2219 	}
2220 
2221 	err = create_rqt(ndev);
2222 	if (err) {
2223 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2224 		goto err_rqt;
2225 	}
2226 
2227 	err = create_tir(ndev);
2228 	if (err) {
2229 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2230 		goto err_tir;
2231 	}
2232 
2233 	err = add_fwd_to_tir(ndev);
2234 	if (err) {
2235 		mlx5_vdpa_warn(mvdev, "add_fwd_to_tir\n");
2236 		goto err_fwd;
2237 	}
2238 	ndev->setup = true;
2239 
2240 	return 0;
2241 
2242 err_fwd:
2243 	destroy_tir(ndev);
2244 err_tir:
2245 	destroy_rqt(ndev);
2246 err_rqt:
2247 	teardown_virtqueues(ndev);
2248 out:
2249 	return err;
2250 }
2251 
2252 /* reslock must be held for this function */
2253 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2254 {
2255 
2256 	WARN_ON(!mutex_is_locked(&ndev->reslock));
2257 
2258 	if (!ndev->setup)
2259 		return;
2260 
2261 	remove_fwd_to_tir(ndev);
2262 	destroy_tir(ndev);
2263 	destroy_rqt(ndev);
2264 	teardown_virtqueues(ndev);
2265 	ndev->setup = false;
2266 }
2267 
2268 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2269 {
2270 	int i;
2271 
2272 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2273 		ndev->vqs[i].ready = false;
2274 
2275 	ndev->mvdev.cvq.ready = false;
2276 }
2277 
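/* Status write handler: the only transition acted on here is setting
 * DRIVER_OK, which triggers setup_driver(). Clearing DRIVER_OK without a
 * reset is unexpected and only logs a warning.
 */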
2278 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2279 {
2280 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2281 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2282 	int err;
2283 
2284 	print_status(mvdev, status, true);
2285 
2286 	mutex_lock(&ndev->reslock);
2287 
2288 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2289 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2290 			err = setup_driver(mvdev);
2291 			if (err) {
2292 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2293 				goto err_setup;
2294 			}
2295 		} else {
2296 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2297 			goto err_clear;
2298 		}
2299 	}
2300 
2301 	ndev->mvdev.status = status;
2302 	mutex_unlock(&ndev->reslock);
2303 	return;
2304 
2305 err_setup:
2306 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2307 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2308 err_clear:
2309 	mutex_unlock(&ndev->reslock);
2310 }
2311 
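/* Device reset: tear down driver objects, clear per-vq ready flags, status,
 * negotiated features and callbacks, destroy the memory key and bump the
 * config generation counter.
 */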
2312 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2313 {
2314 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2315 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2316 
2317 	print_status(mvdev, 0, true);
2318 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2319 
2320 	mutex_lock(&ndev->reslock);
2321 	teardown_driver(ndev);
2322 	clear_vqs_ready(ndev);
2323 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2324 	ndev->mvdev.status = 0;
2325 	ndev->cur_num_vqs = 0;
2326 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
2327 	ndev->mvdev.actual_features = 0;
2328 	++mvdev->generation;
2329 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2330 		if (mlx5_vdpa_create_mr(mvdev, NULL))
2331 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2332 	}
2333 	mutex_unlock(&ndev->reslock);
2334 
2335 	return 0;
2336 }
2337 
2338 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
2339 {
2340 	return sizeof(struct virtio_net_config);
2341 }
2342 
2343 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2344 				 unsigned int len)
2345 {
2346 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2347 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2348 
2349 	if (offset + len <= sizeof(struct virtio_net_config))
2350 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2351 }
2352 
2353 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
2354 				 unsigned int len)
2355 {
2356 	/* not supported */
2357 }
2358 
2359 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2360 {
2361 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2362 
2363 	return mvdev->generation;
2364 }
2365 
2366 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
2367 {
2368 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2369 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2370 	bool change_map;
2371 	int err;
2372 
2373 	mutex_lock(&ndev->reslock);
2374 
2375 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
2376 	if (err) {
2377 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2378 		goto err;
2379 	}
2380 
2381 	if (change_map)
2382 		err = mlx5_vdpa_change_map(mvdev, iotlb);
2383 
2384 err:
2385 	mutex_unlock(&ndev->reslock);
2386 	return err;
2387 }
2388 
2389 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2390 {
2391 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2392 	struct mlx5_core_dev *pfmdev;
2393 	struct mlx5_vdpa_net *ndev;
2394 
2395 	ndev = to_mlx5_vdpa_ndev(mvdev);
2396 
2397 	free_resources(ndev);
2398 	mlx5_vdpa_destroy_mr(mvdev);
2399 	if (!is_zero_ether_addr(ndev->config.mac)) {
2400 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2401 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2402 	}
2403 	mlx5_vdpa_free_resources(&ndev->mvdev);
2404 	mutex_destroy(&ndev->reslock);
2405 	kfree(ndev->event_cbs);
2406 	kfree(ndev->vqs);
2407 }
2408 
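/* Expose the doorbell page for direct notification of data virtqueues. When
 * the SF BAR granularity is smaller than PAGE_SIZE no area is returned (see
 * the comment below), and callers are expected to fall back to the kick_vq
 * callback.
 */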
2409 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2410 {
2411 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2412 	struct vdpa_notification_area ret = {};
2413 	struct mlx5_vdpa_net *ndev;
2414 	phys_addr_t addr;
2415 
2416 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2417 		return ret;
2418 
2419 	/* If the SF BAR size is smaller than PAGE_SIZE, do not use direct
2420 	 * notification, to avoid the risk of mapping pages that contain the
2421 	 * BARs of more than one SF.
2422 	 */
2423 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2424 		return ret;
2425 
2426 	ndev = to_mlx5_vdpa_ndev(mvdev);
2427 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2428 	ret.addr = addr;
2429 	ret.size = PAGE_SIZE;
2430 	return ret;
2431 }
2432 
2433 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
2434 {
2435 	return -EOPNOTSUPP;
2436 }
2437 
2438 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
2439 {
2440 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2441 
2442 	return mvdev->actual_features;
2443 }
2444 
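/* vdpa_config_ops implemented by this driver; bound to every device created
 * through mlx5_vdpa_dev_add().
 */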
2445 static const struct vdpa_config_ops mlx5_vdpa_ops = {
2446 	.set_vq_address = mlx5_vdpa_set_vq_address,
2447 	.set_vq_num = mlx5_vdpa_set_vq_num,
2448 	.kick_vq = mlx5_vdpa_kick_vq,
2449 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
2450 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
2451 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
2452 	.set_vq_state = mlx5_vdpa_set_vq_state,
2453 	.get_vq_state = mlx5_vdpa_get_vq_state,
2454 	.get_vq_notification = mlx5_get_vq_notification,
2455 	.get_vq_irq = mlx5_get_vq_irq,
2456 	.get_vq_align = mlx5_vdpa_get_vq_align,
2457 	.get_device_features = mlx5_vdpa_get_device_features,
2458 	.set_driver_features = mlx5_vdpa_set_driver_features,
2459 	.get_driver_features = mlx5_vdpa_get_driver_features,
2460 	.set_config_cb = mlx5_vdpa_set_config_cb,
2461 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
2462 	.get_device_id = mlx5_vdpa_get_device_id,
2463 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
2464 	.get_status = mlx5_vdpa_get_status,
2465 	.set_status = mlx5_vdpa_set_status,
2466 	.reset = mlx5_vdpa_reset,
2467 	.get_config_size = mlx5_vdpa_get_config_size,
2468 	.get_config = mlx5_vdpa_get_config,
2469 	.set_config = mlx5_vdpa_set_config,
2470 	.get_generation = mlx5_vdpa_get_generation,
2471 	.set_map = mlx5_vdpa_set_map,
2472 	.free = mlx5_vdpa_free,
2473 };
2474 
2475 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
2476 {
2477 	u16 hw_mtu;
2478 	int err;
2479 
2480 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
2481 	if (err)
2482 		return err;
2483 
2484 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
2485 	return 0;
2486 }
2487 
2488 static int alloc_resources(struct mlx5_vdpa_net *ndev)
2489 {
2490 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2491 	int err;
2492 
2493 	if (res->valid) {
2494 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
2495 		return -EEXIST;
2496 	}
2497 
2498 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
2499 	if (err)
2500 		return err;
2501 
2502 	err = create_tis(ndev);
2503 	if (err)
2504 		goto err_tis;
2505 
2506 	res->valid = true;
2507 
2508 	return 0;
2509 
2510 err_tis:
2511 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2512 	return err;
2513 }
2514 
2515 static void free_resources(struct mlx5_vdpa_net *ndev)
2516 {
2517 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2518 
2519 	if (!res->valid)
2520 		return;
2521 
2522 	destroy_tis(ndev);
2523 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2524 	res->valid = false;
2525 }
2526 
2527 static void init_mvqs(struct mlx5_vdpa_net *ndev)
2528 {
2529 	struct mlx5_vdpa_virtqueue *mvq;
2530 	int i;
2531 
2532 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
2533 		mvq = &ndev->vqs[i];
2534 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2535 		mvq->index = i;
2536 		mvq->ndev = ndev;
2537 		mvq->fwqp.fw = true;
2538 	}
2539 	for (; i < ndev->mvdev.max_vqs; i++) {
2540 		mvq = &ndev->vqs[i];
2541 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2542 		mvq->index = i;
2543 		mvq->ndev = ndev;
2544 	}
2545 }
2546 
2547 struct mlx5_vdpa_mgmtdev {
2548 	struct vdpa_mgmt_dev mgtdev;
2549 	struct mlx5_adev *madev;
2550 	struct mlx5_vdpa_net *ndev;
2551 };
2552 
2553 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2554 {
2555 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2556 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2557 	int err;
2558 
2559 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2560 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2561 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2562 	if (vport)
2563 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2564 
2565 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2566 	if (err)
2567 		return 0;
2568 
2569 	return MLX5_GET(query_vport_state_out, out, state);
2570 }
2571 
2572 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2573 {
2574 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2575 	    VPORT_STATE_UP)
2576 		return true;
2577 
2578 	return false;
2579 }
2580 
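/* Link state propagation: event_handler() below listens for port change
 * events and queues update_carrier(), which refreshes VIRTIO_NET_S_LINK_UP in
 * the config space and invokes the config change callback.
 */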
2581 static void update_carrier(struct work_struct *work)
2582 {
2583 	struct mlx5_vdpa_wq_ent *wqent;
2584 	struct mlx5_vdpa_dev *mvdev;
2585 	struct mlx5_vdpa_net *ndev;
2586 
2587 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2588 	mvdev = wqent->mvdev;
2589 	ndev = to_mlx5_vdpa_ndev(mvdev);
2590 	if (get_link_state(mvdev))
2591 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2592 	else
2593 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2594 
2595 	if (ndev->config_cb.callback)
2596 		ndev->config_cb.callback(ndev->config_cb.private);
2597 
2598 	kfree(wqent);
2599 }
2600 
2601 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2602 {
2603 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2604 	struct mlx5_eqe *eqe = param;
2605 	int ret = NOTIFY_DONE;
2606 	struct mlx5_vdpa_wq_ent *wqent;
2607 
2608 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2609 		switch (eqe->sub_type) {
2610 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2611 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2612 			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2613 			if (!wqent)
2614 				return NOTIFY_DONE;
2615 
2616 			wqent->mvdev = &ndev->mvdev;
2617 			INIT_WORK(&wqent->work, update_carrier);
2618 			queue_work(ndev->mvdev.wq, &wqent->work);
2619 			ret = NOTIFY_OK;
2620 			break;
2621 		default:
2622 			return NOTIFY_DONE;
2623 		}
2624 		return ret;
2625 	}
2626 	return ret;
2627 }
2628 
2629 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
2630 {
2631 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
2632 	void *in;
2633 	int err;
2634 
2635 	in = kvzalloc(inlen, GFP_KERNEL);
2636 	if (!in)
2637 		return -ENOMEM;
2638 
2639 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
2640 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
2641 		 mtu + MLX5V_ETH_HARD_MTU);
2642 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
2643 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
2644 
2645 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
2646 
2647 	kvfree(in);
2648 	return err;
2649 }
2650 
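/* Management device dev_add callback: validate device capabilities, apply the
 * optional MTU, MAC address and max-VQ-pair attributes from the add request,
 * then allocate and register the vdpa net device.
 */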
2651 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
2652 			     const struct vdpa_dev_set_config *add_config)
2653 {
2654 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2655 	struct virtio_net_config *config;
2656 	struct mlx5_core_dev *pfmdev;
2657 	struct mlx5_vdpa_dev *mvdev;
2658 	struct mlx5_vdpa_net *ndev;
2659 	struct mlx5_core_dev *mdev;
2660 	u32 max_vqs;
2661 	u16 mtu;
2662 	int err;
2663 
2664 	if (mgtdev->ndev)
2665 		return -ENOSPC;
2666 
2667 	mdev = mgtdev->madev->mdev;
2668 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
2669 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
2670 		dev_warn(mdev->device, "missing support for split virtqueues\n");
2671 		return -EOPNOTSUPP;
2672 	}
2673 
2674 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
2675 	if (max_vqs < 2) {
2676 		dev_warn(mdev->device,
2677 			 "%d virtqueues are supported. At least 2 are required\n",
2678 			 max_vqs);
2679 		return -EAGAIN;
2680 	}
2681 
2682 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
2683 		if (add_config->net.max_vq_pairs > max_vqs / 2)
2684 			return -EINVAL;
2685 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
2686 	} else {
2687 		max_vqs = 2;
2688 	}
2689 
2690 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2691 				 name, false);
2692 	if (IS_ERR(ndev))
2693 		return PTR_ERR(ndev);
2694 
2695 	ndev->mvdev.mlx_features = mgtdev->mgtdev.supported_features;
2696 	ndev->mvdev.max_vqs = max_vqs;
2697 	mvdev = &ndev->mvdev;
2698 	mvdev->mdev = mdev;
2699 
2700 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
2701 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
2702 	if (!ndev->vqs || !ndev->event_cbs) {
2703 		err = -ENOMEM;
2704 		goto err_alloc;
2705 	}
2706 
2707 	init_mvqs(ndev);
2708 	mutex_init(&ndev->reslock);
2709 	config = &ndev->config;
2710 
2711 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
2712 		err = config_func_mtu(mdev, add_config->net.mtu);
2713 		if (err)
2714 			goto err_mtu;
2715 	}
2716 
2717 	err = query_mtu(mdev, &mtu);
2718 	if (err)
2719 		goto err_mtu;
2720 
2721 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
2722 
2723 	if (get_link_state(mvdev))
2724 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2725 	else
2726 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2727 
2728 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
2729 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
2730 	} else {
2731 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2732 		if (err)
2733 			goto err_mtu;
2734 	}
2735 
2736 	if (!is_zero_ether_addr(config->mac)) {
2737 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2738 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2739 		if (err)
2740 			goto err_mtu;
2741 
2742 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2743 	}
2744 
2745 	config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs));
2746 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
2747 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2748 	if (err)
2749 		goto err_mpfs;
2750 
2751 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2752 		err = mlx5_vdpa_create_mr(mvdev, NULL);
2753 		if (err)
2754 			goto err_res;
2755 	}
2756 
2757 	err = alloc_resources(ndev);
2758 	if (err)
2759 		goto err_mr;
2760 
2761 	ndev->cvq_ent.mvdev = mvdev;
2762 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
2763 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
2764 	if (!mvdev->wq) {
2765 		err = -ENOMEM;
2766 		goto err_res2;
2767 	}
2768 
2769 	ndev->nb.notifier_call = event_handler;
2770 	mlx5_notifier_register(mdev, &ndev->nb);
2771 	mvdev->vdev.mdev = &mgtdev->mgtdev;
2772 	err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs) + 1);
2773 	if (err)
2774 		goto err_reg;
2775 
2776 	mgtdev->ndev = ndev;
2777 	return 0;
2778 
2779 err_reg:
2780 	destroy_workqueue(mvdev->wq);
2781 err_res2:
2782 	free_resources(ndev);
2783 err_mr:
2784 	mlx5_vdpa_destroy_mr(mvdev);
2785 err_res:
2786 	mlx5_vdpa_free_resources(&ndev->mvdev);
2787 err_mpfs:
2788 	if (!is_zero_ether_addr(config->mac))
2789 		mlx5_mpfs_del_mac(pfmdev, config->mac);
2790 err_mtu:
2791 	mutex_destroy(&ndev->reslock);
2792 err_alloc:
2793 	put_device(&mvdev->vdev.dev);
2794 	return err;
2795 }
2796 
2797 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
2798 {
2799 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2800 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
2801 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2802 	struct workqueue_struct *wq;
2803 
2804 	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
2805 	wq = mvdev->wq;
2806 	mvdev->wq = NULL;
2807 	destroy_workqueue(wq);
2808 	_vdpa_unregister_device(dev);
2809 	mgtdev->ndev = NULL;
2810 }
2811 
2812 static const struct vdpa_mgmtdev_ops mdev_ops = {
2813 	.dev_add = mlx5_vdpa_dev_add,
2814 	.dev_del = mlx5_vdpa_dev_del,
2815 };
2816 
2817 static struct virtio_device_id id_table[] = {
2818 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2819 	{ 0 },
2820 };
2821 
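/* Auxiliary driver glue: each mlx5 ".vnet" auxiliary device registers one
 * vdpa management device through which vdpa net devices can be added and
 * removed.
 */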
2822 static int mlx5v_probe(struct auxiliary_device *adev,
2823 		       const struct auxiliary_device_id *id)
2824 
2825 {
2826 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
2827 	struct mlx5_core_dev *mdev = madev->mdev;
2828 	struct mlx5_vdpa_mgmtdev *mgtdev;
2829 	int err;
2830 
2831 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
2832 	if (!mgtdev)
2833 		return -ENOMEM;
2834 
2835 	mgtdev->mgtdev.ops = &mdev_ops;
2836 	mgtdev->mgtdev.device = mdev->device;
2837 	mgtdev->mgtdev.id_table = id_table;
2838 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
2839 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
2840 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU);
2841 	mgtdev->mgtdev.max_supported_vqs =
2842 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
2843 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
2844 	mgtdev->madev = madev;
2845 
2846 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
2847 	if (err)
2848 		goto reg_err;
2849 
2850 	auxiliary_set_drvdata(adev, mgtdev);
2851 
2852 	return 0;
2853 
2854 reg_err:
2855 	kfree(mgtdev);
2856 	return err;
2857 }
2858 
2859 static void mlx5v_remove(struct auxiliary_device *adev)
2860 {
2861 	struct mlx5_vdpa_mgmtdev *mgtdev;
2862 
2863 	mgtdev = auxiliary_get_drvdata(adev);
2864 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
2865 	kfree(mgtdev);
2866 }
2867 
2868 static const struct auxiliary_device_id mlx5v_id_table[] = {
2869 	{ .name = MLX5_ADEV_NAME ".vnet", },
2870 	{},
2871 };
2872 
2873 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2874 
2875 static struct auxiliary_driver mlx5v_driver = {
2876 	.name = "vnet",
2877 	.probe = mlx5v_probe,
2878 	.remove = mlx5v_remove,
2879 	.id_table = mlx5v_id_table,
2880 };
2881 
2882 module_auxiliary_driver(mlx5v_driver);
2883