xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision 8dda2eac)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <linux/virtio_config.h>
10 #include <linux/auxiliary_bus.h>
11 #include <linux/mlx5/cq.h>
12 #include <linux/mlx5/qp.h>
13 #include <linux/mlx5/device.h>
14 #include <linux/mlx5/driver.h>
15 #include <linux/mlx5/vport.h>
16 #include <linux/mlx5/fs.h>
17 #include <linux/mlx5/mlx5_ifc_vdpa.h>
18 #include <linux/mlx5/mpfs.h>
19 #include "mlx5_vdpa.h"
20 
21 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
22 MODULE_DESCRIPTION("Mellanox VDPA driver");
23 MODULE_LICENSE("Dual BSD/GPL");
24 
25 #define to_mlx5_vdpa_ndev(__mvdev)                                             \
26 	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
27 #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
28 
29 #define VALID_FEATURES_MASK                                                                        \
30 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
31 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
32 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
33 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
34 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
35 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
36 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
37 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
38 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
39 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
40 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
41 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
42 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
43 
44 #define VALID_STATUS_MASK                                                                          \
45 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
46 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
47 
48 struct mlx5_vdpa_net_resources {
49 	u32 tisn;
50 	u32 tdn;
51 	u32 tirn;
52 	u32 rqtn;
53 	bool valid;
54 };
55 
56 struct mlx5_vdpa_cq_buf {
57 	struct mlx5_frag_buf_ctrl fbc;
58 	struct mlx5_frag_buf frag_buf;
59 	int cqe_size;
60 	int nent;
61 };
62 
63 struct mlx5_vdpa_cq {
64 	struct mlx5_core_cq mcq;
65 	struct mlx5_vdpa_cq_buf buf;
66 	struct mlx5_db db;
67 	int cqe;
68 };
69 
70 struct mlx5_vdpa_umem {
71 	struct mlx5_frag_buf_ctrl fbc;
72 	struct mlx5_frag_buf frag_buf;
73 	int size;
74 	u32 id;
75 };
76 
77 struct mlx5_vdpa_qp {
78 	struct mlx5_core_qp mqp;
79 	struct mlx5_frag_buf frag_buf;
80 	struct mlx5_db db;
81 	u16 head;
82 	bool fw;
83 };
84 
85 struct mlx5_vq_restore_info {
86 	u32 num_ent;
87 	u64 desc_addr;
88 	u64 device_addr;
89 	u64 driver_addr;
90 	u16 avail_index;
91 	u16 used_index;
92 	bool ready;
93 	struct vdpa_callback cb;
94 	bool restore;
95 };
96 
97 struct mlx5_vdpa_virtqueue {
98 	bool ready;
99 	u64 desc_addr;
100 	u64 device_addr;
101 	u64 driver_addr;
102 	u32 num_ent;
103 	struct vdpa_callback event_cb;
104 
105 	/* Resources for implementing the notification channel from the device
106 	 * to the driver. fwqp is the firmware end of an RC connection; the
107 	 * other end is vqqp, used by the driver. cq is where completions are
108 	 * reported.
109 	 */
110 	struct mlx5_vdpa_cq cq;
111 	struct mlx5_vdpa_qp fwqp;
112 	struct mlx5_vdpa_qp vqqp;
113 
114 	/* umem resources are required for the virtqueue operation. Their use
115 	 * is internal and they must be provided by the driver.
116 	 */
117 	struct mlx5_vdpa_umem umem1;
118 	struct mlx5_vdpa_umem umem2;
119 	struct mlx5_vdpa_umem umem3;
120 
121 	bool initialized;
122 	int index;
123 	u32 virtq_id;
124 	struct mlx5_vdpa_net *ndev;
125 	u16 avail_idx;
126 	u16 used_idx;
127 	int fw_state;
128 
129 	/* keep last in the struct */
130 	struct mlx5_vq_restore_info ri;
131 };
132 
133 /* We will remove this limitation once mlx5_vdpa_alloc_resources()
134  * provides for driver space allocation
135  */
136 #define MLX5_MAX_SUPPORTED_VQS 16
137 
138 struct mlx5_vdpa_net {
139 	struct mlx5_vdpa_dev mvdev;
140 	struct mlx5_vdpa_net_resources res;
141 	struct virtio_net_config config;
142 	struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
143 
144 	/* Serialize vq resources creation and destruction. This is required
145 	 * since the memory map might change and we need to destroy and create
146 	 * resources while the driver is operational.
147 	 */
148 	struct mutex reslock;
149 	struct mlx5_flow_table *rxft;
150 	struct mlx5_fc *rx_counter;
151 	struct mlx5_flow_handle *rx_rule;
152 	bool setup;
153 	u16 mtu;
154 };
155 
156 static void free_resources(struct mlx5_vdpa_net *ndev);
157 static void init_mvqs(struct mlx5_vdpa_net *ndev);
158 static int setup_driver(struct mlx5_vdpa_net *ndev);
159 static void teardown_driver(struct mlx5_vdpa_net *ndev);
160 
161 static bool mlx5_vdpa_debug;
162 
163 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
164 	do {                                                                                       \
165 		if (features & BIT_ULL(_feature))                                                  \
166 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
167 	} while (0)
168 
169 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
170 	do {                                                                                       \
171 		if (status & (_status))                                                            \
172 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
173 	} while (0)
174 
175 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
176 {
177 	return max_vqs / 2;
178 }
179 
180 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
181 {
182 	if (status & ~VALID_STATUS_MASK)
183 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
184 			       status & ~VALID_STATUS_MASK);
185 
186 	if (!mlx5_vdpa_debug)
187 		return;
188 
189 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
190 	if (set && !status) {
191 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
192 		return;
193 	}
194 
195 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
196 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
197 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
198 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
199 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
200 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
201 }
202 
203 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
204 {
205 	if (features & ~VALID_FEATURES_MASK)
206 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
207 			       features & ~VALID_FEATURES_MASK);
208 
209 	if (!mlx5_vdpa_debug)
210 		return;
211 
212 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
213 	if (!features)
214 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
215 
216 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
217 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
218 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
219 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
220 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
221 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
222 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
223 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
224 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
225 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
226 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
227 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
228 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
229 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
230 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
231 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
232 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
233 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
234 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
235 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
236 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
237 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
238 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
239 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
240 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
241 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
242 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
243 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
244 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
245 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
247 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
248 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
249 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
250 }
251 
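/* Create a TIS in the device's transport domain. TX virtqueues are later
 * bound to it through the tisn_or_qpn field of their virtio_net_q object
 * (see create_virtqueue()).
 */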
252 static int create_tis(struct mlx5_vdpa_net *ndev)
253 {
254 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
255 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
256 	void *tisc;
257 	int err;
258 
259 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
260 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
261 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
262 	if (err)
263 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
264 
265 	return err;
266 }
267 
268 static void destroy_tis(struct mlx5_vdpa_net *ndev)
269 {
270 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
271 }
272 
273 #define MLX5_VDPA_CQE_SIZE 64
274 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
275 
276 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
277 {
278 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
279 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
280 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
281 	int err;
282 
283 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
284 				       ndev->mvdev.mdev->priv.numa_node);
285 	if (err)
286 		return err;
287 
288 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
289 
290 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
291 	buf->nent = nent;
292 
293 	return 0;
294 }
295 
296 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
297 {
298 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
299 
300 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
301 					ndev->mvdev.mdev->priv.numa_node);
302 }
303 
304 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
305 {
306 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
307 }
308 
309 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
310 {
311 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
312 }
313 
314 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
315 {
316 	struct mlx5_cqe64 *cqe64;
317 	void *cqe;
318 	int i;
319 
320 	for (i = 0; i < buf->nent; i++) {
321 		cqe = get_cqe(vcq, i);
322 		cqe64 = cqe;
323 		cqe64->op_own = MLX5_CQE_INVALID << 4;
324 	}
325 }
326 
327 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
328 {
329 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
330 
331 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
332 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
333 		return cqe64;
334 
335 	return NULL;
336 }
337 
338 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
339 {
340 	vqp->head += n;
341 	vqp->db.db[0] = cpu_to_be32(vqp->head);
342 }
343 
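/* Fill the QPC for one of the two QPs backing a virtqueue. The firmware QP
 * only needs a zero-length RQ and no SQ; the rest is chosen by firmware.
 * The driver QP is an RC QP with no SQ, an RQ sized to the virtqueue and
 * completions reported to the virtqueue's CQ.
 */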
344 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
345 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
346 {
347 	struct mlx5_vdpa_qp *vqp;
348 	__be64 *pas;
349 	void *qpc;
350 
351 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
352 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
353 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
354 	if (vqp->fw) {
355 		/* The firmware QP is allocated by the driver for the firmware's
356 		 * use, so we can skip part of the params as they will be chosen by firmware.
357 		 */
358 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
359 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
360 		MLX5_SET(qpc, qpc, no_sq, 1);
361 		return;
362 	}
363 
364 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
365 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
366 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
367 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
368 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
369 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
370 	MLX5_SET(qpc, qpc, no_sq, 1);
371 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
372 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
373 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
374 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
375 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
376 }
377 
378 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
379 {
380 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
381 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
382 					ndev->mvdev.mdev->priv.numa_node);
383 }
384 
385 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
386 {
387 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
388 }
389 
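/* Create one of the two QPs of a virtqueue. For the driver QP the RQ buffer
 * and doorbell are allocated here and, once the QP exists, the full ring of
 * receive credits is posted with rx_post(); the firmware QP needs neither.
 */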
390 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
391 		     struct mlx5_vdpa_qp *vqp)
392 {
393 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
394 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
395 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
396 	void *qpc;
397 	void *in;
398 	int err;
399 
400 	if (!vqp->fw) {
401 		vqp = &mvq->vqqp;
402 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
403 		if (err)
404 			return err;
405 
406 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
407 		if (err)
408 			goto err_db;
409 		inlen += vqp->frag_buf.npages * sizeof(__be64);
410 	}
411 
412 	in = kzalloc(inlen, GFP_KERNEL);
413 	if (!in) {
414 		err = -ENOMEM;
415 		goto err_kzalloc;
416 	}
417 
418 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
419 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
420 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
421 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
422 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
423 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
424 	if (!vqp->fw)
425 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
426 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
427 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
428 	kfree(in);
429 	if (err)
430 		goto err_kzalloc;
431 
432 	vqp->mqp.uid = ndev->mvdev.res.uid;
433 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
434 
435 	if (!vqp->fw)
436 		rx_post(vqp, mvq->num_ent);
437 
438 	return 0;
439 
440 err_kzalloc:
441 	if (!vqp->fw)
442 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
443 err_db:
444 	if (!vqp->fw)
445 		rq_buf_free(ndev, vqp);
446 
447 	return err;
448 }
449 
450 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
451 {
452 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
453 
454 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
455 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
456 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
457 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
458 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
459 	if (!vqp->fw) {
460 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
461 		rq_buf_free(ndev, vqp);
462 	}
463 }
464 
465 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
466 {
467 	return get_sw_cqe(cq, cq->mcq.cons_index);
468 }
469 
470 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
471 {
472 	struct mlx5_cqe64 *cqe64;
473 
474 	cqe64 = next_cqe_sw(vcq);
475 	if (!cqe64)
476 		return -EAGAIN;
477 
478 	vcq->mcq.cons_index++;
479 	return 0;
480 }
481 
482 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
483 {
484 	mlx5_cq_set_ci(&mvq->cq.mcq);
485 
486 	/* make sure the CQ consumer update is visible to the hardware before
487 	 * updating the RX doorbell record.
488 	 */
489 	dma_wmb();
490 	rx_post(&mvq->vqqp, num);
491 	if (mvq->event_cb.callback)
492 		mvq->event_cb.callback(mvq->event_cb.private);
493 }
494 
495 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
496 {
497 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
498 	struct mlx5_vdpa_net *ndev = mvq->ndev;
499 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
500 	int num = 0;
501 
502 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
503 		num++;
504 		if (num > mvq->num_ent / 2) {
505 			/* If completions keep coming while we poll, we want to
506 			 * let the hardware know that we consumed them by
507 			 * updating the doorbell record.  We also let vdpa core
508 			 * know about this so it passes it on to the virtio
509 			 * driver in the guest.
510 			 */
511 			mlx5_vdpa_handle_completions(mvq, num);
512 			num = 0;
513 		}
514 	}
515 
516 	if (num)
517 		mlx5_vdpa_handle_completions(mvq, num);
518 
519 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
520 }
521 
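/* Create the completion queue used for notifications of virtqueue idx. The
 * CQ buffer is initialized with invalid CQEs, the CQ is created on EQ
 * vector 0 and armed before returning; completions are handled by
 * mlx5_vdpa_cq_comp().
 */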
522 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
523 {
524 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
525 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
526 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
527 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
528 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
529 	unsigned int irqn;
530 	__be64 *pas;
531 	int inlen;
532 	void *cqc;
533 	void *in;
534 	int err;
535 	int eqn;
536 
537 	err = mlx5_db_alloc(mdev, &vcq->db);
538 	if (err)
539 		return err;
540 
541 	vcq->mcq.set_ci_db = vcq->db.db;
542 	vcq->mcq.arm_db = vcq->db.db + 1;
543 	vcq->mcq.cqe_sz = 64;
544 
545 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
546 	if (err)
547 		goto err_db;
548 
549 	cq_frag_buf_init(vcq, &vcq->buf);
550 
551 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
552 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
553 	in = kzalloc(inlen, GFP_KERNEL);
554 	if (!in) {
555 		err = -ENOMEM;
556 		goto err_vzalloc;
557 	}
558 
559 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
560 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
561 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
562 
563 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
564 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
565 
566 	/* Use vector 0 by default. Consider adding code to choose least used
567 	 * vector.
568 	 */
569 	err = mlx5_vector2eqn(mdev, 0, &eqn, &irqn);
570 	if (err)
571 		goto err_vec;
572 
573 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
574 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
575 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
576 	MLX5_SET(cqc, cqc, c_eqn, eqn);
577 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
578 
579 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
580 	if (err)
581 		goto err_vec;
582 
583 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
584 	vcq->cqe = num_ent;
585 	vcq->mcq.set_ci_db = vcq->db.db;
586 	vcq->mcq.arm_db = vcq->db.db + 1;
587 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
588 	kfree(in);
589 	return 0;
590 
591 err_vec:
592 	kfree(in);
593 err_vzalloc:
594 	cq_frag_buf_free(ndev, &vcq->buf);
595 err_db:
596 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
597 	return err;
598 }
599 
600 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
601 {
602 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
603 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
604 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
605 
606 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
607 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
608 		return;
609 	}
610 	cq_frag_buf_free(ndev, &vcq->buf);
611 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
612 }
613 
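/* The device reports a pair of parameters (a, b) per umem in its
 * capabilities; the umem must be sized a * queue_size + b bytes.
 */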
614 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
615 			  struct mlx5_vdpa_umem **umemp)
616 {
617 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
618 	int p_a;
619 	int p_b;
620 
621 	switch (num) {
622 	case 1:
623 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
624 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
625 		*umemp = &mvq->umem1;
626 		break;
627 	case 2:
628 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
629 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
630 		*umemp = &mvq->umem2;
631 		break;
632 	case 3:
633 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
634 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
635 		*umemp = &mvq->umem3;
636 		break;
637 	}
638 	(*umemp)->size = p_a * mvq->num_ent + p_b;
639 }
640 
641 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
642 {
643 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
644 }
645 
646 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
647 {
648 	int inlen;
649 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
650 	void *um;
651 	void *in;
652 	int err;
653 	__be64 *pas;
654 	struct mlx5_vdpa_umem *umem;
655 
656 	set_umem_size(ndev, mvq, num, &umem);
657 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
658 	if (err)
659 		return err;
660 
661 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
662 
663 	in = kzalloc(inlen, GFP_KERNEL);
664 	if (!in) {
665 		err = -ENOMEM;
666 		goto err_in;
667 	}
668 
669 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
670 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
671 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
672 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
673 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
674 
675 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
676 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
677 
678 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
679 	if (err) {
680 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
681 		goto err_cmd;
682 	}
683 
684 	kfree(in);
685 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
686 
687 	return 0;
688 
689 err_cmd:
690 	kfree(in);
691 err_in:
692 	umem_frag_buf_free(ndev, umem);
693 	return err;
694 }
695 
696 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
697 {
698 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
699 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
700 	struct mlx5_vdpa_umem *umem;
701 
702 	switch (num) {
703 	case 1:
704 		umem = &mvq->umem1;
705 		break;
706 	case 2:
707 		umem = &mvq->umem2;
708 		break;
709 	case 3:
710 		umem = &mvq->umem3;
711 		break;
712 	}
713 
714 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
715 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
716 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
717 		return;
718 
719 	umem_frag_buf_free(ndev, umem);
720 }
721 
722 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
723 {
724 	int num;
725 	int err;
726 
727 	for (num = 1; num <= 3; num++) {
728 		err = create_umem(ndev, mvq, num);
729 		if (err)
730 			goto err_umem;
731 	}
732 	return 0;
733 
734 err_umem:
735 	for (num--; num > 0; num--)
736 		umem_destroy(ndev, mvq, num);
737 
738 	return err;
739 }
740 
741 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
742 {
743 	int num;
744 
745 	for (num = 3; num > 0; num--)
746 		umem_destroy(ndev, mvq, num);
747 }
748 
749 static int get_queue_type(struct mlx5_vdpa_net *ndev)
750 {
751 	u32 type_mask;
752 
753 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
754 
755 	/* prefer split queue */
756 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
757 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
758 
759 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
760 
761 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
762 }
763 
764 static bool vq_is_tx(u16 idx)
765 {
766 	return idx % 2;
767 }
768 
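/* Pack the checksum/TSO related virtio feature bits into the layout
 * expected by the queue_feature_bit_mask_12_3 field of the virtio_net_q
 * object (see create_virtqueue()).
 */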
769 static u16 get_features_12_3(u64 features)
770 {
771 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
772 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
773 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
774 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
775 }
776 
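/* Create the VIRTIO_NET_Q general object for this virtqueue: the three
 * umems, the hardware available/used indices, the ring addresses, the mkey
 * used for DMA and the firmware QP used as the event channel. TX queues are
 * also bound to the TIS.
 */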
777 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
778 {
779 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
780 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
781 	void *obj_context;
782 	void *cmd_hdr;
783 	void *vq_ctx;
784 	void *in;
785 	int err;
786 
787 	err = umems_create(ndev, mvq);
788 	if (err)
789 		return err;
790 
791 	in = kzalloc(inlen, GFP_KERNEL);
792 	if (!in) {
793 		err = -ENOMEM;
794 		goto err_alloc;
795 	}
796 
797 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
798 
799 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
800 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
801 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
802 
803 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
804 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
805 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
806 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
807 		 get_features_12_3(ndev->mvdev.actual_features));
808 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
809 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
810 
811 	if (vq_is_tx(mvq->index))
812 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
813 
814 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
815 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
816 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
817 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
818 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
819 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
820 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
821 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
822 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
823 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key);
824 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
825 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
826 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
827 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
828 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
829 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
830 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
831 	if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type))
832 		MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1);
833 
834 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
835 	if (err)
836 		goto err_cmd;
837 
838 	kfree(in);
839 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
840 
841 	return 0;
842 
843 err_cmd:
844 	kfree(in);
845 err_alloc:
846 	umems_destroy(ndev, mvq);
847 	return err;
848 }
849 
850 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
851 {
852 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
853 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
854 
855 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
856 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
857 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
858 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
859 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
860 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
861 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
862 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
863 		return;
864 	}
865 	umems_destroy(ndev, mvq);
866 }
867 
868 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
869 {
870 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
871 }
872 
873 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
874 {
875 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
876 }
877 
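/* Build the command mailboxes for the requested QP state-transition
 * command. On any failure both *in and *out are set to NULL so that
 * modify_qp() can detect the error and return -ENOMEM.
 */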
878 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
879 			int *outlen, u32 qpn, u32 rqpn)
880 {
881 	void *qpc;
882 	void *pp;
883 
884 	switch (cmd) {
885 	case MLX5_CMD_OP_2RST_QP:
886 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
887 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
888 		*in = kzalloc(*inlen, GFP_KERNEL);
889 		*out = kzalloc(*outlen, GFP_KERNEL);
890 		if (!*in || !*out)
891 			goto outerr;
892 
893 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
894 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
895 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
896 		break;
897 	case MLX5_CMD_OP_RST2INIT_QP:
898 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
899 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
900 		*in = kzalloc(*inlen, GFP_KERNEL);
901 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
902 		if (!*in || !*out)
903 			goto outerr;
904 
905 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
906 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
907 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
908 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
909 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
910 		MLX5_SET(qpc, qpc, rwe, 1);
911 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
912 		MLX5_SET(ads, pp, vhca_port_num, 1);
913 		break;
914 	case MLX5_CMD_OP_INIT2RTR_QP:
915 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
916 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
917 		*in = kzalloc(*inlen, GFP_KERNEL);
918 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
919 		if (!*in || !*out)
920 			goto outerr;
921 
922 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
923 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
924 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
925 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
926 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
927 		MLX5_SET(qpc, qpc, log_msg_max, 30);
928 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
929 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
930 		MLX5_SET(ads, pp, fl, 1);
931 		break;
932 	case MLX5_CMD_OP_RTR2RTS_QP:
933 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
934 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
935 		*in = kzalloc(*inlen, GFP_KERNEL);
936 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
937 		if (!*in || !*out)
938 			goto outerr;
939 
940 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
941 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
942 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
943 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
944 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
945 		MLX5_SET(ads, pp, ack_timeout, 14);
946 		MLX5_SET(qpc, qpc, retry_count, 7);
947 		MLX5_SET(qpc, qpc, rnr_retry, 7);
948 		break;
949 	default:
950 		goto outerr_nullify;
951 	}
952 
953 	return;
954 
955 outerr:
956 	kfree(*in);
957 	kfree(*out);
958 outerr_nullify:
959 	*in = NULL;
960 	*out = NULL;
961 }
962 
963 static void free_inout(void *in, void *out)
964 {
965 	kfree(in);
966 	kfree(out);
967 }
968 
969 /* Two QPs are used by each virtqueue. One is used by the driver and one by
970  * firmware. The fw argument indicates whether the subjected QP is the one used
971  * by firmware.
972  */
973 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
974 {
975 	int outlen;
976 	int inlen;
977 	void *out;
978 	void *in;
979 	int err;
980 
981 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
982 	if (!in || !out)
983 		return -ENOMEM;
984 
985 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
986 	free_inout(in, out);
987 	return err;
988 }
989 
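/* Move both QPs of the virtqueue through RESET -> INIT -> RTR and then move
 * the firmware QP to RTS. The firmware QP sends the notifications while the
 * driver QP only receives them, so the driver QP is left in RTR.
 */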
990 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
991 {
992 	int err;
993 
994 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
995 	if (err)
996 		return err;
997 
998 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
999 	if (err)
1000 		return err;
1001 
1002 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1003 	if (err)
1004 		return err;
1005 
1006 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1007 	if (err)
1008 		return err;
1009 
1010 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1011 	if (err)
1012 		return err;
1013 
1014 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1015 	if (err)
1016 		return err;
1017 
1018 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1019 }
1020 
1021 struct mlx5_virtq_attr {
1022 	u8 state;
1023 	u16 available_index;
1024 	u16 used_index;
1025 };
1026 
1027 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1028 			   struct mlx5_virtq_attr *attr)
1029 {
1030 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1031 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1032 	void *out;
1033 	void *obj_context;
1034 	void *cmd_hdr;
1035 	int err;
1036 
1037 	out = kzalloc(outlen, GFP_KERNEL);
1038 	if (!out)
1039 		return -ENOMEM;
1040 
1041 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1042 
1043 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1044 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1045 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1046 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1047 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1048 	if (err)
1049 		goto err_cmd;
1050 
1051 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1052 	memset(attr, 0, sizeof(*attr));
1053 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1054 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1055 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1056 	kfree(out);
1057 	return 0;
1058 
1059 err_cmd:
1060 	kfree(out);
1061 	return err;
1062 }
1063 
1064 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1065 {
1066 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1067 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1068 	void *obj_context;
1069 	void *cmd_hdr;
1070 	void *in;
1071 	int err;
1072 
1073 	in = kzalloc(inlen, GFP_KERNEL);
1074 	if (!in)
1075 		return -ENOMEM;
1076 
1077 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1078 
1079 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1080 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1081 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1082 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1083 
1084 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1085 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1086 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1087 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1088 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1089 	kfree(in);
1090 	if (!err)
1091 		mvq->fw_state = state;
1092 
1093 	return err;
1094 }
1095 
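/* Create all per-virtqueue resources: the CQ, the firmware and driver QPs,
 * the RC connection between them and the virtio_net_q object itself. If the
 * queue was already marked ready it is moved to the RDY state.
 */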
1096 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1097 {
1098 	u16 idx = mvq->index;
1099 	int err;
1100 
1101 	if (!mvq->num_ent)
1102 		return 0;
1103 
1104 	if (mvq->initialized) {
1105 		mlx5_vdpa_warn(&ndev->mvdev, "attempt re init\n");
1106 		return -EINVAL;
1107 	}
1108 
1109 	err = cq_create(ndev, idx, mvq->num_ent);
1110 	if (err)
1111 		return err;
1112 
1113 	err = qp_create(ndev, mvq, &mvq->fwqp);
1114 	if (err)
1115 		goto err_fwqp;
1116 
1117 	err = qp_create(ndev, mvq, &mvq->vqqp);
1118 	if (err)
1119 		goto err_vqqp;
1120 
1121 	err = connect_qps(ndev, mvq);
1122 	if (err)
1123 		goto err_connect;
1124 
1125 	err = create_virtqueue(ndev, mvq);
1126 	if (err)
1127 		goto err_connect;
1128 
1129 	if (mvq->ready) {
1130 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1131 		if (err) {
1132 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1133 				       idx, err);
1134 			goto err_connect;
1135 		}
1136 	}
1137 
1138 	mvq->initialized = true;
1139 	return 0;
1140 
1141 err_connect:
1142 	qp_destroy(ndev, &mvq->vqqp);
1143 err_vqqp:
1144 	qp_destroy(ndev, &mvq->fwqp);
1145 err_fwqp:
1146 	cq_destroy(ndev, idx);
1147 	return err;
1148 }
1149 
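/* Move a ready virtqueue to the SUSPEND state and cache the hardware
 * available/used indices so they survive destruction of the virtq object
 * (see mlx5_vdpa_get_vq_state() and save_channel_info()).
 */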
1150 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1151 {
1152 	struct mlx5_virtq_attr attr;
1153 
1154 	if (!mvq->initialized)
1155 		return;
1156 
1157 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1158 		return;
1159 
1160 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1161 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1162 
1163 	if (query_virtqueue(ndev, mvq, &attr)) {
1164 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1165 		return;
1166 	}
1167 	mvq->avail_idx = attr.available_index;
1168 	mvq->used_idx = attr.used_index;
1169 }
1170 
1171 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1172 {
1173 	int i;
1174 
1175 	for (i = 0; i < MLX5_MAX_SUPPORTED_VQS; i++)
1176 		suspend_vq(ndev, &ndev->vqs[i]);
1177 }
1178 
1179 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1180 {
1181 	if (!mvq->initialized)
1182 		return;
1183 
1184 	suspend_vq(ndev, mvq);
1185 	destroy_virtqueue(ndev, mvq);
1186 	qp_destroy(ndev, &mvq->vqqp);
1187 	qp_destroy(ndev, &mvq->fwqp);
1188 	cq_destroy(ndev, mvq->index);
1189 	mvq->initialized = false;
1190 }
1191 
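/* Create the RQ table listing the RX (even-indexed) virtqueue objects; the
 * TIR created by create_tir() points its indirection table at it.
 */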
1192 static int create_rqt(struct mlx5_vdpa_net *ndev)
1193 {
1194 	int log_max_rqt;
1195 	__be32 *list;
1196 	void *rqtc;
1197 	int inlen;
1198 	void *in;
1199 	int i, j;
1200 	int err;
1201 
1202 	log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1203 	if (log_max_rqt < 1)
1204 		return -EOPNOTSUPP;
1205 
1206 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num);
1207 	in = kzalloc(inlen, GFP_KERNEL);
1208 	if (!in)
1209 		return -ENOMEM;
1210 
1211 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1212 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1213 
1214 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1215 	MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt);
1216 	MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
1217 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1218 	for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) {
1219 		if (!ndev->vqs[j].initialized)
1220 			continue;
1221 
1222 		if (!vq_is_tx(ndev->vqs[j].index)) {
1223 			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
1224 			i++;
1225 		}
1226 	}
1227 
1228 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1229 	kfree(in);
1230 	if (err)
1231 		return err;
1232 
1233 	return 0;
1234 }
1235 
1236 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1237 {
1238 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1239 }
1240 
1241 static int create_tir(struct mlx5_vdpa_net *ndev)
1242 {
1243 #define HASH_IP_L4PORTS                                                                            \
1244 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1245 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1246 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1247 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1248 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1249 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1250 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1251 	void *rss_key;
1252 	void *outer;
1253 	void *tirc;
1254 	void *in;
1255 	int err;
1256 
1257 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1258 	if (!in)
1259 		return -ENOMEM;
1260 
1261 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1262 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1263 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1264 
1265 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1266 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1267 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1268 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1269 
1270 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1271 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1272 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1273 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1274 
1275 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1276 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1277 
1278 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1279 	kfree(in);
1280 	return err;
1281 }
1282 
1283 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1284 {
1285 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1286 }
1287 
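/* Install a single catch-all rule in the bypass flow namespace that
 * forwards every received packet to the TIR and counts it with a flow
 * counter.
 */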
1288 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1289 {
1290 	struct mlx5_flow_destination dest[2] = {};
1291 	struct mlx5_flow_table_attr ft_attr = {};
1292 	struct mlx5_flow_act flow_act = {};
1293 	struct mlx5_flow_namespace *ns;
1294 	int err;
1295 
1296 	/* for now, one entry, match all, forward to tir */
1297 	ft_attr.max_fte = 1;
1298 	ft_attr.autogroup.max_num_groups = 1;
1299 
1300 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1301 	if (!ns) {
1302 		mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n");
1303 		return -EOPNOTSUPP;
1304 	}
1305 
1306 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1307 	if (IS_ERR(ndev->rxft))
1308 		return PTR_ERR(ndev->rxft);
1309 
1310 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1311 	if (IS_ERR(ndev->rx_counter)) {
1312 		err = PTR_ERR(ndev->rx_counter);
1313 		goto err_fc;
1314 	}
1315 
1316 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1317 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1318 	dest[0].tir_num = ndev->res.tirn;
1319 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1320 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1321 	ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2);
1322 	if (IS_ERR(ndev->rx_rule)) {
1323 		err = PTR_ERR(ndev->rx_rule);
1324 		ndev->rx_rule = NULL;
1325 		goto err_rule;
1326 	}
1327 
1328 	return 0;
1329 
1330 err_rule:
1331 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1332 err_fc:
1333 	mlx5_destroy_flow_table(ndev->rxft);
1334 	return err;
1335 }
1336 
1337 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1338 {
1339 	if (!ndev->rx_rule)
1340 		return;
1341 
1342 	mlx5_del_flow_rules(ndev->rx_rule);
1343 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1344 	mlx5_destroy_flow_table(ndev->rxft);
1345 
1346 	ndev->rx_rule = NULL;
1347 }
1348 
1349 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1350 {
1351 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1352 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1353 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1354 
1355 	if (unlikely(!mvq->ready))
1356 		return;
1357 
1358 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1359 }
1360 
1361 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1362 				    u64 driver_area, u64 device_area)
1363 {
1364 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1365 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1366 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1367 
1368 	mvq->desc_addr = desc_area;
1369 	mvq->device_addr = device_area;
1370 	mvq->driver_addr = driver_area;
1371 	return 0;
1372 }
1373 
1374 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1375 {
1376 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1377 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1378 	struct mlx5_vdpa_virtqueue *mvq;
1379 
1380 	mvq = &ndev->vqs[idx];
1381 	mvq->num_ent = num;
1382 }
1383 
1384 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1385 {
1386 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1387 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1388 	struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx];
1389 
1390 	vq->event_cb = *cb;
1391 }
1392 
1393 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1394 {
1395 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1396 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1397 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1398 
1399 	if (!ready)
1400 		suspend_vq(ndev, mvq);
1401 
1402 	mvq->ready = ready;
1403 }
1404 
1405 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1406 {
1407 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1408 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1409 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1410 
1411 	return mvq->ready;
1412 }
1413 
1414 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1415 				  const struct vdpa_vq_state *state)
1416 {
1417 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1418 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1419 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1420 
1421 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1422 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1423 		return -EINVAL;
1424 	}
1425 
1426 	mvq->used_idx = state->split.avail_index;
1427 	mvq->avail_idx = state->split.avail_index;
1428 	return 0;
1429 }
1430 
1431 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1432 {
1433 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1434 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1435 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1436 	struct mlx5_virtq_attr attr;
1437 	int err;
1438 
1439 	/* If the virtq object was destroyed, use the value saved at
1440 	 * the last minute of suspend_vq. This caters for userspace
1441 	 * that cares about emulating the index after vq is stopped.
1442 	 */
1443 	if (!mvq->initialized) {
1444 		/* Firmware returns a wrong value for the available index.
1445 		 * Since both values should be identical, we take the value of
1446 		 * used_idx which is reported correctly.
1447 		 */
1448 		state->split.avail_index = mvq->used_idx;
1449 		return 0;
1450 	}
1451 
1452 	err = query_virtqueue(ndev, mvq, &attr);
1453 	if (err) {
1454 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1455 		return err;
1456 	}
1457 	state->split.avail_index = attr.used_index;
1458 	return 0;
1459 }
1460 
1461 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1462 {
1463 	return PAGE_SIZE;
1464 }
1465 
1466 enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
1467 	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
1468 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
1469 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
1470 };
1471 
1472 static u64 mlx_to_vritio_features(u16 dev_features)
1473 {
1474 	u64 result = 0;
1475 
1476 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1477 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1478 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1479 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1480 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1481 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1482 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1483 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1484 
1485 	return result;
1486 }
1487 
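/* Translate the device capability bits into virtio feature bits.
 * VIRTIO_F_VERSION_1 is advertised when the device supports it and
 * VIRTIO_F_ACCESS_PLATFORM is always set since the driver depends on it
 * (see verify_min_features()).
 */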
1488 static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
1489 {
1490 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1491 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1492 	u16 dev_features;
1493 
1494 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
1495 	ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features);
1496 	if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
1497 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1498 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1499 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1500 	return ndev->mvdev.mlx_features;
1501 }
1502 
1503 static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1504 {
1505 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1506 		return -EOPNOTSUPP;
1507 
1508 	return 0;
1509 }
1510 
1511 static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
1512 {
1513 	int err;
1514 	int i;
1515 
1516 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) {
1517 		err = setup_vq(ndev, &ndev->vqs[i]);
1518 		if (err)
1519 			goto err_vq;
1520 	}
1521 
1522 	return 0;
1523 
1524 err_vq:
1525 	for (--i; i >= 0; i--)
1526 		teardown_vq(ndev, &ndev->vqs[i]);
1527 
1528 	return err;
1529 }
1530 
1531 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
1532 {
1533 	struct mlx5_vdpa_virtqueue *mvq;
1534 	int i;
1535 
1536 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
1537 		mvq = &ndev->vqs[i];
1538 		if (!mvq->initialized)
1539 			continue;
1540 
1541 		teardown_vq(ndev, mvq);
1542 	}
1543 }
1544 
1545 /* TODO: cross-endian support */
1546 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
1547 {
1548 	return virtio_legacy_is_little_endian() ||
1549 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
1550 }
1551 
1552 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
1553 {
1554 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
1555 }
1556 
1557 static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
1558 {
1559 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1560 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1561 	int err;
1562 
1563 	print_features(mvdev, features, true);
1564 
1565 	err = verify_min_features(mvdev, features);
1566 	if (err)
1567 		return err;
1568 
1569 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
1570 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
1571 	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
1572 	return err;
1573 }
1574 
1575 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
1576 {
1577 	/* not implemented */
1578 	mlx5_vdpa_warn(to_mvdev(vdev), "set config callback not supported\n");
1579 }
1580 
1581 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
1582 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
1583 {
1584 	return MLX5_VDPA_MAX_VQ_ENTRIES;
1585 }
1586 
1587 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
1588 {
1589 	return VIRTIO_ID_NET;
1590 }
1591 
1592 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
1593 {
1594 	return PCI_VENDOR_ID_MELLANOX;
1595 }
1596 
1597 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
1598 {
1599 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1600 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1601 
1602 	print_status(mvdev, ndev->mvdev.status, false);
1603 	return ndev->mvdev.status;
1604 }
1605 
1606 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1607 {
1608 	struct mlx5_vq_restore_info *ri = &mvq->ri;
1609 	struct mlx5_virtq_attr attr;
1610 	int err;
1611 
1612 	if (!mvq->initialized)
1613 		return 0;
1614 
1615 	err = query_virtqueue(ndev, mvq, &attr);
1616 	if (err)
1617 		return err;
1618 
1619 	ri->avail_index = attr.available_index;
1620 	ri->used_index = attr.used_index;
1621 	ri->ready = mvq->ready;
1622 	ri->num_ent = mvq->num_ent;
1623 	ri->desc_addr = mvq->desc_addr;
1624 	ri->device_addr = mvq->device_addr;
1625 	ri->driver_addr = mvq->driver_addr;
1626 	ri->cb = mvq->event_cb;
1627 	ri->restore = true;
1628 	return 0;
1629 }
1630 
1631 static int save_channels_info(struct mlx5_vdpa_net *ndev)
1632 {
1633 	int i;
1634 
1635 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
1636 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
1637 		save_channel_info(ndev, &ndev->vqs[i]);
1638 	}
1639 	return 0;
1640 }
1641 
1642 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
1643 {
1644 	int i;
1645 
1646 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1647 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
1648 }
1649 
1650 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
1651 {
1652 	struct mlx5_vdpa_virtqueue *mvq;
1653 	struct mlx5_vq_restore_info *ri;
1654 	int i;
1655 
1656 	mlx5_clear_vqs(ndev);
1657 	init_mvqs(ndev);
1658 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
1659 		mvq = &ndev->vqs[i];
1660 		ri = &mvq->ri;
1661 		if (!ri->restore)
1662 			continue;
1663 
1664 		mvq->avail_idx = ri->avail_index;
1665 		mvq->used_idx = ri->used_index;
1666 		mvq->ready = ri->ready;
1667 		mvq->num_ent = ri->num_ent;
1668 		mvq->desc_addr = ri->desc_addr;
1669 		mvq->device_addr = ri->device_addr;
1670 		mvq->driver_addr = ri->driver_addr;
1671 		mvq->event_cb = ri->cb;
1672 	}
1673 }
1674 
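/* Handle a memory map change: suspend the virtqueues, save their state,
 * tear down the datapath and rebuild the MR for the new iotlb. If the
 * driver is already in DRIVER_OK, restore the saved state and re-create the
 * datapath.
 */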
1675 static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb)
1676 {
1677 	int err;
1678 
1679 	suspend_vqs(ndev);
1680 	err = save_channels_info(ndev);
1681 	if (err)
1682 		goto err_mr;
1683 
1684 	teardown_driver(ndev);
1685 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1686 	err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb);
1687 	if (err)
1688 		goto err_mr;
1689 
1690 	if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
1691 		return 0;
1692 
1693 	restore_channels_info(ndev);
1694 	err = setup_driver(ndev);
1695 	if (err)
1696 		goto err_setup;
1697 
1698 	return 0;
1699 
1700 err_setup:
1701 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1702 err_mr:
1703 	return err;
1704 }
1705 
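/* Create the full datapath under reslock: virtqueues, RQT, TIR and the
 * forwarding flow rule.
 */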
1706 static int setup_driver(struct mlx5_vdpa_net *ndev)
1707 {
1708 	int err;
1709 
1710 	mutex_lock(&ndev->reslock);
1711 	if (ndev->setup) {
1712 		mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n");
1713 		err = 0;
1714 		goto out;
1715 	}
1716 	err = setup_virtqueues(ndev);
1717 	if (err) {
1718 		mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n");
1719 		goto out;
1720 	}
1721 
1722 	err = create_rqt(ndev);
1723 	if (err) {
1724 		mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n");
1725 		goto err_rqt;
1726 	}
1727 
1728 	err = create_tir(ndev);
1729 	if (err) {
1730 		mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n");
1731 		goto err_tir;
1732 	}
1733 
1734 	err = add_fwd_to_tir(ndev);
1735 	if (err) {
1736 		mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n");
1737 		goto err_fwd;
1738 	}
1739 	ndev->setup = true;
1740 	mutex_unlock(&ndev->reslock);
1741 
1742 	return 0;
1743 
1744 err_fwd:
1745 	destroy_tir(ndev);
1746 err_tir:
1747 	destroy_rqt(ndev);
1748 err_rqt:
1749 	teardown_virtqueues(ndev);
1750 out:
1751 	mutex_unlock(&ndev->reslock);
1752 	return err;
1753 }
1754 
1755 static void teardown_driver(struct mlx5_vdpa_net *ndev)
1756 {
1757 	mutex_lock(&ndev->reslock);
1758 	if (!ndev->setup)
1759 		goto out;
1760 
1761 	remove_fwd_to_tir(ndev);
1762 	destroy_tir(ndev);
1763 	destroy_rqt(ndev);
1764 	teardown_virtqueues(ndev);
1765 	ndev->setup = false;
1766 out:
1767 	mutex_unlock(&ndev->reslock);
1768 }
1769 
1770 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
1771 {
1772 	int i;
1773 
1774 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1775 		ndev->vqs[i].ready = false;
1776 }
1777 
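/* vdpa set_status callback. Writing 0 resets the device: the driver objects
 * are torn down, all virtqueues are marked not ready, the memory key is
 * destroyed, the generation counter is bumped and, if the umem_uid_0
 * capability is set, a default memory key is recreated. Setting DRIVER_OK
 * sets up the driver; clearing DRIVER_OK without a reset is not supported.
 * If driver setup fails, the device is marked FAILED.
 */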
1778 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
1779 {
1780 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1781 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1782 	int err;
1783 
1784 	print_status(mvdev, status, true);
1785 	if (!status) {
1786 		mlx5_vdpa_info(mvdev, "performing device reset\n");
1787 		teardown_driver(ndev);
1788 		clear_vqs_ready(ndev);
1789 		mlx5_vdpa_destroy_mr(&ndev->mvdev);
1790 		ndev->mvdev.status = 0;
1791 		ndev->mvdev.mlx_features = 0;
1792 		++mvdev->generation;
1793 		if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
1794 			if (mlx5_vdpa_create_mr(mvdev, NULL))
1795 				mlx5_vdpa_warn(mvdev, "create MR failed\n");
1796 		}
1797 		return;
1798 	}
1799 
1800 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
1801 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
1802 			err = setup_driver(ndev);
1803 			if (err) {
1804 				mlx5_vdpa_warn(mvdev, "failed to set up driver\n");
1805 				goto err_setup;
1806 			}
1807 		} else {
1808 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
1809 			return;
1810 		}
1811 	}
1812 
1813 	ndev->mvdev.status = status;
1814 	return;
1815 
1816 err_setup:
1817 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1818 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
1819 }
1820 
1821 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
1822 {
1823 	return sizeof(struct virtio_net_config);
1824 }
1825 
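/* Copy from the cached virtio_net config space. offset and len are checked
 * separately against the config size so that their sum cannot wrap around.
 */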
1826 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
1827 				 unsigned int len)
1828 {
1829 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1830 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1831 
1832 	if (offset <= sizeof(struct virtio_net_config) && len <= sizeof(struct virtio_net_config) - offset)
1833 		memcpy(buf, (u8 *)&ndev->config + offset, len);
1834 }
1835 
1836 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
1837 				 unsigned int len)
1838 {
1839 	/* not supported */
1840 }
1841 
1842 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
1843 {
1844 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1845 
1846 	return mvdev->generation;
1847 }
1848 
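/* vdpa set_map callback. mlx5_vdpa_handle_set_map() creates or updates the
 * memory key for the new iotlb; if the mapping actually changed, the driver
 * objects are rebuilt via mlx5_vdpa_change_map().
 */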
1849 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
1850 {
1851 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1852 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1853 	bool change_map;
1854 	int err;
1855 
1856 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
1857 	if (err) {
1858 		mlx5_vdpa_warn(mvdev, "set map failed (%d)\n", err);
1859 		return err;
1860 	}
1861 
1862 	if (change_map)
1863 		return mlx5_vdpa_change_map(ndev, iotlb);
1864 
1865 	return 0;
1866 }
1867 
1868 static void mlx5_vdpa_free(struct vdpa_device *vdev)
1869 {
1870 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1871 	struct mlx5_core_dev *pfmdev;
1872 	struct mlx5_vdpa_net *ndev;
1873 
1874 	ndev = to_mlx5_vdpa_ndev(mvdev);
1875 
1876 	free_resources(ndev);
1877 	mlx5_vdpa_destroy_mr(mvdev);
1878 	if (!is_zero_ether_addr(ndev->config.mac)) {
1879 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1880 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
1881 	}
1882 	mlx5_vdpa_free_resources(&ndev->mvdev);
1883 	mutex_destroy(&ndev->reslock);
1884 }
1885 
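/* Report the physical address of the kick doorbell page so it can be mapped
 * for direct notification. An empty area is returned when direct notification
 * cannot be used safely.
 */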
1886 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
1887 {
1888 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1889 	struct vdpa_notification_area ret = {};
1890 	struct mlx5_vdpa_net *ndev;
1891 	phys_addr_t addr;
1892 
1893 	/* If the SF BAR size is smaller than PAGE_SIZE, do not use direct
1894 	 * notification, to avoid the risk of mapping pages that contain the BARs of
1895 	 * more than one SF. log_min_sf_size is in units of 4K pages, hence the +12.
1896 	 */
1897 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
1898 		return ret;
1899 
1900 	ndev = to_mlx5_vdpa_ndev(mvdev);
1901 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
1902 	ret.addr = addr;
1903 	ret.size = PAGE_SIZE;
1904 	return ret;
1905 }
1906 
1907 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
1908 {
1909 	return -EOPNOTSUPP;
1910 }
1911 
1912 static const struct vdpa_config_ops mlx5_vdpa_ops = {
1913 	.set_vq_address = mlx5_vdpa_set_vq_address,
1914 	.set_vq_num = mlx5_vdpa_set_vq_num,
1915 	.kick_vq = mlx5_vdpa_kick_vq,
1916 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
1917 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
1918 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
1919 	.set_vq_state = mlx5_vdpa_set_vq_state,
1920 	.get_vq_state = mlx5_vdpa_get_vq_state,
1921 	.get_vq_notification = mlx5_get_vq_notification,
1922 	.get_vq_irq = mlx5_get_vq_irq,
1923 	.get_vq_align = mlx5_vdpa_get_vq_align,
1924 	.get_features = mlx5_vdpa_get_features,
1925 	.set_features = mlx5_vdpa_set_features,
1926 	.set_config_cb = mlx5_vdpa_set_config_cb,
1927 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
1928 	.get_device_id = mlx5_vdpa_get_device_id,
1929 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
1930 	.get_status = mlx5_vdpa_get_status,
1931 	.set_status = mlx5_vdpa_set_status,
1932 	.get_config_size = mlx5_vdpa_get_config_size,
1933 	.get_config = mlx5_vdpa_get_config,
1934 	.set_config = mlx5_vdpa_set_config,
1935 	.get_generation = mlx5_vdpa_get_generation,
1936 	.set_map = mlx5_vdpa_set_map,
1937 	.free = mlx5_vdpa_free,
1938 };
1939 
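/* Query the NIC vport MTU and subtract MLX5V_ETH_HARD_MTU to obtain the
 * usable payload MTU.
 */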
1940 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
1941 {
1942 	u16 hw_mtu;
1943 	int err;
1944 
1945 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
1946 	if (err)
1947 		return err;
1948 
1949 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
1950 	return 0;
1951 }
1952 
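/* Allocate the transport domain and create the TIS, then mark the net
 * resources valid. Undone by free_resources().
 */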
1953 static int alloc_resources(struct mlx5_vdpa_net *ndev)
1954 {
1955 	struct mlx5_vdpa_net_resources *res = &ndev->res;
1956 	int err;
1957 
1958 	if (res->valid) {
1959 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
1960 		return -EEXIST;
1961 	}
1962 
1963 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
1964 	if (err)
1965 		return err;
1966 
1967 	err = create_tis(ndev);
1968 	if (err)
1969 		goto err_tis;
1970 
1971 	res->valid = true;
1972 
1973 	return 0;
1974 
1975 err_tis:
1976 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
1977 	return err;
1978 }
1979 
1980 static void free_resources(struct mlx5_vdpa_net *ndev)
1981 {
1982 	struct mlx5_vdpa_net_resources *res = &ndev->res;
1983 
1984 	if (!res->valid)
1985 		return;
1986 
1987 	destroy_tis(ndev);
1988 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
1989 	res->valid = false;
1990 }
1991 
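/* Initialize every virtqueue entry up to (but not including) its restore
 * info. The first 2 * mlx5_vdpa_max_qps() entries also flag their fwqp as the
 * firmware side of the QP pair.
 */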
1992 static void init_mvqs(struct mlx5_vdpa_net *ndev)
1993 {
1994 	struct mlx5_vdpa_virtqueue *mvq;
1995 	int i;
1996 
1997 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
1998 		mvq = &ndev->vqs[i];
1999 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2000 		mvq->index = i;
2001 		mvq->ndev = ndev;
2002 		mvq->fwqp.fw = true;
2003 	}
2004 	for (; i < ndev->mvdev.max_vqs; i++) {
2005 		mvq = &ndev->vqs[i];
2006 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2007 		mvq->index = i;
2008 		mvq->ndev = ndev;
2009 	}
2010 }
2011 
2012 struct mlx5_vdpa_mgmtdev {
2013 	struct vdpa_mgmt_dev mgtdev;
2014 	struct mlx5_adev *madev;
2015 	struct mlx5_vdpa_net *ndev;
2016 };
2017 
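/* Management device dev_add callback: allocate the vdpa net device, query the
 * MTU and MAC address, register the MAC with MPFS on the PF, allocate the
 * core and networking resources and finally register the vdpa device. Only a
 * single device per management device is supported.
 */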
2018 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
2019 {
2020 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2021 	struct virtio_net_config *config;
2022 	struct mlx5_core_dev *pfmdev;
2023 	struct mlx5_vdpa_dev *mvdev;
2024 	struct mlx5_vdpa_net *ndev;
2025 	struct mlx5_core_dev *mdev;
2026 	u32 max_vqs;
2027 	int err;
2028 
2029 	if (mgtdev->ndev)
2030 		return -ENOSPC;
2031 
2032 	mdev = mgtdev->madev->mdev;
2033 	/* we reserve one virtqueue for the control virtqueue, should we require it */
2034 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
2035 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
2036 
2037 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2038 				 name);
2039 	if (IS_ERR(ndev))
2040 		return PTR_ERR(ndev);
2041 
2042 	ndev->mvdev.max_vqs = max_vqs;
2043 	mvdev = &ndev->mvdev;
2044 	mvdev->mdev = mdev;
2045 	init_mvqs(ndev);
2046 	mutex_init(&ndev->reslock);
2047 	config = &ndev->config;
2048 	err = query_mtu(mdev, &ndev->mtu);
2049 	if (err)
2050 		goto err_mtu;
2051 
2052 	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2053 	if (err)
2054 		goto err_mtu;
2055 
2056 	if (!is_zero_ether_addr(config->mac)) {
2057 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2058 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2059 		if (err)
2060 			goto err_mtu;
2061 	}
2062 
2063 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
2064 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2065 	if (err)
2066 		goto err_mpfs;
2067 
2068 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2069 		err = mlx5_vdpa_create_mr(mvdev, NULL);
2070 		if (err)
2071 			goto err_res;
2072 	}
2073 
2074 	err = alloc_resources(ndev);
2075 	if (err)
2076 		goto err_mr;
2077 
2078 	mvdev->vdev.mdev = &mgtdev->mgtdev;
2079 	err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
2080 	if (err)
2081 		goto err_reg;
2082 
2083 	mgtdev->ndev = ndev;
2084 	return 0;
2085 
2086 err_reg:
2087 	free_resources(ndev);
2088 err_mr:
2089 	mlx5_vdpa_destroy_mr(mvdev);
2090 err_res:
2091 	mlx5_vdpa_free_resources(&ndev->mvdev);
2092 err_mpfs:
2093 	if (!is_zero_ether_addr(config->mac))
2094 		mlx5_mpfs_del_mac(pfmdev, config->mac);
2095 err_mtu:
2096 	mutex_destroy(&ndev->reslock);
2097 	put_device(&mvdev->vdev.dev);
2098 	return err;
2099 }
2100 
2101 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
2102 {
2103 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2104 
2105 	_vdpa_unregister_device(dev);
2106 	mgtdev->ndev = NULL;
2107 }
2108 
2109 static const struct vdpa_mgmtdev_ops mdev_ops = {
2110 	.dev_add = mlx5_vdpa_dev_add,
2111 	.dev_del = mlx5_vdpa_dev_del,
2112 };
2113 
2114 static struct virtio_device_id id_table[] = {
2115 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2116 	{ 0 },
2117 };
2118 
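/* Auxiliary bus probe: allocate the management device, register it with the
 * vdpa core and stash it in the auxiliary device's drvdata.
 */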
2119 static int mlx5v_probe(struct auxiliary_device *adev,
2120 		       const struct auxiliary_device_id *id)
2122 {
2123 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
2124 	struct mlx5_core_dev *mdev = madev->mdev;
2125 	struct mlx5_vdpa_mgmtdev *mgtdev;
2126 	int err;
2127 
2128 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
2129 	if (!mgtdev)
2130 		return -ENOMEM;
2131 
2132 	mgtdev->mgtdev.ops = &mdev_ops;
2133 	mgtdev->mgtdev.device = mdev->device;
2134 	mgtdev->mgtdev.id_table = id_table;
2135 	mgtdev->madev = madev;
2136 
2137 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
2138 	if (err)
2139 		goto reg_err;
2140 
2141 	dev_set_drvdata(&adev->dev, mgtdev);
2142 
2143 	return 0;
2144 
2145 reg_err:
2146 	kfree(mgtdev);
2147 	return err;
2148 }
2149 
2150 static void mlx5v_remove(struct auxiliary_device *adev)
2151 {
2152 	struct mlx5_vdpa_mgmtdev *mgtdev;
2153 
2154 	mgtdev = dev_get_drvdata(&adev->dev);
2155 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
2156 	kfree(mgtdev);
2157 }
2158 
2159 static const struct auxiliary_device_id mlx5v_id_table[] = {
2160 	{ .name = MLX5_ADEV_NAME ".vnet", },
2161 	{},
2162 };
2163 
2164 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2165 
2166 static struct auxiliary_driver mlx5v_driver = {
2167 	.name = "vnet",
2168 	.probe = mlx5v_probe,
2169 	.remove = mlx5v_remove,
2170 	.id_table = mlx5v_id_table,
2171 };
2172 
2173 module_auxiliary_driver(mlx5v_driver);
2174