xref: /openbmc/linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision 0eb76ba2)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <linux/virtio_config.h>
10 #include <linux/auxiliary_bus.h>
11 #include <linux/mlx5/cq.h>
12 #include <linux/mlx5/qp.h>
13 #include <linux/mlx5/device.h>
14 #include <linux/mlx5/driver.h>
15 #include <linux/mlx5/vport.h>
16 #include <linux/mlx5/fs.h>
17 #include <linux/mlx5/mlx5_ifc_vdpa.h>
18 #include "mlx5_vdpa.h"
19 
/* Module author, description and license declarations. */
20 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
21 MODULE_DESCRIPTION("Mellanox VDPA driver");
22 MODULE_LICENSE("Dual BSD/GPL");
23 
/* to_mlx5_vdpa_ndev(): recover the enclosing struct mlx5_vdpa_net from a
 * pointer to its embedded 'mvdev' member.
 * to_mvdev(): recover the enclosing struct mlx5_vdpa_dev from a pointer to its
 * embedded 'vdev' member.
 */
24 #define to_mlx5_vdpa_ndev(__mvdev)                                             \
25 	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
26 #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
27 
/* Every feature bit that print_features() recognises. A feature word with any
 * bit outside this mask triggers the "invalid feature bits" warning there.
 * Each VIRTIO_*_F_* constant is a bit number, hence the BIT_ULL() wrappers.
 */
28 #define VALID_FEATURES_MASK                                                                        \
29 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
30 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
31 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
32 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
33 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
34 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
35 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
36 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
37 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
38 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
39 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
40 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
41 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
42 
/* Every device status bit that print_status() recognises. A status byte with
 * any bit outside this mask triggers the "invalid status bits" warning there.
 * Unlike the feature constants, these are already bit masks.
 */
43 #define VALID_STATUS_MASK                                                                          \
44 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
45 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
46 
/* Identifiers of the device-level networking objects held by the net device. */
47 struct mlx5_vdpa_net_resources {
48 	u32 tisn;	/* TIS number: filled in by create_tis(), passed to destroy_tis() */
49 	u32 tdn;	/* transport domain number written into the TIS context */
50 	u32 tirn;	/* presumably the TIR number; not used in this part of the file */
51 	u32 rqtn;	/* presumably the RQ table number; not used in this part of the file */
52 	bool valid;	/* NOTE(review): looks like an "allocated" flag; set elsewhere - confirm */
53 };
54 
/* Backing memory of a completion queue, set up by cq_frag_buf_alloc(). */
55 struct mlx5_vdpa_cq_buf {
56 	struct mlx5_frag_buf_ctrl fbc;	/* initialised by mlx5_init_fbc(); used by get_cqe() to locate entries */
57 	struct mlx5_frag_buf frag_buf;	/* fragmented buffer holding the CQEs */
58 	int cqe_size;	/* set to MLX5_VDPA_CQE_SIZE */
59 	int nent;	/* number of entries requested at allocation time */
60 };
61 
/* Completion queue of a virtqueue's notification channel; see cq_create(). */
62 struct mlx5_vdpa_cq {
63 	struct mlx5_core_cq mcq;	/* core CQ object created by mlx5_core_create_cq() */
64 	struct mlx5_vdpa_cq_buf buf;	/* CQE storage */
65 	struct mlx5_db db;	/* doorbell record: db[0] is set_ci_db, db[1] is arm_db */
66 	int cqe;	/* number of CQ entries; get_sw_cqe() uses it as index mask and ownership bit */
67 };
68 
/* One of the three device-owned memory regions required per virtqueue; see
 * create_umem() and umem_destroy().
 */
69 struct mlx5_vdpa_umem {
70 	struct mlx5_frag_buf_ctrl fbc;	/* not referenced in this part of the file */
71 	struct mlx5_frag_buf frag_buf;	/* memory handed to the device */
72 	int size;	/* size in bytes as computed by umem_size() */
73 	u32 id;	/* umem id returned by the CREATE_UMEM command */
74 };
75 
/* One end of the QP connection used as a virtqueue notification channel. */
76 struct mlx5_vdpa_qp {
77 	struct mlx5_core_qp mqp;	/* holds the qpn and uid filled in by qp_create() */
78 	struct mlx5_frag_buf frag_buf;	/* receive queue buffer; allocated only when !fw */
79 	struct mlx5_db db;	/* doorbell record; allocated only when !fw */
80 	u16 head;	/* running count of posted receive entries, see rx_post() */
81 	bool fw;	/* true for the QP that is handed to firmware */
82 };
83 
/* Snapshot of virtqueue parameters. The fields mirror those of
 * struct mlx5_vdpa_virtqueue.
 * NOTE(review): presumably used to re-create a virtqueue after its resources
 * were torn down; the users are not in this part of the file - confirm.
 */
84 struct mlx5_vq_restore_info {
85 	u32 num_ent;
86 	u64 desc_addr;
87 	u64 device_addr;
88 	u64 driver_addr;
89 	u16 avail_index;
90 	bool ready;
91 	struct vdpa_callback cb;
92 	bool restore;
93 };
94 
/* Per-virtqueue state: the ring parameters supplied for the queue plus the
 * hardware objects (CQ, two QPs, three umems, VIRTIO_NET_Q object) that
 * implement it.
 */
95 struct mlx5_vdpa_virtqueue {
96 	bool ready;
97 	u64 desc_addr;	/* programmed as desc_addr of the hardware queue */
98 	u64 device_addr;	/* programmed as used_addr of the hardware queue */
99 	u64 driver_addr;	/* programmed as available_addr of the hardware queue */
100 	u32 num_ent;	/* queue size; also sizes the notification QP and the umems */
101 	struct vdpa_callback event_cb;	/* invoked from mlx5_vdpa_handle_completions() */
102 
103 	/* Resources for implementing the notification channel from the device
104 	 * to the driver. fwqp is the firmware end of an RC connection; the
105 	 * other end is vqqp used by the driver. cq is where completions are
106 	 * reported.
107 	 */
108 	struct mlx5_vdpa_cq cq;
109 	struct mlx5_vdpa_qp fwqp;
110 	struct mlx5_vdpa_qp vqqp;
111 
112 	/* umem resources are required for the virtqueue operation. Their use
113 	 * is internal and they must be provided by the driver.
114 	 */
115 	struct mlx5_vdpa_umem umem1;
116 	struct mlx5_vdpa_umem umem2;
117 	struct mlx5_vdpa_umem umem3;
118 
119 	bool initialized;
120 	int index;	/* queue index; odd indices are TX queues, see vq_is_tx() */
121 	u32 virtq_id;	/* object id returned when the VIRTIO_NET_Q object is created */
122 	struct mlx5_vdpa_net *ndev;	/* owning net device, used by the CQ completion handler */
123 	u16 avail_idx;	/* programmed as hw_available_index at creation */
124 	int fw_state;
125 
126 	/* keep last in the struct */
127 	struct mlx5_vq_restore_info ri;
128 };
129 
/* Upper bound on the number of virtqueues; it sizes the vqs[] array embedded
 * in struct mlx5_vdpa_net.
 */
130 /* We will remove this limitation once mlx5_vdpa_alloc_resources()
131  * provides for driver space allocation
132  */
133 #define MLX5_MAX_SUPPORTED_VQS 16
134 
/* Net-device level state. The generic mlx5 vdpa device is embedded as 'mvdev'
 * so that to_mlx5_vdpa_ndev() can map back from it to this structure.
 */
135 struct mlx5_vdpa_net {
136 	struct mlx5_vdpa_dev mvdev;
137 	struct mlx5_vdpa_net_resources res;
138 	struct virtio_net_config config;
139 	struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
140 
141 	/* Serialize vq resources creation and destruction. This is required
142 	 * since the memory map might change and we need to destroy and create
143 	 * resources while the driver is operational.
144 	 */
145 	struct mutex reslock;
146 	struct mlx5_flow_table *rxft;
147 	struct mlx5_fc *rx_counter;
148 	struct mlx5_flow_handle *rx_rule;
149 	bool setup;
150 	u16 mtu;
151 };
152 
/* Forward declarations; the definitions are not in this part of the file. */
153 static void free_resources(struct mlx5_vdpa_net *ndev);
154 static void init_mvqs(struct mlx5_vdpa_net *ndev);
155 static int setup_driver(struct mlx5_vdpa_net *ndev);
156 static void teardown_driver(struct mlx5_vdpa_net *ndev);
157 
/* When false, print_status() and print_features() stop after the invalid-bit
 * check and skip the verbose per-bit dump.
 */
158 static bool mlx5_vdpa_debug;
159 
/* Log the name of feature bit number @_feature if it is set. Expands to code
 * that uses the variables 'features' and 'mvdev' of the calling function.
 */
160 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
161 	do {                                                                                       \
162 		if (features & BIT_ULL(_feature))                                                  \
163 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
164 	} while (0)
165 
/* Log the name of status mask @_status if any of its bits is set. Expands to
 * code that uses the variables 'status' and 'mvdev' of the calling function.
 */
166 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
167 	do {                                                                                       \
168 		if (status & (_status))                                                            \
169 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
170 	} while (0)
171 
172 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
173 {
174 	return max_vqs / 2;
175 }
176 
177 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
178 {
179 	if (status & ~VALID_STATUS_MASK)
180 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
181 			       status & ~VALID_STATUS_MASK);
182 
183 	if (!mlx5_vdpa_debug)
184 		return;
185 
186 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
187 	if (set && !status) {
188 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
189 		return;
190 	}
191 
192 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
193 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
194 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
195 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
196 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
197 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
198 }
199 
200 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
201 {
202 	if (features & ~VALID_FEATURES_MASK)
203 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
204 			       features & ~VALID_FEATURES_MASK);
205 
206 	if (!mlx5_vdpa_debug)
207 		return;
208 
209 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
210 	if (!features)
211 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
212 
213 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
214 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
215 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
216 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
217 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
218 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
219 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
220 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
221 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
222 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
223 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
224 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
225 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
226 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
227 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
228 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
229 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
230 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
231 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
232 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
233 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
234 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
235 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
236 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
237 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
238 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
239 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
240 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
241 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
242 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
243 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
244 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
245 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
247 }
248 
249 static int create_tis(struct mlx5_vdpa_net *ndev)
250 {
251 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
252 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
253 	void *tisc;
254 	int err;
255 
256 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
257 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
258 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
259 	if (err)
260 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
261 
262 	return err;
263 }
264 
265 static void destroy_tis(struct mlx5_vdpa_net *ndev)
266 {
267 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
268 }
269 
/* Size in bytes of one completion queue entry, and its base-2 logarithm. */
270 #define MLX5_VDPA_CQE_SIZE 64
271 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
272 
273 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
274 {
275 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
276 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
277 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
278 	int err;
279 
280 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
281 				       ndev->mvdev.mdev->priv.numa_node);
282 	if (err)
283 		return err;
284 
285 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
286 
287 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
288 	buf->nent = nent;
289 
290 	return 0;
291 }
292 
293 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
294 {
295 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
296 
297 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
298 					ndev->mvdev.mdev->priv.numa_node);
299 }
300 
301 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
302 {
303 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
304 }
305 
306 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
307 {
308 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
309 }
310 
311 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
312 {
313 	struct mlx5_cqe64 *cqe64;
314 	void *cqe;
315 	int i;
316 
317 	for (i = 0; i < buf->nent; i++) {
318 		cqe = get_cqe(vcq, i);
319 		cqe64 = cqe;
320 		cqe64->op_own = MLX5_CQE_INVALID << 4;
321 	}
322 }
323 
324 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
325 {
326 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
327 
328 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
329 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
330 		return cqe64;
331 
332 	return NULL;
333 }
334 
335 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
336 {
337 	vqp->head += n;
338 	vqp->db.db[0] = cpu_to_be32(vqp->head);
339 }
340 
341 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
342 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
343 {
344 	struct mlx5_vdpa_qp *vqp;
345 	__be64 *pas;
346 	void *qpc;
347 
348 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
349 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
350 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
351 	if (vqp->fw) {
352 		/* Firmware QP is allocated by the driver for the firmware's
353 		 * use so we can skip part of the params as they will be chosen by firmware
354 		 */
355 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
356 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
357 		MLX5_SET(qpc, qpc, no_sq, 1);
358 		return;
359 	}
360 
361 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
362 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
363 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
364 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
365 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
366 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
367 	MLX5_SET(qpc, qpc, no_sq, 1);
368 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
369 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
370 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
371 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
372 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
373 }
374 
375 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
376 {
377 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
378 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
379 					ndev->mvdev.mdev->priv.numa_node);
380 }
381 
382 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
383 {
384 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
385 }
386 
387 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
388 		     struct mlx5_vdpa_qp *vqp)
389 {
390 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
391 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
392 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
393 	void *qpc;
394 	void *in;
395 	int err;
396 
397 	if (!vqp->fw) {
398 		vqp = &mvq->vqqp;
399 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
400 		if (err)
401 			return err;
402 
403 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
404 		if (err)
405 			goto err_db;
406 		inlen += vqp->frag_buf.npages * sizeof(__be64);
407 	}
408 
409 	in = kzalloc(inlen, GFP_KERNEL);
410 	if (!in) {
411 		err = -ENOMEM;
412 		goto err_kzalloc;
413 	}
414 
415 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
416 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
417 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
418 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
419 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
420 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
421 	if (!vqp->fw)
422 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
423 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
424 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
425 	kfree(in);
426 	if (err)
427 		goto err_kzalloc;
428 
429 	vqp->mqp.uid = ndev->mvdev.res.uid;
430 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
431 
432 	if (!vqp->fw)
433 		rx_post(vqp, mvq->num_ent);
434 
435 	return 0;
436 
437 err_kzalloc:
438 	if (!vqp->fw)
439 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
440 err_db:
441 	if (!vqp->fw)
442 		rq_buf_free(ndev, vqp);
443 
444 	return err;
445 }
446 
447 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
448 {
449 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
450 
451 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
452 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
453 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
454 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
455 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
456 	if (!vqp->fw) {
457 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
458 		rq_buf_free(ndev, vqp);
459 	}
460 }
461 
462 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
463 {
464 	return get_sw_cqe(cq, cq->mcq.cons_index);
465 }
466 
467 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
468 {
469 	struct mlx5_cqe64 *cqe64;
470 
471 	cqe64 = next_cqe_sw(vcq);
472 	if (!cqe64)
473 		return -EAGAIN;
474 
475 	vcq->mcq.cons_index++;
476 	return 0;
477 }
478 
479 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
480 {
481 	mlx5_cq_set_ci(&mvq->cq.mcq);
482 
483 	/* make sure CQ cosumer update is visible to the hardware before updating
484 	 * RX doorbell record.
485 	 */
486 	dma_wmb();
487 	rx_post(&mvq->vqqp, num);
488 	if (mvq->event_cb.callback)
489 		mvq->event_cb.callback(mvq->event_cb.private);
490 }
491 
492 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
493 {
494 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
495 	struct mlx5_vdpa_net *ndev = mvq->ndev;
496 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
497 	int num = 0;
498 
499 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
500 		num++;
501 		if (num > mvq->num_ent / 2) {
502 			/* If completions keep coming while we poll, we want to
503 			 * let the hardware know that we consumed them by
504 			 * updating the doorbell record.  We also let vdpa core
505 			 * know about this so it passes it on the virtio driver
506 			 * on the guest.
507 			 */
508 			mlx5_vdpa_handle_completions(mvq, num);
509 			num = 0;
510 		}
511 	}
512 
513 	if (num)
514 		mlx5_vdpa_handle_completions(mvq, num);
515 
516 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
517 }
518 
519 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
520 {
521 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
522 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
523 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
524 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
525 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
526 	unsigned int irqn;
527 	__be64 *pas;
528 	int inlen;
529 	void *cqc;
530 	void *in;
531 	int err;
532 	int eqn;
533 
534 	err = mlx5_db_alloc(mdev, &vcq->db);
535 	if (err)
536 		return err;
537 
538 	vcq->mcq.set_ci_db = vcq->db.db;
539 	vcq->mcq.arm_db = vcq->db.db + 1;
540 	vcq->mcq.cqe_sz = 64;
541 
542 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
543 	if (err)
544 		goto err_db;
545 
546 	cq_frag_buf_init(vcq, &vcq->buf);
547 
548 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
549 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
550 	in = kzalloc(inlen, GFP_KERNEL);
551 	if (!in) {
552 		err = -ENOMEM;
553 		goto err_vzalloc;
554 	}
555 
556 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
557 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
558 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
559 
560 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
561 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
562 
563 	/* Use vector 0 by default. Consider adding code to choose least used
564 	 * vector.
565 	 */
566 	err = mlx5_vector2eqn(mdev, 0, &eqn, &irqn);
567 	if (err)
568 		goto err_vec;
569 
570 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
571 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
572 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
573 	MLX5_SET(cqc, cqc, c_eqn, eqn);
574 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
575 
576 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
577 	if (err)
578 		goto err_vec;
579 
580 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
581 	vcq->cqe = num_ent;
582 	vcq->mcq.set_ci_db = vcq->db.db;
583 	vcq->mcq.arm_db = vcq->db.db + 1;
584 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
585 	kfree(in);
586 	return 0;
587 
588 err_vec:
589 	kfree(in);
590 err_vzalloc:
591 	cq_frag_buf_free(ndev, &vcq->buf);
592 err_db:
593 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
594 	return err;
595 }
596 
597 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
598 {
599 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
600 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
601 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
602 
603 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
604 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
605 		return;
606 	}
607 	cq_frag_buf_free(ndev, &vcq->buf);
608 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
609 }
610 
611 static int umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
612 		     struct mlx5_vdpa_umem **umemp)
613 {
614 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
615 	int p_a;
616 	int p_b;
617 
618 	switch (num) {
619 	case 1:
620 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
621 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
622 		*umemp = &mvq->umem1;
623 		break;
624 	case 2:
625 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
626 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
627 		*umemp = &mvq->umem2;
628 		break;
629 	case 3:
630 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
631 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
632 		*umemp = &mvq->umem3;
633 		break;
634 	}
635 	return p_a * mvq->num_ent + p_b;
636 }
637 
638 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
639 {
640 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
641 }
642 
643 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
644 {
645 	int inlen;
646 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
647 	void *um;
648 	void *in;
649 	int err;
650 	__be64 *pas;
651 	int size;
652 	struct mlx5_vdpa_umem *umem;
653 
654 	size = umem_size(ndev, mvq, num, &umem);
655 	if (size < 0)
656 		return size;
657 
658 	umem->size = size;
659 	err = umem_frag_buf_alloc(ndev, umem, size);
660 	if (err)
661 		return err;
662 
663 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
664 
665 	in = kzalloc(inlen, GFP_KERNEL);
666 	if (!in) {
667 		err = -ENOMEM;
668 		goto err_in;
669 	}
670 
671 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
672 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
673 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
674 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
675 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
676 
677 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
678 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
679 
680 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
681 	if (err) {
682 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
683 		goto err_cmd;
684 	}
685 
686 	kfree(in);
687 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
688 
689 	return 0;
690 
691 err_cmd:
692 	kfree(in);
693 err_in:
694 	umem_frag_buf_free(ndev, umem);
695 	return err;
696 }
697 
698 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
699 {
700 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
701 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
702 	struct mlx5_vdpa_umem *umem;
703 
704 	switch (num) {
705 	case 1:
706 		umem = &mvq->umem1;
707 		break;
708 	case 2:
709 		umem = &mvq->umem2;
710 		break;
711 	case 3:
712 		umem = &mvq->umem3;
713 		break;
714 	}
715 
716 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
717 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
718 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
719 		return;
720 
721 	umem_frag_buf_free(ndev, umem);
722 }
723 
/* Create umems 1, 2 and 3 of @mvq, in that order.
 *
 * Return: 0 on success. On the first failure the umems already created are
 * destroyed in reverse order and the error is returned.
 */
static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int created;
	int err = 0;

	for (created = 0; created < 3; created++) {
		err = create_umem(ndev, mvq, created + 1);
		if (err)
			break;
	}

	if (!err)
		return 0;

	/* unwind: 'created' is the number of the last umem that succeeded */
	while (created > 0) {
		umem_destroy(ndev, mvq, created);
		created--;
	}

	return err;
}
742 
/* Destroy umems 3, 2 and 1 of @mvq, in that order. */
static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int num = 3;

	while (num > 0) {
		umem_destroy(ndev, mvq, num);
		num--;
	}
}
750 
751 static int get_queue_type(struct mlx5_vdpa_net *ndev)
752 {
753 	u32 type_mask;
754 
755 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
756 
757 	/* prefer split queue */
758 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED)
759 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
760 
761 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT));
762 
763 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
764 }
765 
766 static bool vq_is_tx(u16 idx)
767 {
768 	return idx % 2;
769 }
770 
771 static u16 get_features_12_3(u64 features)
772 {
773 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
774 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
775 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
776 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
777 }
778 
779 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
780 {
781 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
782 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
783 	void *obj_context;
784 	void *cmd_hdr;
785 	void *vq_ctx;
786 	void *in;
787 	int err;
788 
789 	err = umems_create(ndev, mvq);
790 	if (err)
791 		return err;
792 
793 	in = kzalloc(inlen, GFP_KERNEL);
794 	if (!in) {
795 		err = -ENOMEM;
796 		goto err_alloc;
797 	}
798 
799 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
800 
801 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
802 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
803 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
804 
805 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
806 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
807 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
808 		 get_features_12_3(ndev->mvdev.actual_features));
809 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
810 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
811 
812 	if (vq_is_tx(mvq->index))
813 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
814 
815 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
816 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
817 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
818 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
819 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
820 		 !!(ndev->mvdev.actual_features & VIRTIO_F_VERSION_1));
821 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
822 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
823 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
824 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey.key);
825 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
826 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
827 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
828 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem1.size);
829 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
830 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem1.size);
831 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
832 	if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type))
833 		MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1);
834 
835 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
836 	if (err)
837 		goto err_cmd;
838 
839 	kfree(in);
840 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
841 
842 	return 0;
843 
844 err_cmd:
845 	kfree(in);
846 err_alloc:
847 	umems_destroy(ndev, mvq);
848 	return err;
849 }
850 
851 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
852 {
853 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
854 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
855 
856 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
857 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
858 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
859 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
860 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
861 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
862 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
863 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
864 		return;
865 	}
866 	umems_destroy(ndev, mvq);
867 }
868 
869 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
870 {
871 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
872 }
873 
874 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
875 {
876 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
877 }
878 
879 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
880 			int *outlen, u32 qpn, u32 rqpn)
881 {
882 	void *qpc;
883 	void *pp;
884 
885 	switch (cmd) {
886 	case MLX5_CMD_OP_2RST_QP:
887 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
888 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
889 		*in = kzalloc(*inlen, GFP_KERNEL);
890 		*out = kzalloc(*outlen, GFP_KERNEL);
891 		if (!*in || !*out)
892 			goto outerr;
893 
894 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
895 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
896 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
897 		break;
898 	case MLX5_CMD_OP_RST2INIT_QP:
899 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
900 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
901 		*in = kzalloc(*inlen, GFP_KERNEL);
902 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
903 		if (!*in || !*out)
904 			goto outerr;
905 
906 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
907 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
908 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
909 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
910 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
911 		MLX5_SET(qpc, qpc, rwe, 1);
912 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
913 		MLX5_SET(ads, pp, vhca_port_num, 1);
914 		break;
915 	case MLX5_CMD_OP_INIT2RTR_QP:
916 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
917 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
918 		*in = kzalloc(*inlen, GFP_KERNEL);
919 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
920 		if (!*in || !*out)
921 			goto outerr;
922 
923 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
924 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
925 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
926 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
927 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
928 		MLX5_SET(qpc, qpc, log_msg_max, 30);
929 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
930 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
931 		MLX5_SET(ads, pp, fl, 1);
932 		break;
933 	case MLX5_CMD_OP_RTR2RTS_QP:
934 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
935 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
936 		*in = kzalloc(*inlen, GFP_KERNEL);
937 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
938 		if (!*in || !*out)
939 			goto outerr;
940 
941 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
942 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
943 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
944 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
945 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
946 		MLX5_SET(ads, pp, ack_timeout, 14);
947 		MLX5_SET(qpc, qpc, retry_count, 7);
948 		MLX5_SET(qpc, qpc, rnr_retry, 7);
949 		break;
950 	default:
951 		goto outerr_nullify;
952 	}
953 
954 	return;
955 
956 outerr:
957 	kfree(*in);
958 	kfree(*out);
959 outerr_nullify:
960 	*in = NULL;
961 	*out = NULL;
962 }
963 
/* Release a pair of command buffers. kfree() accepts NULL, so either
 * pointer may be NULL.
 */
static void free_inout(void *in, void *out)
{
	kfree(out);
	kfree(in);
}
969 
970 /* Two QPs are used by each virtqueue. One is used by the driver and one by
971  * firmware. The fw argument indicates whether the subjected QP is the one used
972  * by firmware.
973  */
974 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
975 {
976 	int outlen;
977 	int inlen;
978 	void *out;
979 	void *in;
980 	int err;
981 
982 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
983 	if (!in || !out)
984 		return -ENOMEM;
985 
986 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
987 	free_inout(in, out);
988 	return err;
989 }
990 
991 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
992 {
993 	int err;
994 
995 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
996 	if (err)
997 		return err;
998 
999 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1000 	if (err)
1001 		return err;
1002 
1003 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1004 	if (err)
1005 		return err;
1006 
1007 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1008 	if (err)
1009 		return err;
1010 
1011 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1012 	if (err)
1013 		return err;
1014 
1015 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1016 	if (err)
1017 		return err;
1018 
1019 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1020 }
1021 
1022 struct mlx5_virtq_attr {
1023 	u8 state;
1024 	u16 available_index;
1025 };
1026 
1027 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1028 			   struct mlx5_virtq_attr *attr)
1029 {
1030 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1031 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1032 	void *out;
1033 	void *obj_context;
1034 	void *cmd_hdr;
1035 	int err;
1036 
1037 	out = kzalloc(outlen, GFP_KERNEL);
1038 	if (!out)
1039 		return -ENOMEM;
1040 
1041 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1042 
1043 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1044 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1045 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1046 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1047 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1048 	if (err)
1049 		goto err_cmd;
1050 
1051 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1052 	memset(attr, 0, sizeof(*attr));
1053 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1054 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1055 	kfree(out);
1056 	return 0;
1057 
1058 err_cmd:
1059 	kfree(out);
1060 	return err;
1061 }
1062 
1063 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1064 {
1065 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1066 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1067 	void *obj_context;
1068 	void *cmd_hdr;
1069 	void *in;
1070 	int err;
1071 
1072 	in = kzalloc(inlen, GFP_KERNEL);
1073 	if (!in)
1074 		return -ENOMEM;
1075 
1076 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1077 
1078 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1079 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1080 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1081 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1082 
1083 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1084 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1085 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1086 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1087 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1088 	kfree(in);
1089 	if (!err)
1090 		mvq->fw_state = state;
1091 
1092 	return err;
1093 }
1094 
1095 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1096 {
1097 	u16 idx = mvq->index;
1098 	int err;
1099 
1100 	if (!mvq->num_ent)
1101 		return 0;
1102 
1103 	if (mvq->initialized) {
1104 		mlx5_vdpa_warn(&ndev->mvdev, "attempt re init\n");
1105 		return -EINVAL;
1106 	}
1107 
1108 	err = cq_create(ndev, idx, mvq->num_ent);
1109 	if (err)
1110 		return err;
1111 
1112 	err = qp_create(ndev, mvq, &mvq->fwqp);
1113 	if (err)
1114 		goto err_fwqp;
1115 
1116 	err = qp_create(ndev, mvq, &mvq->vqqp);
1117 	if (err)
1118 		goto err_vqqp;
1119 
1120 	err = connect_qps(ndev, mvq);
1121 	if (err)
1122 		goto err_connect;
1123 
1124 	err = create_virtqueue(ndev, mvq);
1125 	if (err)
1126 		goto err_connect;
1127 
1128 	if (mvq->ready) {
1129 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1130 		if (err) {
1131 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1132 				       idx, err);
1133 			goto err_connect;
1134 		}
1135 	}
1136 
1137 	mvq->initialized = true;
1138 	return 0;
1139 
1140 err_connect:
1141 	qp_destroy(ndev, &mvq->vqqp);
1142 err_vqqp:
1143 	qp_destroy(ndev, &mvq->fwqp);
1144 err_fwqp:
1145 	cq_destroy(ndev, idx);
1146 	return err;
1147 }
1148 
1149 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1150 {
1151 	struct mlx5_virtq_attr attr;
1152 
1153 	if (!mvq->initialized)
1154 		return;
1155 
1156 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1157 		return;
1158 
1159 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1160 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1161 
1162 	if (query_virtqueue(ndev, mvq, &attr)) {
1163 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1164 		return;
1165 	}
1166 	mvq->avail_idx = attr.available_index;
1167 }
1168 
1169 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1170 {
1171 	int i;
1172 
1173 	for (i = 0; i < MLX5_MAX_SUPPORTED_VQS; i++)
1174 		suspend_vq(ndev, &ndev->vqs[i]);
1175 }
1176 
1177 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1178 {
1179 	if (!mvq->initialized)
1180 		return;
1181 
1182 	suspend_vq(ndev, mvq);
1183 	destroy_virtqueue(ndev, mvq);
1184 	qp_destroy(ndev, &mvq->vqqp);
1185 	qp_destroy(ndev, &mvq->fwqp);
1186 	cq_destroy(ndev, mvq->index);
1187 	mvq->initialized = false;
1188 }
1189 
1190 static int create_rqt(struct mlx5_vdpa_net *ndev)
1191 {
1192 	int log_max_rqt;
1193 	__be32 *list;
1194 	void *rqtc;
1195 	int inlen;
1196 	void *in;
1197 	int i, j;
1198 	int err;
1199 
1200 	log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
1201 	if (log_max_rqt < 1)
1202 		return -EOPNOTSUPP;
1203 
1204 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num);
1205 	in = kzalloc(inlen, GFP_KERNEL);
1206 	if (!in)
1207 		return -ENOMEM;
1208 
1209 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1210 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1211 
1212 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1213 	MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt);
1214 	MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
1215 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1216 	for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) {
1217 		if (!ndev->vqs[j].initialized)
1218 			continue;
1219 
1220 		if (!vq_is_tx(ndev->vqs[j].index)) {
1221 			list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
1222 			i++;
1223 		}
1224 	}
1225 
1226 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1227 	kfree(in);
1228 	if (err)
1229 		return err;
1230 
1231 	return 0;
1232 }
1233 
1234 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1235 {
1236 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1237 }
1238 
1239 static int create_tir(struct mlx5_vdpa_net *ndev)
1240 {
1241 #define HASH_IP_L4PORTS                                                                            \
1242 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1243 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1244 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1245 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1246 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1247 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1248 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1249 	void *rss_key;
1250 	void *outer;
1251 	void *tirc;
1252 	void *in;
1253 	int err;
1254 
1255 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1256 	if (!in)
1257 		return -ENOMEM;
1258 
1259 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1260 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1261 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1262 
1263 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1264 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1265 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1266 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1267 
1268 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1269 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1270 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1271 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1272 
1273 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1274 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1275 
1276 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1277 	kfree(in);
1278 	return err;
1279 }
1280 
1281 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1282 {
1283 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1284 }
1285 
1286 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1287 {
1288 	struct mlx5_flow_destination dest[2] = {};
1289 	struct mlx5_flow_table_attr ft_attr = {};
1290 	struct mlx5_flow_act flow_act = {};
1291 	struct mlx5_flow_namespace *ns;
1292 	int err;
1293 
1294 	/* for now, one entry, match all, forward to tir */
1295 	ft_attr.max_fte = 1;
1296 	ft_attr.autogroup.max_num_groups = 1;
1297 
1298 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1299 	if (!ns) {
1300 		mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n");
1301 		return -EOPNOTSUPP;
1302 	}
1303 
1304 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1305 	if (IS_ERR(ndev->rxft))
1306 		return PTR_ERR(ndev->rxft);
1307 
1308 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1309 	if (IS_ERR(ndev->rx_counter)) {
1310 		err = PTR_ERR(ndev->rx_counter);
1311 		goto err_fc;
1312 	}
1313 
1314 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1315 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1316 	dest[0].tir_num = ndev->res.tirn;
1317 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1318 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1319 	ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2);
1320 	if (IS_ERR(ndev->rx_rule)) {
1321 		err = PTR_ERR(ndev->rx_rule);
1322 		ndev->rx_rule = NULL;
1323 		goto err_rule;
1324 	}
1325 
1326 	return 0;
1327 
1328 err_rule:
1329 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1330 err_fc:
1331 	mlx5_destroy_flow_table(ndev->rxft);
1332 	return err;
1333 }
1334 
1335 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1336 {
1337 	if (!ndev->rx_rule)
1338 		return;
1339 
1340 	mlx5_del_flow_rules(ndev->rx_rule);
1341 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1342 	mlx5_destroy_flow_table(ndev->rxft);
1343 
1344 	ndev->rx_rule = NULL;
1345 }
1346 
1347 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1348 {
1349 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1350 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1351 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1352 
1353 	if (unlikely(!mvq->ready))
1354 		return;
1355 
1356 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1357 }
1358 
1359 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1360 				    u64 driver_area, u64 device_area)
1361 {
1362 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1363 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1364 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1365 
1366 	mvq->desc_addr = desc_area;
1367 	mvq->device_addr = device_area;
1368 	mvq->driver_addr = driver_area;
1369 	return 0;
1370 }
1371 
1372 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1373 {
1374 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1375 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1376 	struct mlx5_vdpa_virtqueue *mvq;
1377 
1378 	mvq = &ndev->vqs[idx];
1379 	mvq->num_ent = num;
1380 }
1381 
1382 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1383 {
1384 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1385 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1386 	struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx];
1387 
1388 	vq->event_cb = *cb;
1389 }
1390 
1391 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1392 {
1393 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1394 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1395 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1396 
1397 	if (!ready)
1398 		suspend_vq(ndev, mvq);
1399 
1400 	mvq->ready = ready;
1401 }
1402 
1403 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1404 {
1405 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1406 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1407 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1408 
1409 	return mvq->ready;
1410 }
1411 
1412 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1413 				  const struct vdpa_vq_state *state)
1414 {
1415 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1416 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1417 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1418 
1419 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1420 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1421 		return -EINVAL;
1422 	}
1423 
1424 	mvq->avail_idx = state->avail_index;
1425 	return 0;
1426 }
1427 
1428 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1429 {
1430 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1431 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1432 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
1433 	struct mlx5_virtq_attr attr;
1434 	int err;
1435 
1436 	/* If the virtq object was destroyed, use the value saved at
1437 	 * the last minute of suspend_vq. This caters for userspace
1438 	 * that cares about emulating the index after vq is stopped.
1439 	 */
1440 	if (!mvq->initialized) {
1441 		state->avail_index = mvq->avail_idx;
1442 		return 0;
1443 	}
1444 
1445 	err = query_virtqueue(ndev, mvq, &attr);
1446 	if (err) {
1447 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1448 		return err;
1449 	}
1450 	state->avail_index = attr.available_index;
1451 	return 0;
1452 }
1453 
1454 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1455 {
1456 	return PAGE_SIZE;
1457 }
1458 
/* Device-side feature bits that mlx_to_vritio_features() translates into
 * virtio feature bits.
 */
enum {
	MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
};
1464 
1465 static u64 mlx_to_vritio_features(u16 dev_features)
1466 {
1467 	u64 result = 0;
1468 
1469 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1470 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1471 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1472 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1473 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1474 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1475 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1476 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1477 
1478 	return result;
1479 }
1480 
1481 static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
1482 {
1483 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1484 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1485 	u16 dev_features;
1486 
1487 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
1488 	ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features);
1489 	if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
1490 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1491 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1492 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1493 	return ndev->mvdev.mlx_features;
1494 }
1495 
1496 static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1497 {
1498 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1499 		return -EOPNOTSUPP;
1500 
1501 	return 0;
1502 }
1503 
1504 static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
1505 {
1506 	int err;
1507 	int i;
1508 
1509 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) {
1510 		err = setup_vq(ndev, &ndev->vqs[i]);
1511 		if (err)
1512 			goto err_vq;
1513 	}
1514 
1515 	return 0;
1516 
1517 err_vq:
1518 	for (--i; i >= 0; i--)
1519 		teardown_vq(ndev, &ndev->vqs[i]);
1520 
1521 	return err;
1522 }
1523 
1524 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
1525 {
1526 	struct mlx5_vdpa_virtqueue *mvq;
1527 	int i;
1528 
1529 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
1530 		mvq = &ndev->vqs[i];
1531 		if (!mvq->initialized)
1532 			continue;
1533 
1534 		teardown_vq(ndev, mvq);
1535 	}
1536 }
1537 
1538 /* TODO: cross-endian support */
1539 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
1540 {
1541 	return virtio_legacy_is_little_endian() ||
1542 		(mvdev->actual_features & (1ULL << VIRTIO_F_VERSION_1));
1543 }
1544 
1545 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
1546 {
1547 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
1548 }
1549 
1550 static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
1551 {
1552 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1553 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1554 	int err;
1555 
1556 	print_features(mvdev, features, true);
1557 
1558 	err = verify_min_features(mvdev, features);
1559 	if (err)
1560 		return err;
1561 
1562 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
1563 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
1564 	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
1565 	return err;
1566 }
1567 
/* vdpa set_config_cb op: not implemented, the callback is ignored and a
 * warning is logged.
 */
static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
{
	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);

	mlx5_vdpa_warn(mvdev, "set config callback not supported\n");
}
1573 
1574 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
1575 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
1576 {
1577 	return MLX5_VDPA_MAX_VQ_ENTRIES;
1578 }
1579 
1580 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
1581 {
1582 	return VIRTIO_ID_NET;
1583 }
1584 
1585 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
1586 {
1587 	return PCI_VENDOR_ID_MELLANOX;
1588 }
1589 
1590 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
1591 {
1592 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1593 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1594 
1595 	print_status(mvdev, ndev->mvdev.status, false);
1596 	return ndev->mvdev.status;
1597 }
1598 
1599 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1600 {
1601 	struct mlx5_vq_restore_info *ri = &mvq->ri;
1602 	struct mlx5_virtq_attr attr;
1603 	int err;
1604 
1605 	if (!mvq->initialized)
1606 		return 0;
1607 
1608 	err = query_virtqueue(ndev, mvq, &attr);
1609 	if (err)
1610 		return err;
1611 
1612 	ri->avail_index = attr.available_index;
1613 	ri->ready = mvq->ready;
1614 	ri->num_ent = mvq->num_ent;
1615 	ri->desc_addr = mvq->desc_addr;
1616 	ri->device_addr = mvq->device_addr;
1617 	ri->driver_addr = mvq->driver_addr;
1618 	ri->cb = mvq->event_cb;
1619 	ri->restore = true;
1620 	return 0;
1621 }
1622 
1623 static int save_channels_info(struct mlx5_vdpa_net *ndev)
1624 {
1625 	int i;
1626 
1627 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
1628 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
1629 		save_channel_info(ndev, &ndev->vqs[i]);
1630 	}
1631 	return 0;
1632 }
1633 
1634 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
1635 {
1636 	int i;
1637 
1638 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1639 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
1640 }
1641 
1642 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
1643 {
1644 	struct mlx5_vdpa_virtqueue *mvq;
1645 	struct mlx5_vq_restore_info *ri;
1646 	int i;
1647 
1648 	mlx5_clear_vqs(ndev);
1649 	init_mvqs(ndev);
1650 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
1651 		mvq = &ndev->vqs[i];
1652 		ri = &mvq->ri;
1653 		if (!ri->restore)
1654 			continue;
1655 
1656 		mvq->avail_idx = ri->avail_index;
1657 		mvq->ready = ri->ready;
1658 		mvq->num_ent = ri->num_ent;
1659 		mvq->desc_addr = ri->desc_addr;
1660 		mvq->device_addr = ri->device_addr;
1661 		mvq->driver_addr = ri->driver_addr;
1662 		mvq->event_cb = ri->cb;
1663 	}
1664 }
1665 
1666 static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb)
1667 {
1668 	int err;
1669 
1670 	suspend_vqs(ndev);
1671 	err = save_channels_info(ndev);
1672 	if (err)
1673 		goto err_mr;
1674 
1675 	teardown_driver(ndev);
1676 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1677 	err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb);
1678 	if (err)
1679 		goto err_mr;
1680 
1681 	if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
1682 		return 0;
1683 
1684 	restore_channels_info(ndev);
1685 	err = setup_driver(ndev);
1686 	if (err)
1687 		goto err_setup;
1688 
1689 	return 0;
1690 
1691 err_setup:
1692 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1693 err_mr:
1694 	return err;
1695 }
1696 
1697 static int setup_driver(struct mlx5_vdpa_net *ndev)
1698 {
1699 	int err;
1700 
1701 	mutex_lock(&ndev->reslock);
1702 	if (ndev->setup) {
1703 		mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n");
1704 		err = 0;
1705 		goto out;
1706 	}
1707 	err = setup_virtqueues(ndev);
1708 	if (err) {
1709 		mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n");
1710 		goto out;
1711 	}
1712 
1713 	err = create_rqt(ndev);
1714 	if (err) {
1715 		mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n");
1716 		goto err_rqt;
1717 	}
1718 
1719 	err = create_tir(ndev);
1720 	if (err) {
1721 		mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n");
1722 		goto err_tir;
1723 	}
1724 
1725 	err = add_fwd_to_tir(ndev);
1726 	if (err) {
1727 		mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n");
1728 		goto err_fwd;
1729 	}
1730 	ndev->setup = true;
1731 	mutex_unlock(&ndev->reslock);
1732 
1733 	return 0;
1734 
1735 err_fwd:
1736 	destroy_tir(ndev);
1737 err_tir:
1738 	destroy_rqt(ndev);
1739 err_rqt:
1740 	teardown_virtqueues(ndev);
1741 out:
1742 	mutex_unlock(&ndev->reslock);
1743 	return err;
1744 }
1745 
1746 static void teardown_driver(struct mlx5_vdpa_net *ndev)
1747 {
1748 	mutex_lock(&ndev->reslock);
1749 	if (!ndev->setup)
1750 		goto out;
1751 
1752 	remove_fwd_to_tir(ndev);
1753 	destroy_tir(ndev);
1754 	destroy_rqt(ndev);
1755 	teardown_virtqueues(ndev);
1756 	ndev->setup = false;
1757 out:
1758 	mutex_unlock(&ndev->reslock);
1759 }
1760 
1761 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
1762 {
1763 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1764 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1765 	int err;
1766 
1767 	print_status(mvdev, status, true);
1768 	if (!status) {
1769 		mlx5_vdpa_info(mvdev, "performing device reset\n");
1770 		teardown_driver(ndev);
1771 		mlx5_vdpa_destroy_mr(&ndev->mvdev);
1772 		ndev->mvdev.status = 0;
1773 		ndev->mvdev.mlx_features = 0;
1774 		++mvdev->generation;
1775 		return;
1776 	}
1777 
1778 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
1779 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
1780 			err = setup_driver(ndev);
1781 			if (err) {
1782 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
1783 				goto err_setup;
1784 			}
1785 		} else {
1786 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
1787 			return;
1788 		}
1789 	}
1790 
1791 	ndev->mvdev.status = status;
1792 	return;
1793 
1794 err_setup:
1795 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
1796 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
1797 }
1798 
1799 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
1800 				 unsigned int len)
1801 {
1802 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1803 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1804 
1805 	if (offset + len < sizeof(struct virtio_net_config))
1806 		memcpy(buf, (u8 *)&ndev->config + offset, len);
1807 }
1808 
/* vdpa set_config op: writing the config space is not supported; the request
 * is silently ignored.
 */
static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
				 unsigned int len)
{
}
1814 
1815 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
1816 {
1817 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1818 
1819 	return mvdev->generation;
1820 }
1821 
1822 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
1823 {
1824 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1825 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1826 	bool change_map;
1827 	int err;
1828 
1829 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
1830 	if (err) {
1831 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
1832 		return err;
1833 	}
1834 
1835 	if (change_map)
1836 		return mlx5_vdpa_change_map(ndev, iotlb);
1837 
1838 	return 0;
1839 }
1840 
1841 static void mlx5_vdpa_free(struct vdpa_device *vdev)
1842 {
1843 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1844 	struct mlx5_vdpa_net *ndev;
1845 
1846 	ndev = to_mlx5_vdpa_ndev(mvdev);
1847 
1848 	free_resources(ndev);
1849 	mlx5_vdpa_free_resources(&ndev->mvdev);
1850 	mutex_destroy(&ndev->reslock);
1851 }
1852 
1853 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
1854 {
1855 	struct vdpa_notification_area ret = {};
1856 
1857 	return ret;
1858 }
1859 
1860 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
1861 {
1862 	return -EOPNOTSUPP;
1863 }
1864 
1865 static const struct vdpa_config_ops mlx5_vdpa_ops = {
1866 	.set_vq_address = mlx5_vdpa_set_vq_address,
1867 	.set_vq_num = mlx5_vdpa_set_vq_num,
1868 	.kick_vq = mlx5_vdpa_kick_vq,
1869 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
1870 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
1871 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
1872 	.set_vq_state = mlx5_vdpa_set_vq_state,
1873 	.get_vq_state = mlx5_vdpa_get_vq_state,
1874 	.get_vq_notification = mlx5_get_vq_notification,
1875 	.get_vq_irq = mlx5_get_vq_irq,
1876 	.get_vq_align = mlx5_vdpa_get_vq_align,
1877 	.get_features = mlx5_vdpa_get_features,
1878 	.set_features = mlx5_vdpa_set_features,
1879 	.set_config_cb = mlx5_vdpa_set_config_cb,
1880 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
1881 	.get_device_id = mlx5_vdpa_get_device_id,
1882 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
1883 	.get_status = mlx5_vdpa_get_status,
1884 	.set_status = mlx5_vdpa_set_status,
1885 	.get_config = mlx5_vdpa_get_config,
1886 	.set_config = mlx5_vdpa_set_config,
1887 	.get_generation = mlx5_vdpa_get_generation,
1888 	.set_map = mlx5_vdpa_set_map,
1889 	.free = mlx5_vdpa_free,
1890 };
1891 
1892 static int alloc_resources(struct mlx5_vdpa_net *ndev)
1893 {
1894 	struct mlx5_vdpa_net_resources *res = &ndev->res;
1895 	int err;
1896 
1897 	if (res->valid) {
1898 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
1899 		return -EEXIST;
1900 	}
1901 
1902 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
1903 	if (err)
1904 		return err;
1905 
1906 	err = create_tis(ndev);
1907 	if (err)
1908 		goto err_tis;
1909 
1910 	res->valid = true;
1911 
1912 	return 0;
1913 
1914 err_tis:
1915 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
1916 	return err;
1917 }
1918 
1919 static void free_resources(struct mlx5_vdpa_net *ndev)
1920 {
1921 	struct mlx5_vdpa_net_resources *res = &ndev->res;
1922 
1923 	if (!res->valid)
1924 		return;
1925 
1926 	destroy_tis(ndev);
1927 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
1928 	res->valid = false;
1929 }
1930 
1931 static void init_mvqs(struct mlx5_vdpa_net *ndev)
1932 {
1933 	struct mlx5_vdpa_virtqueue *mvq;
1934 	int i;
1935 
1936 	for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); ++i) {
1937 		mvq = &ndev->vqs[i];
1938 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
1939 		mvq->index = i;
1940 		mvq->ndev = ndev;
1941 		mvq->fwqp.fw = true;
1942 	}
1943 	for (; i < ndev->mvdev.max_vqs; i++) {
1944 		mvq = &ndev->vqs[i];
1945 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
1946 		mvq->index = i;
1947 		mvq->ndev = ndev;
1948 	}
1949 }
1950 
1951 static int mlx5v_probe(struct auxiliary_device *adev,
1952 		       const struct auxiliary_device_id *id)
1953 {
1954 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
1955 	struct mlx5_core_dev *mdev = madev->mdev;
1956 	struct virtio_net_config *config;
1957 	struct mlx5_vdpa_dev *mvdev;
1958 	struct mlx5_vdpa_net *ndev;
1959 	u32 max_vqs;
1960 	int err;
1961 
1962 	/* we save one virtqueue for control virtqueue should we require it */
1963 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
1964 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
1965 
1966 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
1967 				 2 * mlx5_vdpa_max_qps(max_vqs));
1968 	if (IS_ERR(ndev))
1969 		return PTR_ERR(ndev);
1970 
1971 	ndev->mvdev.max_vqs = max_vqs;
1972 	mvdev = &ndev->mvdev;
1973 	mvdev->mdev = mdev;
1974 	init_mvqs(ndev);
1975 	mutex_init(&ndev->reslock);
1976 	config = &ndev->config;
1977 	err = mlx5_query_nic_vport_mtu(mdev, &ndev->mtu);
1978 	if (err)
1979 		goto err_mtu;
1980 
1981 	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
1982 	if (err)
1983 		goto err_mtu;
1984 
1985 	mvdev->vdev.dma_dev = mdev->device;
1986 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
1987 	if (err)
1988 		goto err_mtu;
1989 
1990 	err = alloc_resources(ndev);
1991 	if (err)
1992 		goto err_res;
1993 
1994 	err = vdpa_register_device(&mvdev->vdev);
1995 	if (err)
1996 		goto err_reg;
1997 
1998 	dev_set_drvdata(&adev->dev, ndev);
1999 	return 0;
2000 
2001 err_reg:
2002 	free_resources(ndev);
2003 err_res:
2004 	mlx5_vdpa_free_resources(&ndev->mvdev);
2005 err_mtu:
2006 	mutex_destroy(&ndev->reslock);
2007 	put_device(&mvdev->vdev.dev);
2008 	return err;
2009 }
2010 
2011 static void mlx5v_remove(struct auxiliary_device *adev)
2012 {
2013 	struct mlx5_vdpa_dev *mvdev = dev_get_drvdata(&adev->dev);
2014 
2015 	vdpa_unregister_device(&mvdev->vdev);
2016 }
2017 
2018 static const struct auxiliary_device_id mlx5v_id_table[] = {
2019 	{ .name = MLX5_ADEV_NAME ".vnet", },
2020 	{},
2021 };
2022 
2023 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2024 
2025 static struct auxiliary_driver mlx5v_driver = {
2026 	.name = "vnet",
2027 	.probe = mlx5v_probe,
2028 	.remove = mlx5v_remove,
2029 	.id_table = mlx5v_id_table,
2030 };
2031 
2032 module_auxiliary_driver(mlx5v_driver);
2033