// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/smp.h>
#include "dr_types.h"

#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

struct dr_data_seg {
	u64 addr;
	u32 length;
	u32 lkey;
	unsigned int send_flags;
};

struct postsend_info {
	struct dr_data_seg write;
	struct dr_data_seg read;
	u64 remote_addr;
	u32 rkey;
};

struct dr_qp_rtr_attr {
	struct mlx5dr_cmd_gid_attr dgid_attr;
	enum ib_mtu mtu;
	u32 qp_num;
	u16 port_num;
	u8 min_rnr_timer;
	u8 sgid_index;
	u16 udp_src_port;
};

struct dr_qp_rts_attr {
	u8 timeout;
	u8 retry_cnt;
	u8 rnr_retry;
};

struct dr_qp_init_attr {
	u32 cqn;
	u32 pdn;
	u32 max_send_wr;
	struct mlx5_uars_page *uar;
};

static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
	unsigned int idx;
	u8 opcode;

	opcode = get_cqe_opcode(cqe64);
	if (opcode == MLX5_CQE_REQ_ERR) {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
	} else if (opcode == MLX5_CQE_RESP_ERR) {
		++dr_cq->qp->sq.cc;
	} else {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;

		return CQ_OK;
	}

	return CQ_POLL_ERR;
}

static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
{
	struct mlx5_cqe64 *cqe64;
	int err;

	cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
	if (!cqe64)
		return CQ_EMPTY;

	mlx5_cqwq_pop(&dr_cq->wq);
	err = dr_parse_cqe(dr_cq, cqe64);
	mlx5_cqwq_update_db_record(&dr_cq->wq);

	return err;
}

static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
{
	int npolled;
	int err = 0;

	for (npolled = 0; npolled < ne; ++npolled) {
		err = dr_cq_poll_one(dr_cq);
		if (err != CQ_OK)
			break;
	}

	return err == CQ_POLL_ERR ? err : npolled;
}

static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
					 struct dr_qp_init_attr *attr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
	struct mlx5_wq_param wqp;
	struct mlx5dr_qp *dr_qp;
	int inlen;
	void *qpc;
	void *in;
	int err;

	dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
	if (!dr_qp)
		return NULL;

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	dr_qp->rq.pc = 0;
	dr_qp->rq.cc = 0;
	dr_qp->rq.wqe_cnt = 4;
	dr_qp->sq.pc = 0;
	dr_qp->sq.cc = 0;
	dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);

	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
				&dr_qp->wq_ctrl);
	if (err) {
		mlx5_core_warn(mdev, "Can't create QP WQ\n");
		goto err_wq;
	}

	dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
				     sizeof(dr_qp->sq.wqe_head[0]),
				     GFP_KERNEL);

	if (!dr_qp->sq.wqe_head) {
		mlx5_core_warn(mdev, "Can't allocate wqe head\n");
		goto err_wqe_head;
	}

	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		dr_qp->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, attr->pdn);
	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, fre, 1);
	MLX5_SET(qpc, qpc, rlky, 1);
	MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
	MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
							 in, pas));

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	kvfree(in);
	if (err)
		goto err_in;
	dr_qp->uar = attr->uar;

	return dr_qp;

err_in:
	kfree(dr_qp->sq.wqe_head);
err_wqe_head:
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
	kfree(dr_qp);
	return NULL;
}

static void dr_destroy_qp(struct mlx5_core_dev *mdev,
			  struct mlx5dr_qp *dr_qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	kfree(dr_qp->sq.wqe_head);
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
	kfree(dr_qp);
}

static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
	dma_wmb();
	*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xfffff);

	/* After wmb() the hw is aware of new work */
	wmb();

	mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}

static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
			     u32 rkey, struct dr_data_seg *data_seg,
			     u32 opcode, int nreq)
{
	struct mlx5_wqe_raddr_seg *wq_raddr;
	struct mlx5_wqe_ctrl_seg *wq_ctrl;
	struct mlx5_wqe_data_seg *wq_dseg;
	unsigned int size;
	unsigned int idx;

	size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
		sizeof(*wq_raddr) / 16;

	idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);

	wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
	wq_ctrl->imm = 0;
	wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
		MLX5_WQE_CTRL_CQ_UPDATE : 0;
	wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
						opcode);
	wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
	wq_raddr = (void *)(wq_ctrl + 1);
	wq_raddr->raddr = cpu_to_be64(remote_addr);
	wq_raddr->rkey = cpu_to_be32(rkey);
	wq_raddr->reserved = 0;

	wq_dseg = (void *)(wq_raddr + 1);
	wq_dseg->byte_count = cpu_to_be32(data_seg->length);
	wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
	wq_dseg->addr = cpu_to_be64(data_seg->addr);

	dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;

	if (nreq)
		dr_cmd_notify_hw(dr_qp, wq_ctrl);
}
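
/* Worked example of the WQE layout built above (informative only): each WQE
 * is a ctrl segment followed by a raddr segment and a single data segment,
 * and 'size' is expressed in 16-byte units as encoded into qpn_ds. Assuming
 * the usual 16-byte mlx5 segment sizes, the arithmetic is:
 *
 *	sizeof(struct mlx5_wqe_ctrl_seg)  / 16 = 1
 *	sizeof(struct mlx5_wqe_raddr_seg) / 16 = 1
 *	sizeof(struct mlx5_wqe_data_seg)  / 16 = 1
 *	size = 3,  so  qpn_ds = cpu_to_be32((qpn << 8) | 3)
 */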

static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->write, MLX5_OPCODE_RDMA_WRITE, 0);
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->read, MLX5_OPCODE_RDMA_READ, 1);
}

/**
 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
 * with send_list parameters:
 *
 *     @ste:       The STE that this data is attached to
 *     @size:      Size of the data to write
 *     @offset:    Offset of the data from the start of the hw_ste entry
 *     @data:      Data to write
 *     @ste_info:  STE info to be sent with send_list
 *     @send_list: The list to append ste_info to
 *     @copy_data: If true, the data is copied into ste_info and kept, since
 *                 it is not backed up anywhere else (e.g. during re-hash).
 *                 If false, the data may still be updated after it was
 *                 added to the list.
 */
void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
					       u16 offset, u8 *data,
					       struct mlx5dr_ste_send_info *ste_info,
					       struct list_head *send_list,
					       bool copy_data)
{
	ste_info->size = size;
	ste_info->ste = ste;
	ste_info->offset = offset;

	if (copy_data) {
		memcpy(ste_info->data_cont, data, size);
		ste_info->data = ste_info->data_cont;
	} else {
		ste_info->data = data;
	}

	list_add_tail(&ste_info->send_list, send_list);
}
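
/* Usage sketch (illustrative only, not a call site in this file; the caller
 * below is hypothetical and is assumed to own the ste_info storage and a
 * local send list):
 *
 *	LIST_HEAD(send_list);
 *	struct mlx5dr_ste_send_info *ste_info;
 *
 *	ste_info = kzalloc(sizeof(*ste_info), GFP_KERNEL);
 *	if (!ste_info)
 *		return -ENOMEM;
 *
 *	// The hw_ste buffer may change before the list is flushed,
 *	// so request a private copy of the data.
 *	mlx5dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE_REDUCED, 0,
 *						  ste->hw_ste, ste_info,
 *						  &send_list, true);
 *
 *	// Later the list is walked and each entry is written to ICM,
 *	// e.g. with mlx5dr_send_postsend_ste().
 */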

/* The function tries to consume one completion (wc) at a time, unless the
 * queue is full, which means that the hw is a full queue length behind the
 * sw; in that case the function drains the cq until it is empty.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
				struct mlx5dr_send_ring *send_ring)
{
	bool is_drain = false;
	int ne;

	if (send_ring->pending_wqe < send_ring->signal_th)
		return 0;

	/* Queue is full, start draining it */
	if (send_ring->pending_wqe >=
	    dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
		is_drain = true;

	do {
		ne = dr_poll_cq(send_ring->cq, 1);
		if (ne < 0)
			return ne;
		else if (ne == 1)
			send_ring->pending_wqe -= send_ring->signal_th;
	} while (is_drain && send_ring->pending_wqe);

	return 0;
}
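
/* Worked example of the thresholds used above, with the values configured by
 * mlx5dr_send_ring_alloc() later in this file (a sketch of the arithmetic,
 * not additional logic):
 *
 *	max_send_wr = QUEUE_SIZE = 128
 *	signal_th   = max_send_wr / SIGNAL_PER_DIV_QUEUE = 128 / 16 = 8
 *
 * A completion is requested once per signal_th posted WQEs, polling starts
 * once pending_wqe >= 8, and full drain mode kicks in once
 * pending_wqe >= signal_th * TH_NUMS_TO_DRAIN = 16.
 */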

static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
			      struct postsend_info *send_info)
{
	send_ring->pending_wqe++;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->write.send_flags |= IB_SEND_SIGNALED;

	send_ring->pending_wqe++;
	send_info->read.length = send_info->write.length;
	/* Read into the same write area */
	send_info->read.addr = (uintptr_t)send_info->write.addr;
	send_info->read.lkey = send_ring->mr->mkey.key;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->read.send_flags = IB_SEND_SIGNALED;
	else
		send_info->read.send_flags = 0;
}

static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
				struct postsend_info *send_info)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	u32 buff_offset;
	int ret;

	spin_lock(&send_ring->lock);

	ret = dr_handle_pending_wc(dmn, send_ring);
	if (ret)
		goto out_unlock;

	if (send_info->write.length > dmn->info.max_inline_size) {
		buff_offset = (send_ring->tx_head &
			       (dmn->send_ring->signal_th - 1)) *
			send_ring->max_post_send_size;
		/* Copy to ring mr */
		memcpy(send_ring->buf + buff_offset,
		       (void *)(uintptr_t)send_info->write.addr,
		       send_info->write.length);
		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
		send_info->write.lkey = send_ring->mr->mkey.key;
	}

	send_ring->tx_head++;
	dr_fill_data_segs(send_ring, send_info);
	dr_post_send(send_ring->qp, send_info);

out_unlock:
	spin_unlock(&send_ring->lock);
	return ret;
}

static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
				   struct mlx5dr_ste_htbl *htbl,
				   u8 **data,
				   u32 *byte_size,
				   int *iterations,
				   int *num_stes)
{
	int alloc_size;

	if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
		*iterations = htbl->chunk->byte_size /
			dmn->send_ring->max_post_send_size;
		*byte_size = dmn->send_ring->max_post_send_size;
		alloc_size = *byte_size;
		*num_stes = *byte_size / DR_STE_SIZE;
	} else {
		*iterations = 1;
		*num_stes = htbl->chunk->num_of_entries;
		alloc_size = *num_stes * DR_STE_SIZE;
	}

	*data = kzalloc(alloc_size, GFP_KERNEL);
	if (!*data)
		return -ENOMEM;

	return 0;
}

/**
 * mlx5dr_send_postsend_ste: write size bytes into offset from the hw icm.
 *
 *     @dmn:    Domain
 *     @ste:    The ste struct that contains the data (at
 *              least part of it)
 *     @data:   The data to send
 *     @size:   Number of bytes to write
 *     @offset: The offset from the ICM mapped data to start
 *              writing to; used to write only part of the
 *              buffer.
 *
 * Return: 0 on success.
 */
int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
			     u8 *data, u16 size, u16 offset)
{
	struct postsend_info send_info = {};

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);

	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = size;
	send_info.write.lkey = 0;
	send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
	send_info.rkey = ste->htbl->chunk->rkey;

	return dr_postsend_icm_data(dmn, &send_info);
}
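
/* Usage sketch (illustrative only; the buffer and call below are hypothetical
 * and locking/ownership follow whatever the rule update path requires):
 *
 *	u8 hw_ste[DR_STE_SIZE] = {};
 *	int ret;
 *
 *	// ... build or modify the STE content in hw_ste ...
 *	ret = mlx5dr_send_postsend_ste(dmn, ste, hw_ste, DR_STE_SIZE, 0);
 *	if (ret)
 *		mlx5dr_err(dmn, "Failed writing STE to HW\n");
 *
 * Note that mlx5dr_ste_prepare_for_postsend() adjusts the caller's buffer in
 * place before it is posted.
 */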

int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
			      struct mlx5dr_ste_htbl *htbl,
			      u8 *formatted_ste, u8 *mask)
{
	u32 byte_size = htbl->chunk->byte_size;
	int num_stes_per_iter;
	int iterations;
	u8 *data;
	int ret;
	int i;
	int j;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes_per_iter);
	if (ret)
		return ret;

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		/* Copy all the STEs into the data buffer;
		 * the bit_mask needs to be added as well.
		 */
		for (j = 0; j < num_stes_per_iter; j++) {
			struct mlx5dr_ste *ste = &htbl->ste_arr[ste_index + j];
			u32 ste_off = j * DR_STE_SIZE;

			if (mlx5dr_ste_is_not_used(ste)) {
				memcpy(data + ste_off,
				       formatted_ste, DR_STE_SIZE);
			} else {
				/* Copy data */
				memcpy(data + ste_off,
				       htbl->ste_arr[ste_index + j].hw_ste,
				       DR_STE_SIZE_REDUCED);
				/* Copy bit_mask */
				memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
				       mask, DR_STE_SIZE_MASK);
				/* Only when we have a mask do we need to re-arrange the STE */
				mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
								data + (j * DR_STE_SIZE),
								DR_STE_SIZE);
			}
		}

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kfree(data);
	return ret;
}

/* Initialize htbl with default STEs */
int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
					struct mlx5dr_ste_htbl *htbl,
					u8 *ste_init_data,
					bool update_hw_ste)
{
	u32 byte_size = htbl->chunk->byte_size;
	int iterations;
	int num_stes;
	u8 *copy_dst;
	u8 *data;
	int ret;
	int i;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes);
	if (ret)
		return ret;

	if (update_hw_ste) {
		/* Copy the reduced STE to hash table ste_arr */
		for (i = 0; i < num_stes; i++) {
			copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
			memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
		}
	}

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);

	/* Copy the same STE into the data buffer */
	for (i = 0; i < num_stes; i++) {
		copy_dst = data + i * DR_STE_SIZE;
		memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
	}

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kfree(data);
	return ret;
}

int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
				struct mlx5dr_action *action)
{
	struct postsend_info send_info = {};
	int ret;

	send_info.write.addr = (uintptr_t)action->rewrite.data;
	send_info.write.length = action->rewrite.num_of_actions *
				 DR_MODIFY_ACTION_SIZE;
	send_info.write.lkey = 0;
	send_info.remote_addr = action->rewrite.chunk->mr_addr;
	send_info.rkey = action->rewrite.chunk->rkey;

	ret = dr_postsend_icm_data(dmn, &send_info);

	return ret;
}

static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
				 struct mlx5dr_qp *dr_qp,
				 int port)
{
	u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);

	MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
}

static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
				    struct mlx5dr_qp *dr_qp,
				    struct dr_qp_rts_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;

	qpc  = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);

	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
	MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);

	MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
}

static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
				     struct mlx5dr_qp *dr_qp,
				     struct dr_qp_rtr_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);

	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, mtu, attr->mtu);
	MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
	MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
	       attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
	       attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
	MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
		 attr->sgid_index);

	if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
		MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
			 attr->udp_src_port);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);

	MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
}

static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
	struct dr_qp_rts_attr rts_attr = {};
	struct dr_qp_rtr_attr rtr_attr = {};
	enum ib_mtu mtu = IB_MTU_1024;
	u16 gid_index = 0;
	int port = 1;
	int ret;

	/* Init */
	ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rst2init\n");
		return ret;
	}

	/* RTR */
	ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index, &rtr_attr.dgid_attr);
	if (ret)
		return ret;

	rtr_attr.mtu		= mtu;
	rtr_attr.qp_num		= dr_qp->qpn;
	rtr_attr.min_rnr_timer	= 12;
	rtr_attr.port_num	= port;
	rtr_attr.sgid_index	= gid_index;
	rtr_attr.udp_src_port	= dmn->info.caps.roce_min_src_udp;

	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
		return ret;
	}

	/* RTS */
	rts_attr.timeout	= 14;
	rts_attr.retry_cnt	= 7;
	rts_attr.rnr_retry	= 7;

	ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
		return ret;
	}

	return 0;
}

static void dr_cq_complete(struct mlx5_core_cq *mcq,
			   struct mlx5_eqe *eqe)
{
	pr_err("CQ completion CQ: #%u\n", mcq->cqn);
}

static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
				      struct mlx5_uars_page *uar,
				      size_t ncqe)
{
	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_wq_param wqp;
	struct mlx5_cqe64 *cqe;
	struct mlx5dr_cq *cq;
	int inlen, err, eqn;
	unsigned int irqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;
	u32 i;

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		return NULL;

	ncqe = roundup_pow_of_two(ncqe);
	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
			       &cq->wq_ctrl);
	if (err)
		goto out;

	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
		cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
	}

	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		sizeof(u64) * cq->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		goto err_cqwq;

	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn);
	if (err) {
		kvfree(in);
		goto err_cqwq;
	}

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn, eqn);
	MLX5_SET(cqc, cqc, uar_page, uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);

	cq->mcq.comp  = dr_cq_complete;

	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	kvfree(in);

	if (err)
		goto err_cqwq;

	cq->mcq.cqe_sz = 64;
	cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
	cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
	*cq->mcq.set_ci_db = 0;

	/* Set a non-zero value in order to prevent the HW from running
	 * db-recovery on a CQ that is used in polling mode.
	 */
	*cq->mcq.arm_db = cpu_to_be32(2 << 28);

	cq->mcq.vector = 0;
	cq->mcq.irqn = irqn;
	cq->mcq.uar = uar;

	return cq;

err_cqwq:
	mlx5_wq_destroy(&cq->wq_ctrl);
out:
	kfree(cq);
	return NULL;
}

static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_wq_destroy(&cq->wq_ctrl);
	kfree(cq);
}

static int
dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
{
	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
}

static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
				   u32 pdn, void *buf, size_t size)
{
	struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	struct device *dma_device;
	dma_addr_t dma_addr;
	int err;

	if (!mr)
		return NULL;

	dma_device = mlx5_core_dma_dev(mdev);
	dma_addr = dma_map_single(dma_device, buf, size,
				  DMA_BIDIRECTIONAL);
	err = dma_mapping_error(dma_device, dma_addr);
	if (err) {
		mlx5_core_warn(mdev, "Can't dma buf\n");
		kfree(mr);
		return NULL;
	}

	err = dr_create_mkey(mdev, pdn, &mr->mkey);
	if (err) {
		mlx5_core_warn(mdev, "Can't create mkey\n");
		dma_unmap_single(dma_device, dma_addr, size,
				 DMA_BIDIRECTIONAL);
		kfree(mr);
		return NULL;
	}

	mr->dma_addr = dma_addr;
	mr->size = size;
	mr->addr = buf;

	return mr;
}

static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
{
	mlx5_core_destroy_mkey(mdev, &mr->mkey);
	dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
			 DMA_BIDIRECTIONAL);
	kfree(mr);
}

int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
{
	struct dr_qp_init_attr init_attr = {};
	int cq_size;
	int size;
	int ret;

	dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
	if (!dmn->send_ring)
		return -ENOMEM;

	cq_size = QUEUE_SIZE + 1;
	dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
	if (!dmn->send_ring->cq) {
		mlx5dr_err(dmn, "Failed creating CQ\n");
		ret = -ENOMEM;
		goto free_send_ring;
	}

	init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
	init_attr.pdn = dmn->pdn;
	init_attr.uar = dmn->uar;
	init_attr.max_send_wr = QUEUE_SIZE;
	spin_lock_init(&dmn->send_ring->lock);

	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
	if (!dmn->send_ring->qp)  {
		mlx5dr_err(dmn, "Failed creating QP\n");
		ret = -ENOMEM;
		goto clean_cq;
	}

	dmn->send_ring->cq->qp = dmn->send_ring->qp;

	dmn->info.max_send_wr = QUEUE_SIZE;
	dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
					DR_STE_SIZE);

	dmn->send_ring->signal_th = dmn->info.max_send_wr /
		SIGNAL_PER_DIV_QUEUE;

	/* Prepare qp to be used */
	ret = dr_prepare_qp_to_rts(dmn);
	if (ret)
		goto clean_qp;

	dmn->send_ring->max_post_send_size =
		mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
						   DR_ICM_TYPE_STE);

	/* Allocating the max size as a buffer for writing */
	size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
	dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
	if (!dmn->send_ring->buf) {
		ret = -ENOMEM;
		goto clean_qp;
	}

	dmn->send_ring->buf_size = size;

	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
				       dmn->pdn, dmn->send_ring->buf, size);
	if (!dmn->send_ring->mr) {
		ret = -ENOMEM;
		goto free_mem;
	}

	dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
					    dmn->pdn, dmn->send_ring->sync_buff,
					    MIN_READ_SYNC);
	if (!dmn->send_ring->sync_mr) {
		ret = -ENOMEM;
		goto clean_mr;
	}

	return 0;

clean_mr:
	dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
free_mem:
	kfree(dmn->send_ring->buf);
clean_qp:
	dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
clean_cq:
	dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
free_send_ring:
	kfree(dmn->send_ring);

	return ret;
}
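
/* Sizing sketch for the ring allocated above (the formulas are what the code
 * computes; the concrete byte values assume DR_STE_SIZE == 64, which is an
 * assumption for illustration):
 *
 *	cq_size            = QUEUE_SIZE + 1 = 129 CQEs (rounded up to 256)
 *	signal_th          = QUEUE_SIZE / SIGNAL_PER_DIV_QUEUE = 8
 *	max_post_send_size = 1K STEs * 64B = 64KB
 *	buf_size           = signal_th * max_post_send_size = 512KB
 *
 * i.e. one max-sized staging area is kept per in-flight signalled batch,
 * which is why dr_postsend_icm_data() indexes the copy buffer with
 * (tx_head & (signal_th - 1)).
 */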

void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
			   struct mlx5dr_send_ring *send_ring)
{
	dr_destroy_qp(dmn->mdev, send_ring->qp);
	dr_destroy_cq(dmn->mdev, send_ring->cq);
	dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
	dr_dereg_mr(dmn->mdev, send_ring->mr);
	kfree(send_ring->buf);
	kfree(send_ring);
}

int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	struct postsend_info send_info = {};
	u8 data[DR_STE_SIZE];
	int num_of_sends_req;
	int ret;
	int i;

	/* Sending this number of requests guarantees that the queue gets drained */
	num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;

	/* Send fake requests forcing the last to be signaled */
	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = DR_STE_SIZE;
	send_info.write.lkey = 0;
	/* Using the sync_mr in order to write/read */
	send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
	send_info.rkey = send_ring->sync_mr->mkey.key;

	for (i = 0; i < num_of_sends_req; i++) {
		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			return ret;
	}

	spin_lock(&send_ring->lock);
	ret = dr_handle_pending_wc(dmn, send_ring);
	spin_unlock(&send_ring->lock);

	return ret;
}