// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/smp.h>
#include "dr_types.h"

#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

struct dr_data_seg {
	u64 addr;
	u32 length;
	u32 lkey;
	unsigned int send_flags;
};

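/* Describes one post-send operation: the local WRITE and READ data segments,
 * plus the remote ICM address and rkey they target.
 */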
struct postsend_info {
	struct dr_data_seg write;
	struct dr_data_seg read;
	u64 remote_addr;
	u32 rkey;
};

struct dr_qp_rtr_attr {
	struct mlx5dr_cmd_gid_attr dgid_attr;
	enum ib_mtu mtu;
	u32 qp_num;
	u16 port_num;
	u8 min_rnr_timer;
	u8 sgid_index;
	u16 udp_src_port;
	u8 fl:1;
};

struct dr_qp_rts_attr {
	u8 timeout;
	u8 retry_cnt;
	u8 rnr_retry;
};

struct dr_qp_init_attr {
	u32 cqn;
	u32 pdn;
	u32 max_send_wr;
	struct mlx5_uars_page *uar;
	u8 isolate_vl_tc:1;
};

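/* Update the SQ consumer counter (cc) according to the polled CQE: requester
 * completions and requester errors advance cc past the reported WQE, responder
 * errors bump cc by one. Only a successful completion returns CQ_OK.
 */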
static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
	unsigned int idx;
	u8 opcode;

	opcode = get_cqe_opcode(cqe64);
	if (opcode == MLX5_CQE_REQ_ERR) {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
	} else if (opcode == MLX5_CQE_RESP_ERR) {
		++dr_cq->qp->sq.cc;
	} else {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;

		return CQ_OK;
	}

	return CQ_POLL_ERR;
}

static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
{
	struct mlx5_cqe64 *cqe64;
	int err;

	cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
	if (!cqe64)
		return CQ_EMPTY;

	mlx5_cqwq_pop(&dr_cq->wq);
	err = dr_parse_cqe(dr_cq, cqe64);
	mlx5_cqwq_update_db_record(&dr_cq->wq);

	return err;
}

static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
{
	int npolled;
	int err = 0;

	for (npolled = 0; npolled < ne; ++npolled) {
		err = dr_cq_poll_one(dr_cq);
		if (err != CQ_OK)
			break;
	}

	return err == CQ_POLL_ERR ? err : npolled;
}

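/* Create the RC QP used for writing to ICM: allocate the SQ/RQ work queue
 * buffers, issue the CREATE_QP command and return the SW QP descriptor.
 */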
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
					 struct dr_qp_init_attr *attr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
	struct mlx5_wq_param wqp;
	struct mlx5dr_qp *dr_qp;
	int inlen;
	void *qpc;
	void *in;
	int err;

	dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
	if (!dr_qp)
		return NULL;

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	dr_qp->rq.pc = 0;
	dr_qp->rq.cc = 0;
	dr_qp->rq.wqe_cnt = 4;
	dr_qp->sq.pc = 0;
	dr_qp->sq.cc = 0;
	dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);

	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
				&dr_qp->wq_ctrl);
	if (err) {
		mlx5_core_warn(mdev, "Can't create QP WQ\n");
		goto err_wq;
	}

	dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
				     sizeof(dr_qp->sq.wqe_head[0]),
				     GFP_KERNEL);

	if (!dr_qp->sq.wqe_head) {
		mlx5_core_warn(mdev, "Can't allocate wqe head\n");
		goto err_wqe_head;
	}

	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		dr_qp->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
	MLX5_SET(qpc, qpc, pd, attr->pdn);
	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, fre, 1);
	MLX5_SET(qpc, qpc, rlky, 1);
	MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
	MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
							 in, pas));

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	kvfree(in);
	if (err)
		goto err_in;
	dr_qp->uar = attr->uar;

	return dr_qp;

err_in:
	kfree(dr_qp->sq.wqe_head);
err_wqe_head:
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
	kfree(dr_qp);
	return NULL;
}

static void dr_destroy_qp(struct mlx5_core_dev *mdev,
			  struct mlx5dr_qp *dr_qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	kfree(dr_qp->sq.wqe_head);
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
	kfree(dr_qp);
}

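/* Ring the SQ doorbell: publish the new producer counter in the doorbell
 * record, then ring the HW doorbell by writing the beginning of the WQE
 * control segment to the UAR page.
 */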
static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
	dma_wmb();
	*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff);

	/* Make sure the doorbell record is written before ringing the doorbell */
	wmb();

	mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}

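/* Build a single RDMA WQE (control + remote address + data segment) at the
 * current SQ producer index, and optionally ring the doorbell.
 */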
static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
			     u32 rkey, struct dr_data_seg *data_seg,
			     u32 opcode, bool notify_hw)
{
	struct mlx5_wqe_raddr_seg *wq_raddr;
	struct mlx5_wqe_ctrl_seg *wq_ctrl;
	struct mlx5_wqe_data_seg *wq_dseg;
	unsigned int size;
	unsigned int idx;

	size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
		sizeof(*wq_raddr) / 16;

	idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);

	wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
	wq_ctrl->imm = 0;
	wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
		MLX5_WQE_CTRL_CQ_UPDATE : 0;
	wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
						opcode);
	wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
	wq_raddr = (void *)(wq_ctrl + 1);
	wq_raddr->raddr = cpu_to_be64(remote_addr);
	wq_raddr->rkey = cpu_to_be32(rkey);
	wq_raddr->reserved = 0;

	wq_dseg = (void *)(wq_raddr + 1);
	wq_dseg->byte_count = cpu_to_be32(data_seg->length);
	wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
	wq_dseg->addr = cpu_to_be64(data_seg->addr);

	dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;

	if (notify_hw)
		dr_cmd_notify_hw(dr_qp, wq_ctrl);
}

static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->write, MLX5_OPCODE_RDMA_WRITE, false);
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->read, MLX5_OPCODE_RDMA_READ, true);
}

/**
 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
 * with send_list parameters:
 *
 *     @ste:       The STE to which this data is attached
 *     @size:      Size of the data to write
 *     @offset:    Offset of the data from the start of the hw_ste entry
 *     @data:      Data
 *     @ste_info:  STE info to be sent with send_list
 *     @send_list: List to append to
 *     @copy_data: If true, the data is copied and kept, since it is not
 *                 backed up anywhere else (e.g. during re-hash).
 *                 If false, the data may still be updated after it was
 *                 added to the list.
 */
void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
					       u16 offset, u8 *data,
					       struct mlx5dr_ste_send_info *ste_info,
					       struct list_head *send_list,
					       bool copy_data)
{
	ste_info->size = size;
	ste_info->ste = ste;
	ste_info->offset = offset;

	if (copy_data) {
		memcpy(ste_info->data_cont, data, size);
		ste_info->data = ste_info->data_cont;
	} else {
		ste_info->data = data;
	}

	list_add_tail(&ste_info->send_list, send_list);
}

/* The function normally consumes one wc at a time. However, if the queue is
 * full, meaning the HW is a full queue length behind the SW, it drains the
 * CQ until it is empty.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
				struct mlx5dr_send_ring *send_ring)
{
	bool is_drain = false;
	int ne;

	if (send_ring->pending_wqe < send_ring->signal_th)
		return 0;

	/* Queue is full, start draining it */
	if (send_ring->pending_wqe >=
	    dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
		is_drain = true;

	do {
		ne = dr_poll_cq(send_ring->cq, 1);
		if (ne < 0)
			return ne;
		else if (ne == 1)
			send_ring->pending_wqe -= send_ring->signal_th;
	} while (is_drain && send_ring->pending_wqe);

	return 0;
}

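/* Account for the WRITE and READ WQEs of this post; a signaled completion is
 * requested once every signal_th WQEs. The READ re-reads the area that was
 * just written.
 */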
static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
			      struct postsend_info *send_info)
{
	send_ring->pending_wqe++;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->write.send_flags |= IB_SEND_SIGNALED;

	send_ring->pending_wqe++;
	send_info->read.length = send_info->write.length;
	/* Read into the same write area */
	send_info->read.addr = (uintptr_t)send_info->write.addr;
	send_info->read.lkey = send_ring->mr->mkey.key;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->read.send_flags = IB_SEND_SIGNALED;
	else
		send_info->read.send_flags = 0;
}

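/* Post one WRITE+READ pair to the send ring. Payloads larger than the max
 * inline size are first copied into the registered ring buffer and sent
 * from there.
 */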
static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
				struct postsend_info *send_info)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	u32 buff_offset;
	int ret;

	spin_lock(&send_ring->lock);

	ret = dr_handle_pending_wc(dmn, send_ring);
	if (ret)
		goto out_unlock;

	if (send_info->write.length > dmn->info.max_inline_size) {
		buff_offset = (send_ring->tx_head &
			       (dmn->send_ring->signal_th - 1)) *
			send_ring->max_post_send_size;
		/* Copy to ring mr */
		memcpy(send_ring->buf + buff_offset,
		       (void *)(uintptr_t)send_info->write.addr,
		       send_info->write.length);
		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
		send_info->write.lkey = send_ring->mr->mkey.key;
	}

	send_ring->tx_head++;
	dr_fill_data_segs(send_ring, send_info);
	dr_post_send(send_ring->qp, send_info);

out_unlock:
	spin_unlock(&send_ring->lock);
	return ret;
}

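/* Compute how many post-send iterations are needed to copy an ICM chunk,
 * the byte size and STE count per iteration, and allocate a bounce buffer
 * of that size.
 */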
static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
				   struct mlx5dr_ste_htbl *htbl,
				   u8 **data,
				   u32 *byte_size,
				   int *iterations,
				   int *num_stes)
{
	int alloc_size;

	if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
		*iterations = htbl->chunk->byte_size /
			dmn->send_ring->max_post_send_size;
		*byte_size = dmn->send_ring->max_post_send_size;
		alloc_size = *byte_size;
		*num_stes = *byte_size / DR_STE_SIZE;
	} else {
		*iterations = 1;
		*num_stes = htbl->chunk->num_of_entries;
		alloc_size = *num_stes * DR_STE_SIZE;
	}

	*data = kvzalloc(alloc_size, GFP_KERNEL);
	if (!*data)
		return -ENOMEM;

	return 0;
}

/**
 * mlx5dr_send_postsend_ste: write 'size' bytes at 'offset' into the STE's
 * mapping in the HW ICM.
 *
 *     @dmn:    Domain
 *     @ste:    The STE struct that contains the data (at least part of it)
 *     @data:   The actual data to send
 *     @size:   Number of bytes to write
 *     @offset: Offset from the start of the ICM-mapped data; allows writing
 *              only part of the buffer
 *
 * Return: 0 on success.
 */
int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
			     u8 *data, u16 size, u16 offset)
{
	struct postsend_info send_info = {};

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);

	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = size;
	send_info.write.lkey = 0;
	send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
	send_info.rkey = ste->htbl->chunk->rkey;

	return dr_postsend_icm_data(dmn, &send_info);
}

int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
			      struct mlx5dr_ste_htbl *htbl,
			      u8 *formatted_ste, u8 *mask)
{
	u32 byte_size = htbl->chunk->byte_size;
	int num_stes_per_iter;
	int iterations;
	u8 *data;
	int ret;
	int i;
	int j;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes_per_iter);
	if (ret)
		return ret;

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		/* Copy all the STEs into the data buffer;
		 * used entries also need the bit_mask added.
		 */
		for (j = 0; j < num_stes_per_iter; j++) {
			struct mlx5dr_ste *ste = &htbl->ste_arr[ste_index + j];
			u32 ste_off = j * DR_STE_SIZE;

			if (mlx5dr_ste_is_not_used(ste)) {
				memcpy(data + ste_off,
				       formatted_ste, DR_STE_SIZE);
			} else {
				/* Copy data */
				memcpy(data + ste_off,
				       htbl->ste_arr[ste_index + j].hw_ste,
				       DR_STE_SIZE_REDUCED);
				/* Copy bit_mask */
				memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
				       mask, DR_STE_SIZE_MASK);
				/* Only when there is a mask do we need to re-arrange the STE */
				mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
								data + (j * DR_STE_SIZE),
								DR_STE_SIZE);
			}
		}

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}

/* Initialize the htbl with default (formatted) STEs */
int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
					struct mlx5dr_ste_htbl *htbl,
					u8 *ste_init_data,
					bool update_hw_ste)
{
	u32 byte_size = htbl->chunk->byte_size;
	int iterations;
	int num_stes;
	u8 *copy_dst;
	u8 *data;
	int ret;
	int i;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes);
	if (ret)
		return ret;

	if (update_hw_ste) {
		/* Copy the reduced STE to hash table ste_arr */
		for (i = 0; i < num_stes; i++) {
			copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
			memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
		}
	}

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);

	/* Copy the same STE onto the data buffer */
	for (i = 0; i < num_stes; i++) {
		copy_dst = data + i * DR_STE_SIZE;
		memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
	}

	/* Send the data 'iterations' times.
	 * Note: ste_index must be u32 - with u8 it would overflow whenever
	 * more than one iteration is needed.
	 */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}

571 
572 int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
573 				struct mlx5dr_action *action)
574 {
575 	struct postsend_info send_info = {};
576 	int ret;
577 
578 	send_info.write.addr = (uintptr_t)action->rewrite->data;
579 	send_info.write.length = action->rewrite->num_of_actions *
580 				 DR_MODIFY_ACTION_SIZE;
581 	send_info.write.lkey = 0;
582 	send_info.remote_addr = action->rewrite->chunk->mr_addr;
583 	send_info.rkey = action->rewrite->chunk->rkey;
584 
585 	ret = dr_postsend_icm_data(dmn, &send_info);
586 
587 	return ret;
588 }
589 
590 static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
591 				 struct mlx5dr_qp *dr_qp,
592 				 int port)
593 {
594 	u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
595 	void *qpc;
596 
597 	qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);
598 
599 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
600 	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
601 	MLX5_SET(qpc, qpc, rre, 1);
602 	MLX5_SET(qpc, qpc, rwe, 1);
603 
604 	MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
605 	MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);
606 
607 	return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
608 }
609 
610 static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
611 				    struct mlx5dr_qp *dr_qp,
612 				    struct dr_qp_rts_attr *attr)
613 {
614 	u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
615 	void *qpc;
616 
617 	qpc  = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);
618 
619 	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
620 
621 	MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
622 	MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);
623 	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
624 
625 	MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
626 	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
627 
628 	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
629 }
630 
631 static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
632 				     struct mlx5dr_qp *dr_qp,
633 				     struct dr_qp_rtr_attr *attr)
634 {
635 	u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
636 	void *qpc;
637 
638 	qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);
639 
640 	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
641 
642 	MLX5_SET(qpc, qpc, mtu, attr->mtu);
643 	MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
644 	MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
645 	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
646 	       attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
647 	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
648 	       attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
649 	MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
650 		 attr->sgid_index);
651 
652 	if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
653 		MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
654 			 attr->udp_src_port);
655 
656 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
657 	MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
658 	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
659 
660 	MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
661 	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
662 
663 	return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
664 }
665 
666 static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
667 {
668 	/* Check whether RC RoCE QP creation with force loopback is allowed.
669 	 * There are two separate capability bits for this:
670 	 *  - force loopback when RoCE is enabled
671 	 *  - force loopback when RoCE is disabled
672 	 */
673 	return ((caps->roce_caps.roce_en &&
674 		 caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
675 		(!caps->roce_caps.roce_en &&
676 		 caps->roce_caps.fl_rc_qp_when_roce_disabled));
677 }
678 
679 static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
680 {
681 	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
682 	struct dr_qp_rts_attr rts_attr = {};
683 	struct dr_qp_rtr_attr rtr_attr = {};
684 	enum ib_mtu mtu = IB_MTU_1024;
685 	u16 gid_index = 0;
686 	int port = 1;
687 	int ret;
688 
689 	/* Init */
690 	ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
691 	if (ret) {
692 		mlx5dr_err(dmn, "Failed modify QP rst2init\n");
693 		return ret;
694 	}
695 
696 	/* RTR */
697 	rtr_attr.mtu		= mtu;
698 	rtr_attr.qp_num		= dr_qp->qpn;
699 	rtr_attr.min_rnr_timer	= 12;
700 	rtr_attr.port_num	= port;
701 	rtr_attr.udp_src_port	= dmn->info.caps.roce_min_src_udp;
702 
703 	/* If QP creation with force loopback is allowed, then there
704 	 * is no need for GID index when creating the QP.
705 	 * Otherwise we query GID attributes and use GID index.
706 	 */
707 	rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
708 	if (!rtr_attr.fl) {
709 		ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
710 					   &rtr_attr.dgid_attr);
711 		if (ret)
712 			return ret;
713 
714 		rtr_attr.sgid_index = gid_index;
715 	}
716 
717 	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
718 	if (ret) {
719 		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
720 		return ret;
721 	}
722 
723 	/* RTS */
724 	rts_attr.timeout	= 14;
725 	rts_attr.retry_cnt	= 7;
726 	rts_attr.rnr_retry	= 7;
727 
728 	ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
729 	if (ret) {
730 		mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
731 		return ret;
732 	}
733 
734 	return 0;
735 }
736 
737 static void dr_cq_complete(struct mlx5_core_cq *mcq,
738 			   struct mlx5_eqe *eqe)
739 {
740 	pr_err("CQ completion CQ: #%u\n", mcq->cqn);
741 }
742 
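/* Create the CQ used to track the steering QP's send completions; completions
 * are consumed by polling (see dr_poll_cq) rather than by arming the CQ.
 */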
static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
				      struct mlx5_uars_page *uar,
				      size_t ncqe)
{
	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_wq_param wqp;
	struct mlx5_cqe64 *cqe;
	struct mlx5dr_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;
	u32 i;

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		return NULL;

	ncqe = roundup_pow_of_two(ncqe);
	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
			       &cq->wq_ctrl);
	if (err)
		goto out;

	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
		cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
	}

	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		sizeof(u64) * cq->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		goto err_cqwq;

	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn);
	if (err) {
		kvfree(in);
		goto err_cqwq;
	}

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);

	cq->mcq.comp  = dr_cq_complete;

	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	kvfree(in);

	if (err)
		goto err_cqwq;

	cq->mcq.cqe_sz = 64;
	cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
	cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
	*cq->mcq.set_ci_db = 0;

	/* Set a non-zero value to prevent the HW from running doorbell
	 * recovery on a CQ that is used in polling mode.
	 */
	*cq->mcq.arm_db = cpu_to_be32(2 << 28);

	cq->mcq.vector = 0;
	cq->mcq.uar = uar;

	return cq;

err_cqwq:
	mlx5_wq_destroy(&cq->wq_ctrl);
out:
	kfree(cq);
	return NULL;
}

static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_wq_destroy(&cq->wq_ctrl);
	kfree(cq);
}

static int
dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
{
	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
}

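/* DMA-map a kernel buffer and create an mkey on the given PD, so the buffer
 * can be used as the local side of the RDMA WRITE/READ operations.
 */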
static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
				   u32 pdn, void *buf, size_t size)
{
	struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	struct device *dma_device;
	dma_addr_t dma_addr;
	int err;

	if (!mr)
		return NULL;

	dma_device = mlx5_core_dma_dev(mdev);
	dma_addr = dma_map_single(dma_device, buf, size,
				  DMA_BIDIRECTIONAL);
	err = dma_mapping_error(dma_device, dma_addr);
	if (err) {
		mlx5_core_warn(mdev, "Can't dma buf\n");
		kfree(mr);
		return NULL;
	}

	err = dr_create_mkey(mdev, pdn, &mr->mkey);
	if (err) {
		mlx5_core_warn(mdev, "Can't create mkey\n");
		dma_unmap_single(dma_device, dma_addr, size,
				 DMA_BIDIRECTIONAL);
		kfree(mr);
		return NULL;
	}

	mr->dma_addr = dma_addr;
	mr->size = size;
	mr->addr = buf;

	return mr;
}

static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
{
	mlx5_core_destroy_mkey(mdev, &mr->mkey);
	dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
			 DMA_BIDIRECTIONAL);
	kfree(mr);
}

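/* Allocate the per-domain send ring: CQ, RC QP (brought to RTS), the copy
 * buffer with its MR, and the small sync MR used by the force-drain flow.
 */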
int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
{
	struct dr_qp_init_attr init_attr = {};
	int cq_size;
	int size;
	int ret;

	dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
	if (!dmn->send_ring)
		return -ENOMEM;

	cq_size = QUEUE_SIZE + 1;
	dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
	if (!dmn->send_ring->cq) {
		mlx5dr_err(dmn, "Failed creating CQ\n");
		ret = -ENOMEM;
		goto free_send_ring;
	}

	init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
	init_attr.pdn = dmn->pdn;
	init_attr.uar = dmn->uar;
	init_attr.max_send_wr = QUEUE_SIZE;

	/* Isolated VL is applicable only if force loopback is supported */
	if (dr_send_allow_fl(&dmn->info.caps))
		init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;

	spin_lock_init(&dmn->send_ring->lock);

	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
	if (!dmn->send_ring->qp)  {
		mlx5dr_err(dmn, "Failed creating QP\n");
		ret = -ENOMEM;
		goto clean_cq;
	}

	dmn->send_ring->cq->qp = dmn->send_ring->qp;

	dmn->info.max_send_wr = QUEUE_SIZE;
	dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
					DR_STE_SIZE);

	dmn->send_ring->signal_th = dmn->info.max_send_wr /
		SIGNAL_PER_DIV_QUEUE;

	/* Prepare qp to be used */
	ret = dr_prepare_qp_to_rts(dmn);
	if (ret)
		goto clean_qp;

	dmn->send_ring->max_post_send_size =
		mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
						   DR_ICM_TYPE_STE);

	/* Allocating the max size as a buffer for writing */
	size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
	dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
	if (!dmn->send_ring->buf) {
		ret = -ENOMEM;
		goto clean_qp;
	}

	dmn->send_ring->buf_size = size;

	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
				       dmn->pdn, dmn->send_ring->buf, size);
	if (!dmn->send_ring->mr) {
		ret = -ENOMEM;
		goto free_mem;
	}

	dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
					    dmn->pdn, dmn->send_ring->sync_buff,
					    MIN_READ_SYNC);
	if (!dmn->send_ring->sync_mr) {
		ret = -ENOMEM;
		goto clean_mr;
	}

	return 0;

clean_mr:
	dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
free_mem:
	kfree(dmn->send_ring->buf);
clean_qp:
	dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
clean_cq:
	dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
free_send_ring:
	kfree(dmn->send_ring);

	return ret;
}

void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
			   struct mlx5dr_send_ring *send_ring)
{
	dr_destroy_qp(dmn->mdev, send_ring->qp);
	dr_destroy_cq(dmn->mdev, send_ring->cq);
	dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
	dr_dereg_mr(dmn->mdev, send_ring->mr);
	kfree(send_ring->buf);
	kfree(send_ring);
}

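/* Flush the send ring: post enough dummy WRITE+READ pairs to cross the drain
 * threshold, then poll the CQ until all pending completions are reaped.
 */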
int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	struct postsend_info send_info = {};
	u8 data[DR_STE_SIZE];
	int num_of_sends_req;
	int ret;
	int i;

	/* Sending this many requests guarantees the drain threshold is reached */
	num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;

	/* Send fake requests forcing the last to be signaled */
	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = DR_STE_SIZE;
	send_info.write.lkey = 0;
	/* Using the sync_mr in order to write/read */
	send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
	send_info.rkey = send_ring->sync_mr->mkey.key;

	for (i = 0; i < num_of_sends_req; i++) {
		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			return ret;
	}

	spin_lock(&send_ring->lock);
	ret = dr_handle_pending_wc(dmn, send_ring);
	spin_unlock(&send_ring->lock);

	return ret;
}