// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/smp.h>
#include "dr_types.h"

#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

struct dr_data_seg {
	u64 addr;
	u32 length;
	u32 lkey;
	unsigned int send_flags;
};

struct postsend_info {
	struct dr_data_seg write;
	struct dr_data_seg read;
	u64 remote_addr;
	u32 rkey;
};

struct dr_qp_rtr_attr {
	struct mlx5dr_cmd_gid_attr dgid_attr;
	enum ib_mtu mtu;
	u32 qp_num;
	u16 port_num;
	u8 min_rnr_timer;
	u8 sgid_index;
	u16 udp_src_port;
};

struct dr_qp_rts_attr {
	u8 timeout;
	u8 retry_cnt;
	u8 rnr_retry;
};

struct dr_qp_init_attr {
	u32 cqn;
	u32 pdn;
	u32 max_send_wr;
	struct mlx5_uars_page *uar;
};

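/* Parse a single CQE from the send ring CQ. On a successful send
 * completion (and on a requester error) the SQ consumer counter is
 * advanced past the WQE identified by the CQE's wqe_counter, so every
 * WQE posted up to that point is considered completed.
 */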
static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
	unsigned int idx;
	u8 opcode;

	opcode = get_cqe_opcode(cqe64);
	if (opcode == MLX5_CQE_REQ_ERR) {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
	} else if (opcode == MLX5_CQE_RESP_ERR) {
		++dr_cq->qp->sq.cc;
	} else {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;

		return CQ_OK;
	}

	return CQ_POLL_ERR;
}

static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
{
	struct mlx5_cqe64 *cqe64;
	int err;

	cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
	if (!cqe64)
		return CQ_EMPTY;

	mlx5_cqwq_pop(&dr_cq->wq);
	err = dr_parse_cqe(dr_cq, cqe64);
	mlx5_cqwq_update_db_record(&dr_cq->wq);

	return err;
}

static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
{
	int npolled;
	int err = 0;

	for (npolled = 0; npolled < ne; ++npolled) {
		err = dr_cq_poll_one(dr_cq);
		if (err != CQ_OK)
			break;
	}

	return err == CQ_POLL_ERR ? err : npolled;
}

static void dr_qp_event(struct mlx5_core_qp *mqp, int event)
{
	pr_info("DR QP event %u on QP #%u\n", event, mqp->qpn);
}

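/* Create the RC QP used by SW steering to write/read ICM memory.
 * The QP is backed by a cyclic work queue; sq.wqe_head[] records, per
 * SQ slot, the producer counter value at post time so completions can
 * be mapped back to the corresponding outstanding WQEs.
 */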
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
					 struct dr_qp_init_attr *attr)
{
	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
	struct mlx5_wq_param wqp;
	struct mlx5dr_qp *dr_qp;
	int inlen;
	void *qpc;
	void *in;
	int err;

	dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
	if (!dr_qp)
		return NULL;

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	dr_qp->rq.pc = 0;
	dr_qp->rq.cc = 0;
	dr_qp->rq.wqe_cnt = 4;
	dr_qp->sq.pc = 0;
	dr_qp->sq.cc = 0;
	dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);

	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
				&dr_qp->wq_ctrl);
	if (err) {
		mlx5_core_warn(mdev, "Can't create QP WQ\n");
		goto err_wq;
	}

	dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
				     sizeof(dr_qp->sq.wqe_head[0]),
				     GFP_KERNEL);

	if (!dr_qp->sq.wqe_head) {
		mlx5_core_warn(mdev, "Can't allocate wqe head\n");
		goto err_wqe_head;
	}

	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		dr_qp->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, attr->pdn);
	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, fre, 1);
	MLX5_SET(qpc, qpc, rlky, 1);
	MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
	MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
							 in, pas));

	err = mlx5_core_create_qp(mdev, &dr_qp->mqp, in, inlen);
	kvfree(in);

	if (err) {
		mlx5_core_warn(mdev, "Can't create QP\n");
		goto err_in;
	}
	dr_qp->mqp.event = dr_qp_event;
	dr_qp->uar = attr->uar;

	return dr_qp;

err_in:
	kfree(dr_qp->sq.wqe_head);
err_wqe_head:
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
	kfree(dr_qp);
	return NULL;
}

static void dr_destroy_qp(struct mlx5_core_dev *mdev,
			  struct mlx5dr_qp *dr_qp)
{
	mlx5_core_destroy_qp(mdev, &dr_qp->mqp);
	kfree(dr_qp->sq.wqe_head);
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
	kfree(dr_qp);
}

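/* Update the SQ doorbell record and write the control segment to the
 * UAR (BlueFlame) register so the HW starts processing the new WQE.
 */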
static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
	dma_wmb();
	*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xfffff);

	/* Make sure the doorbell record is written before ringing the doorbell */
	wmb();

	mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}

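/* Build a single RDMA WRITE/READ WQE: a control segment followed by a
 * remote-address segment and one data segment, all sized in 16-byte
 * units. The WQE is placed at the current SQ producer index; the HW is
 * notified only when nreq is set, i.e. on the last WQE of a batch.
 */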
static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
			     u32 rkey, struct dr_data_seg *data_seg,
			     u32 opcode, int nreq)
{
	struct mlx5_wqe_raddr_seg *wq_raddr;
	struct mlx5_wqe_ctrl_seg *wq_ctrl;
	struct mlx5_wqe_data_seg *wq_dseg;
	unsigned int size;
	unsigned int idx;

	size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
		sizeof(*wq_raddr) / 16;

	idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);

	wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
	wq_ctrl->imm = 0;
	wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
		MLX5_WQE_CTRL_CQ_UPDATE : 0;
	wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
						opcode);
	wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->mqp.qpn << 8);
	wq_raddr = (void *)(wq_ctrl + 1);
	wq_raddr->raddr = cpu_to_be64(remote_addr);
	wq_raddr->rkey = cpu_to_be32(rkey);
	wq_raddr->reserved = 0;

	wq_dseg = (void *)(wq_raddr + 1);
	wq_dseg->byte_count = cpu_to_be32(data_seg->length);
	wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
	wq_dseg->addr = cpu_to_be64(data_seg->addr);

	dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;

	if (nreq)
		dr_cmd_notify_hw(dr_qp, wq_ctrl);
}

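/* Post the write/read WQE pair for one send_info: the RDMA WRITE pushes
 * the data to the remote (ICM) address, and the RDMA READ posted right
 * after it reads the same range back. Only the second WQE rings the
 * doorbell, so both are handed to the HW together.
 */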
static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->write, MLX5_OPCODE_RDMA_WRITE, 0);
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->read, MLX5_OPCODE_RDMA_READ, 1);
}

/**
 * mlx5dr_send_fill_and_append_ste_send_info - Queue STE data to be sent
 *
 *     @ste:       The STE this data is attached to
 *     @size:      Number of bytes of data to write
 *     @offset:    Offset of the data from the start of the hw_ste entry
 *     @data:      The data to write
 *     @ste_info:  The ste_info descriptor that is appended to send_list
 *     @send_list: The list to append ste_info to
 *     @copy_data: If true, the data is copied into ste_info because it
 *                 is not backed up anywhere else (e.g. during re-hash).
 *                 If false, only a pointer is kept, so the data may
 *                 still be updated after it was added to the list.
 */
void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
					       u16 offset, u8 *data,
					       struct mlx5dr_ste_send_info *ste_info,
					       struct list_head *send_list,
					       bool copy_data)
{
	ste_info->size = size;
	ste_info->ste = ste;
	ste_info->offset = offset;

	if (copy_data) {
		memcpy(ste_info->data_cont, data, size);
		ste_info->data = ste_info->data_cont;
	} else {
		ste_info->data = data;
	}

	list_add_tail(&ste_info->send_list, send_list);
}

/* The function polls for at most one completion per call. If the number
 * of pending WQEs indicates that the HW is a full queue length behind
 * the SW, the CQ is drained until no WQEs are pending.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
				struct mlx5dr_send_ring *send_ring)
{
	bool is_drain = false;
	int ne;

	if (send_ring->pending_wqe < send_ring->signal_th)
		return 0;

	/* Queue is full, start draining it */
	if (send_ring->pending_wqe >=
	    dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
		is_drain = true;

	do {
		ne = dr_poll_cq(send_ring->cq, 1);
		if (ne < 0)
			return ne;
		else if (ne == 1)
			send_ring->pending_wqe -= send_ring->signal_th;
	} while (is_drain && send_ring->pending_wqe);

	return 0;
}

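/* Account for the two WQEs (write + read) that each post consumes and
 * request a signaled completion once every signal_th WQEs; completions
 * are generated only for the WQEs that request them.
 */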
static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
			      struct postsend_info *send_info)
{
	send_ring->pending_wqe++;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->write.send_flags |= IB_SEND_SIGNALED;

	send_ring->pending_wqe++;
	send_info->read.length = send_info->write.length;
	/* Read into the same write area */
	send_info->read.addr = (uintptr_t)send_info->write.addr;
	send_info->read.lkey = send_ring->mr->mkey.key;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->read.send_flags = IB_SEND_SIGNALED;
	else
		send_info->read.send_flags = 0;
}

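/* Post one write/read pair to ICM. Payloads larger than the max inline
 * size are first copied into the registered ring buffer, at a slot
 * derived from tx_head, so the HW can DMA them from a mapped address.
 */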
static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
				struct postsend_info *send_info)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	u32 buff_offset;
	int ret;

	ret = dr_handle_pending_wc(dmn, send_ring);
	if (ret)
		return ret;

	if (send_info->write.length > dmn->info.max_inline_size) {
		buff_offset = (send_ring->tx_head &
			       (dmn->send_ring->signal_th - 1)) *
			send_ring->max_post_send_size;
		/* Copy to ring mr */
		memcpy(send_ring->buf + buff_offset,
		       (void *)(uintptr_t)send_info->write.addr,
		       send_info->write.length);
		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
		send_info->write.lkey = send_ring->mr->mkey.key;
	}

	send_ring->tx_head++;
	dr_fill_data_segs(send_ring, send_info);
	dr_post_send(send_ring->qp, send_info);

	return 0;
}

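/* Split a hash table copy into iterations: tables larger than the max
 * post size are sent in max-sized chunks, smaller tables in a single
 * post. Also allocates the staging buffer used to build the STEs.
 */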
static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
				   struct mlx5dr_ste_htbl *htbl,
				   u8 **data,
				   u32 *byte_size,
				   int *iterations,
				   int *num_stes)
{
	int alloc_size;

	if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
		*iterations = htbl->chunk->byte_size /
			dmn->send_ring->max_post_send_size;
		*byte_size = dmn->send_ring->max_post_send_size;
		alloc_size = *byte_size;
		*num_stes = *byte_size / DR_STE_SIZE;
	} else {
		*iterations = 1;
		*num_stes = htbl->chunk->num_of_entries;
		alloc_size = *num_stes * DR_STE_SIZE;
	}

	*data = kzalloc(alloc_size, GFP_KERNEL);
	if (!*data)
		return -ENOMEM;

	return 0;
}

/**
 * mlx5dr_send_postsend_ste - Write size bytes at offset into the HW ICM.
 *
 *     @dmn:    Domain
 *     @ste:    The STE that owns the ICM area being written (the data
 *              belongs, at least in part, to this STE)
 *     @data:   The data to write
 *     @size:   Number of bytes to write
 *     @offset: Offset from the STE's ICM-mapped address at which to
 *              start writing; allows writing only part of the buffer.
 *
 * Return: 0 on success.
 */
int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
			     u8 *data, u16 size, u16 offset)
{
	struct postsend_info send_info = {};

	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = size;
	send_info.write.lkey = 0;
	send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
	send_info.rkey = ste->htbl->chunk->rkey;

	return dr_postsend_icm_data(dmn, &send_info);
}

int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
			      struct mlx5dr_ste_htbl *htbl,
			      u8 *formatted_ste, u8 *mask)
{
	u32 byte_size = htbl->chunk->byte_size;
	int num_stes_per_iter;
	int iterations;
	u8 *data;
	int ret;
	int i;
	int j;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes_per_iter);
	if (ret)
		return ret;

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		/* Copy all the STEs into the data buffer; valid entries
		 * also need the bit_mask appended
		 */
		for (j = 0; j < num_stes_per_iter; j++) {
			u8 *hw_ste = htbl->ste_arr[ste_index + j].hw_ste;
			u32 ste_off = j * DR_STE_SIZE;

			if (mlx5dr_ste_is_not_valid_entry(hw_ste)) {
				memcpy(data + ste_off,
				       formatted_ste, DR_STE_SIZE);
			} else {
				/* Copy data */
				memcpy(data + ste_off,
				       htbl->ste_arr[ste_index + j].hw_ste,
				       DR_STE_SIZE_REDUCED);
				/* Copy bit_mask */
				memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
				       mask, DR_STE_SIZE_MASK);
			}
		}

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kfree(data);
	return ret;
}

/* Initialize htbl with default STEs */
int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
					struct mlx5dr_ste_htbl *htbl,
					u8 *ste_init_data,
					bool update_hw_ste)
{
	u32 byte_size = htbl->chunk->byte_size;
	int iterations;
	int num_stes;
	u8 *data;
	int ret;
	int i;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes);
	if (ret)
		return ret;

	for (i = 0; i < num_stes; i++) {
		u8 *copy_dst;

		/* Copy the same STE into each slot of the data buffer */
		copy_dst = data + i * DR_STE_SIZE;
		memcpy(copy_dst, ste_init_data, DR_STE_SIZE);

		if (update_hw_ste) {
			/* Copy the reduced STE to the hash table ste_arr */
			copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
			memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
		}
	}

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kfree(data);
	return ret;
}

int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
				struct mlx5dr_action *action)
{
	struct postsend_info send_info = {};
	int ret;

	send_info.write.addr = (uintptr_t)action->rewrite.data;
	send_info.write.length = action->rewrite.chunk->byte_size;
	send_info.write.lkey = 0;
	send_info.remote_addr = action->rewrite.chunk->mr_addr;
	send_info.rkey = action->rewrite.chunk->rkey;

	mutex_lock(&dmn->mutex);
	ret = dr_postsend_icm_data(dmn, &send_info);
	mutex_unlock(&dmn->mutex);

	return ret;
}

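/* Move the QP from RESET to INIT on the given vhca port, enabling
 * remote read and write access.
 */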
static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
				 struct mlx5dr_qp *dr_qp,
				 int port)
{
	u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);

	return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, qpc,
				   &dr_qp->mqp);
}

static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
				    struct mlx5dr_qp *dr_qp,
				    struct dr_qp_rts_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);

	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->mqp.qpn);

	MLX5_SET(qpc, qpc, log_ack_req_freq, 0);
	MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
	MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);

	return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, 0, qpc,
				   &dr_qp->mqp);
}

static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
				     struct mlx5dr_qp *dr_qp,
				     struct dr_qp_rtr_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);

	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->mqp.qpn);

	MLX5_SET(qpc, qpc, mtu, attr->mtu);
	MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
	MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
	       attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
	       attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
	MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
		 attr->sgid_index);

	if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
		MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
			 attr->udp_src_port);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);

	return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, qpc,
				   &dr_qp->mqp);
}

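/* Walk the QP through RESET -> INIT -> RTR -> RTS. The remote QPN is
 * the QP's own number and the DGID is the local port GID, so the QP is
 * effectively looped back to itself for the RDMA write/read traffic.
 */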
static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
	struct dr_qp_rts_attr rts_attr = {};
	struct dr_qp_rtr_attr rtr_attr = {};
	enum ib_mtu mtu = IB_MTU_1024;
	u16 gid_index = 0;
	int port = 1;
	int ret;

	/* Init */
	ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rst2init\n");
		return ret;
	}

	/* RTR */
	ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index, &rtr_attr.dgid_attr);
	if (ret)
		return ret;

	rtr_attr.mtu		= mtu;
	rtr_attr.qp_num		= dr_qp->mqp.qpn;
	rtr_attr.min_rnr_timer	= 12;
	rtr_attr.port_num	= port;
	rtr_attr.sgid_index	= gid_index;
	rtr_attr.udp_src_port	= dmn->info.caps.roce_min_src_udp;

	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
		return ret;
	}

	/* RTS */
	rts_attr.timeout	= 14;
	rts_attr.retry_cnt	= 7;
	rts_attr.rnr_retry	= 7;

	ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
		return ret;
	}

	return 0;
}

static void dr_cq_event(struct mlx5_core_cq *mcq,
			enum mlx5_event event)
{
	pr_info("CQ event %u on CQ #%u\n", event, mcq->cqn);
}

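/* Create the completion queue used by the send ring. CQEs are
 * initialized as invalid and HW-owned; completions are consumed by
 * polling (see dr_poll_cq).
 */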
static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
				      struct mlx5_uars_page *uar,
				      size_t ncqe)
{
	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_wq_param wqp;
	struct mlx5_cqe64 *cqe;
	struct mlx5dr_cq *cq;
	int inlen, err, eqn;
	unsigned int irqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;
	u32 i;

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		return NULL;

	ncqe = roundup_pow_of_two(ncqe);
	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
			       &cq->wq_ctrl);
	if (err)
		goto out;

	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
		cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
	}

	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		sizeof(u64) * cq->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		goto err_cqwq;

	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn);
	if (err) {
		kvfree(in);
		goto err_cqwq;
	}

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn, eqn);
	MLX5_SET(cqc, cqc, uar_page, uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);

	cq->mcq.event = dr_cq_event;

	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	kvfree(in);

	if (err)
		goto err_cqwq;

	cq->mcq.cqe_sz = 64;
	cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
	cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
	*cq->mcq.set_ci_db = 0;
	*cq->mcq.arm_db = 0;
	cq->mcq.vector = 0;
	cq->mcq.irqn = irqn;
	cq->mcq.uar = uar;

	return cq;

err_cqwq:
	mlx5_wq_destroy(&cq->wq_ctrl);
out:
	kfree(cq);
	return NULL;
}

static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_wq_destroy(&cq->wq_ctrl);
	kfree(cq);
}

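/* Create a physical-address (PA) mkey on the given PD with local and
 * remote read/write access over the whole address space (length64), so
 * the send ring buffers can be referenced directly by DMA address.
 */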
static int
dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
{
	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
}

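/* DMA-map a kernel buffer and create an mkey for it, so it can be used
 * as the local source/target of the RDMA write/read WQEs.
 */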
static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
				   u32 pdn, void *buf, size_t size)
{
	struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	struct device *dma_device;
	dma_addr_t dma_addr;
	int err;

	if (!mr)
		return NULL;

	dma_device = &mdev->pdev->dev;
	dma_addr = dma_map_single(dma_device, buf, size,
				  DMA_BIDIRECTIONAL);
	err = dma_mapping_error(dma_device, dma_addr);
	if (err) {
		mlx5_core_warn(mdev, "Can't dma buf\n");
		kfree(mr);
		return NULL;
	}

	err = dr_create_mkey(mdev, pdn, &mr->mkey);
	if (err) {
		mlx5_core_warn(mdev, "Can't create mkey\n");
		dma_unmap_single(dma_device, dma_addr, size,
				 DMA_BIDIRECTIONAL);
		kfree(mr);
		return NULL;
	}

	mr->dma_addr = dma_addr;
	mr->size = size;
	mr->addr = buf;

	return mr;
}

static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
{
	mlx5_core_destroy_mkey(mdev, &mr->mkey);
	dma_unmap_single(&mdev->pdev->dev, mr->dma_addr, mr->size,
			 DMA_BIDIRECTIONAL);
	kfree(mr);
}

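/* Allocate the domain's send ring: a CQ and an RC QP sized by
 * QUEUE_SIZE, a signaling threshold of max_send_wr / SIGNAL_PER_DIV_QUEUE
 * WQEs, a staging buffer large enough for signal_th maximum-sized posts,
 * and the MRs backing the staging and sync buffers.
 */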
int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
{
	struct dr_qp_init_attr init_attr = {};
	int cq_size;
	int size;
	int ret;

	dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
	if (!dmn->send_ring)
		return -ENOMEM;

	cq_size = QUEUE_SIZE + 1;
	dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
	if (!dmn->send_ring->cq) {
		mlx5dr_err(dmn, "Failed creating CQ\n");
		ret = -ENOMEM;
		goto free_send_ring;
	}

	init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
	init_attr.pdn = dmn->pdn;
	init_attr.uar = dmn->uar;
	init_attr.max_send_wr = QUEUE_SIZE;

	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
	if (!dmn->send_ring->qp) {
		mlx5dr_err(dmn, "Failed creating QP\n");
		ret = -ENOMEM;
		goto clean_cq;
	}

	dmn->send_ring->cq->qp = dmn->send_ring->qp;

	dmn->info.max_send_wr = QUEUE_SIZE;
	dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
					DR_STE_SIZE);

	dmn->send_ring->signal_th = dmn->info.max_send_wr /
		SIGNAL_PER_DIV_QUEUE;

	/* Prepare the QP for use */
	ret = dr_prepare_qp_to_rts(dmn);
	if (ret)
		goto clean_qp;

	dmn->send_ring->max_post_send_size =
		mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
						   DR_ICM_TYPE_STE);

	/* Allocate a staging buffer large enough for signal_th max-sized posts */
	size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
	dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
	if (!dmn->send_ring->buf) {
		ret = -ENOMEM;
		goto clean_qp;
	}

	dmn->send_ring->buf_size = size;

	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
				       dmn->pdn, dmn->send_ring->buf, size);
	if (!dmn->send_ring->mr) {
		ret = -ENOMEM;
		goto free_mem;
	}

	dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
					    dmn->pdn, dmn->send_ring->sync_buff,
					    MIN_READ_SYNC);
	if (!dmn->send_ring->sync_mr) {
		ret = -ENOMEM;
		goto clean_mr;
	}

	return 0;

clean_mr:
	dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
free_mem:
	kfree(dmn->send_ring->buf);
clean_qp:
	dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
clean_cq:
	dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
free_send_ring:
	kfree(dmn->send_ring);

	return ret;
}

void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
			   struct mlx5dr_send_ring *send_ring)
{
	dr_destroy_qp(dmn->mdev, send_ring->qp);
	dr_destroy_cq(dmn->mdev, send_ring->cq);
	dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
	dr_dereg_mr(dmn->mdev, send_ring->mr);
	kfree(send_ring->buf);
	kfree(send_ring);
}

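/* Force all outstanding WQEs to complete by posting enough dummy
 * write/read requests against the sync MR to cross the drain threshold,
 * then reaping the resulting completions.
 */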
int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	struct postsend_info send_info = {};
	u8 data[DR_STE_SIZE];
	int num_of_sends_req;
	int ret;
	int i;

	/* Sending this many requests guarantees the queue gets drained */
	num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;

	/* Send dummy requests, forcing the last one to be signaled */
	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = DR_STE_SIZE;
	send_info.write.lkey = 0;
	/* Use the sync_mr as the remote area to write to and read back from */
	send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
	send_info.rkey = send_ring->sync_mr->mkey.key;

	for (i = 0; i < num_of_sends_req; i++) {
		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			return ret;
	}

	ret = dr_handle_pending_wc(dmn, send_ring);

	return ret;
}