xref: /openbmc/linux/drivers/infiniband/hw/efa/efa_verbs.c (revision 7d9326f10cdd9028b4460ccc4006d4d138996b6d)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
4  */
5 
6 #include <linux/dma-buf.h>
7 #include <linux/dma-resv.h>
8 #include <linux/vmalloc.h>
9 #include <linux/log2.h>
10 
11 #include <rdma/ib_addr.h>
12 #include <rdma/ib_umem.h>
13 #include <rdma/ib_user_verbs.h>
14 #include <rdma/ib_verbs.h>
15 #include <rdma/uverbs_ioctl.h>
16 
17 #include "efa.h"
18 #include "efa_io_defs.h"
19 
/*
 * Mapping kinds recorded in efa_user_mmap_entry.mmap_flag.
 *
 * In this file EFA_MMAP_DMA_PAGE tags entries whose address comes from
 * virt_to_phys() on a driver-allocated queue buffer, while the two IO kinds
 * tag entries whose address is derived from dev->mem_bar_addr (WC) or
 * dev->db_bar_addr (NC). WC/NC presumably stand for write-combined and
 * non-cached - confirm against the mmap handler.
 */
enum {
	EFA_MMAP_DMA_PAGE	= 0,
	EFA_MMAP_IO_WC		= 1,
	EFA_MMAP_IO_NC		= 2,
};
25 
/*
 * Bitmask combining the fatal-error, warning, notification and keep-alive
 * AENQ groups.
 */
#define EFA_AENQ_ENABLED_GROUPS			\
	(BIT(EFA_ADMIN_FATAL_ERROR) |		\
	 BIT(EFA_ADMIN_WARNING) |		\
	 BIT(EFA_ADMIN_NOTIFICATION) |		\
	 BIT(EFA_ADMIN_KEEP_ALIVE))
29 
30 struct efa_user_mmap_entry {
31 	struct rdma_user_mmap_entry rdma_entry;
32 	u64 address;
33 	u8 mmap_flag;
34 };
35 
/*
 * X-macro listing the device-wide statistics.
 *
 * @op is invoked once per counter as op(<enum identifier>, <name string>).
 * The order of the entries defines both the enumerator values and the
 * descriptor array slots generated from this list.
 */
#define EFA_DEFINE_DEVICE_STATS(op)					\
	op(EFA_SUBMITTED_CMDS,		"submitted_cmds")		\
	op(EFA_COMPLETED_CMDS,		"completed_cmds")		\
	op(EFA_CMDS_ERR,		"cmds_err")			\
	op(EFA_NO_COMPLETION_CMDS,	"no_completion_cmds")		\
	op(EFA_KEEP_ALIVE_RCVD,		"keep_alive_rcvd")		\
	op(EFA_ALLOC_PD_ERR,		"alloc_pd_err")			\
	op(EFA_CREATE_QP_ERR,		"create_qp_err")		\
	op(EFA_CREATE_CQ_ERR,		"create_cq_err")		\
	op(EFA_REG_MR_ERR,		"reg_mr_err")			\
	op(EFA_ALLOC_UCONTEXT_ERR,	"alloc_ucontext_err")		\
	op(EFA_CREATE_AH_ERR,		"create_ah_err")		\
	op(EFA_MMAP_ERR,		"mmap_err")
49 
/*
 * X-macro listing the per-port statistics.
 *
 * @op is invoked once per counter as op(<enum identifier>, <name string>).
 * The order of the entries defines both the enumerator values and the
 * descriptor array slots generated from this list.
 *
 * The final entry deliberately has no line continuation, so the macro ends
 * with the list and cannot absorb whatever line follows it.
 */
#define EFA_DEFINE_PORT_STATS(op)					\
	op(EFA_TX_BYTES,		"tx_bytes")			\
	op(EFA_TX_PKTS,			"tx_pkts")			\
	op(EFA_RX_BYTES,		"rx_bytes")			\
	op(EFA_RX_PKTS,			"rx_pkts")			\
	op(EFA_RX_DROPS,		"rx_drops")			\
	op(EFA_SEND_BYTES,		"send_bytes")			\
	op(EFA_SEND_WRS,		"send_wrs")			\
	op(EFA_RECV_BYTES,		"recv_bytes")			\
	op(EFA_RECV_WRS,		"recv_wrs")			\
	op(EFA_RDMA_READ_WRS,		"rdma_read_wrs")		\
	op(EFA_RDMA_READ_BYTES,		"rdma_read_bytes")		\
	op(EFA_RDMA_READ_WR_ERR,	"rdma_read_wr_err")		\
	op(EFA_RDMA_READ_RESP_BYTES,	"rdma_read_resp_bytes")
64 
/* Expand one stats list entry into an enumerator. */
#define EFA_STATS_ENUM(ename, name)	ename,

/*
 * Expand one stats list entry into a designated initializer that sets the
 * .name member of array slot @ename. The second parameter is spelled "nam"
 * so the preprocessor does not substitute it into the ".name" designator.
 */
#define EFA_STATS_STR(ename, nam)	[ename].name = nam,
68 
69 enum efa_hw_device_stats {
70 	EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM)
71 };
72 
73 static const struct rdma_stat_desc efa_device_stats_descs[] = {
74 	EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR)
75 };
76 
77 enum efa_hw_port_stats {
78 	EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM)
79 };
80 
81 static const struct rdma_stat_desc efa_port_stats_descs[] = {
82 	EFA_DEFINE_PORT_STATS(EFA_STATS_STR)
83 };
84 
/* Size of the payload addressed by one chunk pointer: 2^12 bytes. */
#define EFA_CHUNK_PAYLOAD_SHIFT		12
#define EFA_CHUNK_PAYLOAD_SIZE		BIT(EFA_CHUNK_PAYLOAD_SHIFT)
/* Size in bytes of a single payload pointer stored inside a chunk. */
#define EFA_CHUNK_PAYLOAD_PTR_SIZE	8

/* Size of a chunk itself: 2^12 bytes. */
#define EFA_CHUNK_SHIFT			12
#define EFA_CHUNK_SIZE			BIT(EFA_CHUNK_SHIFT)
/* Room taken inside each chunk by one struct efa_com_ctrl_buff_info. */
#define EFA_CHUNK_PTR_SIZE		sizeof(struct efa_com_ctrl_buff_info)

/*
 * Number of payload pointers that fit in a chunk once room for one
 * struct efa_com_ctrl_buff_info has been set aside.
 */
#define EFA_PTRS_PER_CHUNK \
	((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE)

/*
 * Bytes of a full chunk that are actually used: all payload pointers plus
 * the struct efa_com_ctrl_buff_info.
 */
#define EFA_CHUNK_USED_SIZE \
	((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE)
98 
99 struct pbl_chunk {
100 	dma_addr_t dma_addr;
101 	u64 *buf;
102 	u32 length;
103 };
104 
/* A collection of pbl_chunk elements. */
struct pbl_chunk_list {
	/* Pointer to the chunk elements. */
	struct pbl_chunk *chunks;
	/* Presumably the number of elements in @chunks - confirm at the users. */
	unsigned int size;
};
109 
110 struct pbl_context {
111 	union {
112 		struct {
113 			dma_addr_t dma_addr;
114 		} continuous;
115 		struct {
116 			u32 pbl_buf_size_in_pages;
117 			struct scatterlist *sgl;
118 			int sg_dma_cnt;
119 			struct pbl_chunk_list chunk_list;
120 		} indirect;
121 	} phys;
122 	u64 *pbl_buf;
123 	u32 pbl_buf_size_in_bytes;
124 	u8 physically_continuous;
125 };
126 
127 static inline struct efa_dev *to_edev(struct ib_device *ibdev)
128 {
129 	return container_of(ibdev, struct efa_dev, ibdev);
130 }
131 
132 static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext)
133 {
134 	return container_of(ibucontext, struct efa_ucontext, ibucontext);
135 }
136 
137 static inline struct efa_pd *to_epd(struct ib_pd *ibpd)
138 {
139 	return container_of(ibpd, struct efa_pd, ibpd);
140 }
141 
142 static inline struct efa_mr *to_emr(struct ib_mr *ibmr)
143 {
144 	return container_of(ibmr, struct efa_mr, ibmr);
145 }
146 
147 static inline struct efa_qp *to_eqp(struct ib_qp *ibqp)
148 {
149 	return container_of(ibqp, struct efa_qp, ibqp);
150 }
151 
152 static inline struct efa_cq *to_ecq(struct ib_cq *ibcq)
153 {
154 	return container_of(ibcq, struct efa_cq, ibcq);
155 }
156 
157 static inline struct efa_ah *to_eah(struct ib_ah *ibah)
158 {
159 	return container_of(ibah, struct efa_ah, ibah);
160 }
161 
162 static inline struct efa_user_mmap_entry *
163 to_emmap(struct rdma_user_mmap_entry *rdma_entry)
164 {
165 	return container_of(rdma_entry, struct efa_user_mmap_entry, rdma_entry);
166 }
167 
/*
 * Test a device capability bit.
 *
 * @cap is the bare capability name; it is pasted into
 * EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_<cap>_MASK and checked against
 * (dev)->dev_attr.device_caps. The result is the masked value (non-zero
 * when the bit is set), not a normalised 0/1.
 */
#define EFA_DEV_CAP(dev, cap)						\
	((dev)->dev_attr.device_caps &					\
	 EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_##cap##_MASK)
171 
/*
 * True when every byte of @reserved is zero.
 *
 * @reserved must be an array (not a pointer): its extent is taken with
 * sizeof(). The expansion is parenthesised so it behaves as a single
 * expression wherever it is used.
 */
#define is_reserved_cleared(reserved) \
	(!memchr_inv(reserved, 0, sizeof(reserved)))
174 
175 static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr,
176 			       size_t size, enum dma_data_direction dir)
177 {
178 	void *addr;
179 
180 	addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
181 	if (!addr)
182 		return NULL;
183 
184 	*dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir);
185 	if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) {
186 		ibdev_err(&dev->ibdev, "Failed to map DMA address\n");
187 		free_pages_exact(addr, size);
188 		return NULL;
189 	}
190 
191 	return addr;
192 }
193 
194 static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr,
195 			    dma_addr_t dma_addr,
196 			    size_t size, enum dma_data_direction dir)
197 {
198 	dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir);
199 	free_pages_exact(cpu_addr, size);
200 }
201 
202 int efa_query_device(struct ib_device *ibdev,
203 		     struct ib_device_attr *props,
204 		     struct ib_udata *udata)
205 {
206 	struct efa_com_get_device_attr_result *dev_attr;
207 	struct efa_ibv_ex_query_device_resp resp = {};
208 	struct efa_dev *dev = to_edev(ibdev);
209 	int err;
210 
211 	if (udata && udata->inlen &&
212 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
213 		ibdev_dbg(ibdev,
214 			  "Incompatible ABI params, udata not cleared\n");
215 		return -EINVAL;
216 	}
217 
218 	dev_attr = &dev->dev_attr;
219 
220 	memset(props, 0, sizeof(*props));
221 	props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE;
222 	props->page_size_cap = dev_attr->page_size_cap;
223 	props->vendor_id = dev->pdev->vendor;
224 	props->vendor_part_id = dev->pdev->device;
225 	props->hw_ver = dev->pdev->subsystem_device;
226 	props->max_qp = dev_attr->max_qp;
227 	props->max_cq = dev_attr->max_cq;
228 	props->max_pd = dev_attr->max_pd;
229 	props->max_mr = dev_attr->max_mr;
230 	props->max_ah = dev_attr->max_ah;
231 	props->max_cqe = dev_attr->max_cq_depth;
232 	props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth,
233 				 dev_attr->max_rq_depth);
234 	props->max_send_sge = dev_attr->max_sq_sge;
235 	props->max_recv_sge = dev_attr->max_rq_sge;
236 	props->max_sge_rd = dev_attr->max_wr_rdma_sge;
237 	props->max_pkeys = 1;
238 
239 	if (udata && udata->outlen) {
240 		resp.max_sq_sge = dev_attr->max_sq_sge;
241 		resp.max_rq_sge = dev_attr->max_rq_sge;
242 		resp.max_sq_wr = dev_attr->max_sq_depth;
243 		resp.max_rq_wr = dev_attr->max_rq_depth;
244 		resp.max_rdma_size = dev_attr->max_rdma_size;
245 
246 		resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID;
247 		if (EFA_DEV_CAP(dev, RDMA_READ))
248 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ;
249 
250 		if (EFA_DEV_CAP(dev, RNR_RETRY))
251 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY;
252 
253 		if (EFA_DEV_CAP(dev, DATA_POLLING_128))
254 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128;
255 
256 		if (EFA_DEV_CAP(dev, RDMA_WRITE))
257 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE;
258 
259 		if (dev->neqs)
260 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS;
261 
262 		err = ib_copy_to_udata(udata, &resp,
263 				       min(sizeof(resp), udata->outlen));
264 		if (err) {
265 			ibdev_dbg(ibdev,
266 				  "Failed to copy udata for query_device\n");
267 			return err;
268 		}
269 	}
270 
271 	return 0;
272 }
273 
274 int efa_query_port(struct ib_device *ibdev, u32 port,
275 		   struct ib_port_attr *props)
276 {
277 	struct efa_dev *dev = to_edev(ibdev);
278 
279 	props->lmc = 1;
280 
281 	props->state = IB_PORT_ACTIVE;
282 	props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
283 	props->gid_tbl_len = 1;
284 	props->pkey_tbl_len = 1;
285 	props->active_speed = IB_SPEED_EDR;
286 	props->active_width = IB_WIDTH_4X;
287 	props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu);
288 	props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu);
289 	props->max_msg_sz = dev->dev_attr.mtu;
290 	props->max_vl_num = 1;
291 
292 	return 0;
293 }
294 
295 int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
296 		 int qp_attr_mask,
297 		 struct ib_qp_init_attr *qp_init_attr)
298 {
299 	struct efa_dev *dev = to_edev(ibqp->device);
300 	struct efa_com_query_qp_params params = {};
301 	struct efa_com_query_qp_result result;
302 	struct efa_qp *qp = to_eqp(ibqp);
303 	int err;
304 
305 #define EFA_QUERY_QP_SUPP_MASK \
306 	(IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \
307 	 IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP | IB_QP_RNR_RETRY)
308 
309 	if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) {
310 		ibdev_dbg(&dev->ibdev,
311 			  "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
312 			  qp_attr_mask, EFA_QUERY_QP_SUPP_MASK);
313 		return -EOPNOTSUPP;
314 	}
315 
316 	memset(qp_attr, 0, sizeof(*qp_attr));
317 	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
318 
319 	params.qp_handle = qp->qp_handle;
320 	err = efa_com_query_qp(&dev->edev, &params, &result);
321 	if (err)
322 		return err;
323 
324 	qp_attr->qp_state = result.qp_state;
325 	qp_attr->qkey = result.qkey;
326 	qp_attr->sq_psn = result.sq_psn;
327 	qp_attr->sq_draining = result.sq_draining;
328 	qp_attr->port_num = 1;
329 	qp_attr->rnr_retry = result.rnr_retry;
330 
331 	qp_attr->cap.max_send_wr = qp->max_send_wr;
332 	qp_attr->cap.max_recv_wr = qp->max_recv_wr;
333 	qp_attr->cap.max_send_sge = qp->max_send_sge;
334 	qp_attr->cap.max_recv_sge = qp->max_recv_sge;
335 	qp_attr->cap.max_inline_data = qp->max_inline_data;
336 
337 	qp_init_attr->qp_type = ibqp->qp_type;
338 	qp_init_attr->recv_cq = ibqp->recv_cq;
339 	qp_init_attr->send_cq = ibqp->send_cq;
340 	qp_init_attr->qp_context = ibqp->qp_context;
341 	qp_init_attr->cap = qp_attr->cap;
342 
343 	return 0;
344 }
345 
346 int efa_query_gid(struct ib_device *ibdev, u32 port, int index,
347 		  union ib_gid *gid)
348 {
349 	struct efa_dev *dev = to_edev(ibdev);
350 
351 	memcpy(gid->raw, dev->dev_attr.addr, sizeof(dev->dev_attr.addr));
352 
353 	return 0;
354 }
355 
356 int efa_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
357 		   u16 *pkey)
358 {
359 	if (index > 0)
360 		return -EINVAL;
361 
362 	*pkey = 0xffff;
363 	return 0;
364 }
365 
366 static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn)
367 {
368 	struct efa_com_dealloc_pd_params params = {
369 		.pdn = pdn,
370 	};
371 
372 	return efa_com_dealloc_pd(&dev->edev, &params);
373 }
374 
375 int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
376 {
377 	struct efa_dev *dev = to_edev(ibpd->device);
378 	struct efa_ibv_alloc_pd_resp resp = {};
379 	struct efa_com_alloc_pd_result result;
380 	struct efa_pd *pd = to_epd(ibpd);
381 	int err;
382 
383 	if (udata->inlen &&
384 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
385 		ibdev_dbg(&dev->ibdev,
386 			  "Incompatible ABI params, udata not cleared\n");
387 		err = -EINVAL;
388 		goto err_out;
389 	}
390 
391 	err = efa_com_alloc_pd(&dev->edev, &result);
392 	if (err)
393 		goto err_out;
394 
395 	pd->pdn = result.pdn;
396 	resp.pdn = result.pdn;
397 
398 	if (udata->outlen) {
399 		err = ib_copy_to_udata(udata, &resp,
400 				       min(sizeof(resp), udata->outlen));
401 		if (err) {
402 			ibdev_dbg(&dev->ibdev,
403 				  "Failed to copy udata for alloc_pd\n");
404 			goto err_dealloc_pd;
405 		}
406 	}
407 
408 	ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn);
409 
410 	return 0;
411 
412 err_dealloc_pd:
413 	efa_pd_dealloc(dev, result.pdn);
414 err_out:
415 	atomic64_inc(&dev->stats.alloc_pd_err);
416 	return err;
417 }
418 
419 int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
420 {
421 	struct efa_dev *dev = to_edev(ibpd->device);
422 	struct efa_pd *pd = to_epd(ibpd);
423 
424 	ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn);
425 	efa_pd_dealloc(dev, pd->pdn);
426 	return 0;
427 }
428 
429 static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle)
430 {
431 	struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle };
432 
433 	return efa_com_destroy_qp(&dev->edev, &params);
434 }
435 
436 static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp)
437 {
438 	rdma_user_mmap_entry_remove(qp->rq_mmap_entry);
439 	rdma_user_mmap_entry_remove(qp->rq_db_mmap_entry);
440 	rdma_user_mmap_entry_remove(qp->llq_desc_mmap_entry);
441 	rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry);
442 }
443 
444 int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
445 {
446 	struct efa_dev *dev = to_edev(ibqp->pd->device);
447 	struct efa_qp *qp = to_eqp(ibqp);
448 	int err;
449 
450 	ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num);
451 
452 	efa_qp_user_mmap_entries_remove(qp);
453 
454 	err = efa_destroy_qp_handle(dev, qp->qp_handle);
455 	if (err)
456 		return err;
457 
458 	if (qp->rq_cpu_addr) {
459 		ibdev_dbg(&dev->ibdev,
460 			  "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n",
461 			  qp->rq_cpu_addr, qp->rq_size,
462 			  &qp->rq_dma_addr);
463 		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
464 				qp->rq_size, DMA_TO_DEVICE);
465 	}
466 
467 	return 0;
468 }
469 
470 static struct rdma_user_mmap_entry*
471 efa_user_mmap_entry_insert(struct ib_ucontext *ucontext,
472 			   u64 address, size_t length,
473 			   u8 mmap_flag, u64 *offset)
474 {
475 	struct efa_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
476 	int err;
477 
478 	if (!entry)
479 		return NULL;
480 
481 	entry->address = address;
482 	entry->mmap_flag = mmap_flag;
483 
484 	err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry,
485 					  length);
486 	if (err) {
487 		kfree(entry);
488 		return NULL;
489 	}
490 	*offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
491 
492 	return &entry->rdma_entry;
493 }
494 
495 static int qp_mmap_entries_setup(struct efa_qp *qp,
496 				 struct efa_dev *dev,
497 				 struct efa_ucontext *ucontext,
498 				 struct efa_com_create_qp_params *params,
499 				 struct efa_ibv_create_qp_resp *resp)
500 {
501 	size_t length;
502 	u64 address;
503 
504 	address = dev->db_bar_addr + resp->sq_db_offset;
505 	qp->sq_db_mmap_entry =
506 		efa_user_mmap_entry_insert(&ucontext->ibucontext,
507 					   address,
508 					   PAGE_SIZE, EFA_MMAP_IO_NC,
509 					   &resp->sq_db_mmap_key);
510 	if (!qp->sq_db_mmap_entry)
511 		return -ENOMEM;
512 
513 	resp->sq_db_offset &= ~PAGE_MASK;
514 
515 	address = dev->mem_bar_addr + resp->llq_desc_offset;
516 	length = PAGE_ALIGN(params->sq_ring_size_in_bytes +
517 			    (resp->llq_desc_offset & ~PAGE_MASK));
518 
519 	qp->llq_desc_mmap_entry =
520 		efa_user_mmap_entry_insert(&ucontext->ibucontext,
521 					   address, length,
522 					   EFA_MMAP_IO_WC,
523 					   &resp->llq_desc_mmap_key);
524 	if (!qp->llq_desc_mmap_entry)
525 		goto err_remove_mmap;
526 
527 	resp->llq_desc_offset &= ~PAGE_MASK;
528 
529 	if (qp->rq_size) {
530 		address = dev->db_bar_addr + resp->rq_db_offset;
531 
532 		qp->rq_db_mmap_entry =
533 			efa_user_mmap_entry_insert(&ucontext->ibucontext,
534 						   address, PAGE_SIZE,
535 						   EFA_MMAP_IO_NC,
536 						   &resp->rq_db_mmap_key);
537 		if (!qp->rq_db_mmap_entry)
538 			goto err_remove_mmap;
539 
540 		resp->rq_db_offset &= ~PAGE_MASK;
541 
542 		address = virt_to_phys(qp->rq_cpu_addr);
543 		qp->rq_mmap_entry =
544 			efa_user_mmap_entry_insert(&ucontext->ibucontext,
545 						   address, qp->rq_size,
546 						   EFA_MMAP_DMA_PAGE,
547 						   &resp->rq_mmap_key);
548 		if (!qp->rq_mmap_entry)
549 			goto err_remove_mmap;
550 
551 		resp->rq_mmap_size = qp->rq_size;
552 	}
553 
554 	return 0;
555 
556 err_remove_mmap:
557 	efa_qp_user_mmap_entries_remove(qp);
558 
559 	return -ENOMEM;
560 }
561 
562 static int efa_qp_validate_cap(struct efa_dev *dev,
563 			       struct ib_qp_init_attr *init_attr)
564 {
565 	if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) {
566 		ibdev_dbg(&dev->ibdev,
567 			  "qp: requested send wr[%u] exceeds the max[%u]\n",
568 			  init_attr->cap.max_send_wr,
569 			  dev->dev_attr.max_sq_depth);
570 		return -EINVAL;
571 	}
572 	if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) {
573 		ibdev_dbg(&dev->ibdev,
574 			  "qp: requested receive wr[%u] exceeds the max[%u]\n",
575 			  init_attr->cap.max_recv_wr,
576 			  dev->dev_attr.max_rq_depth);
577 		return -EINVAL;
578 	}
579 	if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) {
580 		ibdev_dbg(&dev->ibdev,
581 			  "qp: requested sge send[%u] exceeds the max[%u]\n",
582 			  init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge);
583 		return -EINVAL;
584 	}
585 	if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) {
586 		ibdev_dbg(&dev->ibdev,
587 			  "qp: requested sge recv[%u] exceeds the max[%u]\n",
588 			  init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge);
589 		return -EINVAL;
590 	}
591 	if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) {
592 		ibdev_dbg(&dev->ibdev,
593 			  "qp: requested inline data[%u] exceeds the max[%u]\n",
594 			  init_attr->cap.max_inline_data,
595 			  dev->dev_attr.inline_buf_size);
596 		return -EINVAL;
597 	}
598 
599 	return 0;
600 }
601 
602 static int efa_qp_validate_attr(struct efa_dev *dev,
603 				struct ib_qp_init_attr *init_attr)
604 {
605 	if (init_attr->qp_type != IB_QPT_DRIVER &&
606 	    init_attr->qp_type != IB_QPT_UD) {
607 		ibdev_dbg(&dev->ibdev,
608 			  "Unsupported qp type %d\n", init_attr->qp_type);
609 		return -EOPNOTSUPP;
610 	}
611 
612 	if (init_attr->srq) {
613 		ibdev_dbg(&dev->ibdev, "SRQ is not supported\n");
614 		return -EOPNOTSUPP;
615 	}
616 
617 	if (init_attr->create_flags) {
618 		ibdev_dbg(&dev->ibdev, "Unsupported create flags\n");
619 		return -EOPNOTSUPP;
620 	}
621 
622 	return 0;
623 }
624 
625 int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
626 		  struct ib_udata *udata)
627 {
628 	struct efa_com_create_qp_params create_qp_params = {};
629 	struct efa_com_create_qp_result create_qp_resp;
630 	struct efa_dev *dev = to_edev(ibqp->device);
631 	struct efa_ibv_create_qp_resp resp = {};
632 	struct efa_ibv_create_qp cmd = {};
633 	struct efa_qp *qp = to_eqp(ibqp);
634 	struct efa_ucontext *ucontext;
635 	int err;
636 
637 	ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext,
638 					     ibucontext);
639 
640 	err = efa_qp_validate_cap(dev, init_attr);
641 	if (err)
642 		goto err_out;
643 
644 	err = efa_qp_validate_attr(dev, init_attr);
645 	if (err)
646 		goto err_out;
647 
648 	if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) {
649 		ibdev_dbg(&dev->ibdev,
650 			  "Incompatible ABI params, no input udata\n");
651 		err = -EINVAL;
652 		goto err_out;
653 	}
654 
655 	if (udata->inlen > sizeof(cmd) &&
656 	    !ib_is_udata_cleared(udata, sizeof(cmd),
657 				 udata->inlen - sizeof(cmd))) {
658 		ibdev_dbg(&dev->ibdev,
659 			  "Incompatible ABI params, unknown fields in udata\n");
660 		err = -EINVAL;
661 		goto err_out;
662 	}
663 
664 	err = ib_copy_from_udata(&cmd, udata,
665 				 min(sizeof(cmd), udata->inlen));
666 	if (err) {
667 		ibdev_dbg(&dev->ibdev,
668 			  "Cannot copy udata for create_qp\n");
669 		goto err_out;
670 	}
671 
672 	if (cmd.comp_mask) {
673 		ibdev_dbg(&dev->ibdev,
674 			  "Incompatible ABI params, unknown fields in udata\n");
675 		err = -EINVAL;
676 		goto err_out;
677 	}
678 
679 	create_qp_params.uarn = ucontext->uarn;
680 	create_qp_params.pd = to_epd(ibqp->pd)->pdn;
681 
682 	if (init_attr->qp_type == IB_QPT_UD) {
683 		create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD;
684 	} else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) {
685 		create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD;
686 	} else {
687 		ibdev_dbg(&dev->ibdev,
688 			  "Unsupported qp type %d driver qp type %d\n",
689 			  init_attr->qp_type, cmd.driver_qp_type);
690 		err = -EOPNOTSUPP;
691 		goto err_out;
692 	}
693 
694 	ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n",
695 		  init_attr->qp_type, cmd.driver_qp_type);
696 	create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx;
697 	create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx;
698 	create_qp_params.sq_depth = init_attr->cap.max_send_wr;
699 	create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size;
700 
701 	create_qp_params.rq_depth = init_attr->cap.max_recv_wr;
702 	create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size;
703 	qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes);
704 	if (qp->rq_size) {
705 		qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr,
706 						    qp->rq_size, DMA_TO_DEVICE);
707 		if (!qp->rq_cpu_addr) {
708 			err = -ENOMEM;
709 			goto err_out;
710 		}
711 
712 		ibdev_dbg(&dev->ibdev,
713 			  "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n",
714 			  qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr);
715 		create_qp_params.rq_base_addr = qp->rq_dma_addr;
716 	}
717 
718 	err = efa_com_create_qp(&dev->edev, &create_qp_params,
719 				&create_qp_resp);
720 	if (err)
721 		goto err_free_mapped;
722 
723 	resp.sq_db_offset = create_qp_resp.sq_db_offset;
724 	resp.rq_db_offset = create_qp_resp.rq_db_offset;
725 	resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset;
726 	resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx;
727 	resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx;
728 
729 	err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params,
730 				    &resp);
731 	if (err)
732 		goto err_destroy_qp;
733 
734 	qp->qp_handle = create_qp_resp.qp_handle;
735 	qp->ibqp.qp_num = create_qp_resp.qp_num;
736 	qp->max_send_wr = init_attr->cap.max_send_wr;
737 	qp->max_recv_wr = init_attr->cap.max_recv_wr;
738 	qp->max_send_sge = init_attr->cap.max_send_sge;
739 	qp->max_recv_sge = init_attr->cap.max_recv_sge;
740 	qp->max_inline_data = init_attr->cap.max_inline_data;
741 
742 	if (udata->outlen) {
743 		err = ib_copy_to_udata(udata, &resp,
744 				       min(sizeof(resp), udata->outlen));
745 		if (err) {
746 			ibdev_dbg(&dev->ibdev,
747 				  "Failed to copy udata for qp[%u]\n",
748 				  create_qp_resp.qp_num);
749 			goto err_remove_mmap_entries;
750 		}
751 	}
752 
753 	ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num);
754 
755 	return 0;
756 
757 err_remove_mmap_entries:
758 	efa_qp_user_mmap_entries_remove(qp);
759 err_destroy_qp:
760 	efa_destroy_qp_handle(dev, create_qp_resp.qp_handle);
761 err_free_mapped:
762 	if (qp->rq_size)
763 		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
764 				qp->rq_size, DMA_TO_DEVICE);
765 err_out:
766 	atomic64_inc(&dev->stats.create_qp_err);
767 	return err;
768 }
769 
770 static const struct {
771 	int			valid;
772 	enum ib_qp_attr_mask	req_param;
773 	enum ib_qp_attr_mask	opt_param;
774 } srd_qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
775 	[IB_QPS_RESET] = {
776 		[IB_QPS_RESET] = { .valid = 1 },
777 		[IB_QPS_INIT]  = {
778 			.valid = 1,
779 			.req_param = IB_QP_PKEY_INDEX |
780 				     IB_QP_PORT |
781 				     IB_QP_QKEY,
782 		},
783 	},
784 	[IB_QPS_INIT] = {
785 		[IB_QPS_RESET] = { .valid = 1 },
786 		[IB_QPS_ERR]   = { .valid = 1 },
787 		[IB_QPS_INIT]  = {
788 			.valid = 1,
789 			.opt_param = IB_QP_PKEY_INDEX |
790 				     IB_QP_PORT |
791 				     IB_QP_QKEY,
792 		},
793 		[IB_QPS_RTR]   = {
794 			.valid = 1,
795 			.opt_param = IB_QP_PKEY_INDEX |
796 				     IB_QP_QKEY,
797 		},
798 	},
799 	[IB_QPS_RTR] = {
800 		[IB_QPS_RESET] = { .valid = 1 },
801 		[IB_QPS_ERR]   = { .valid = 1 },
802 		[IB_QPS_RTS]   = {
803 			.valid = 1,
804 			.req_param = IB_QP_SQ_PSN,
805 			.opt_param = IB_QP_CUR_STATE |
806 				     IB_QP_QKEY |
807 				     IB_QP_RNR_RETRY,
808 
809 		}
810 	},
811 	[IB_QPS_RTS] = {
812 		[IB_QPS_RESET] = { .valid = 1 },
813 		[IB_QPS_ERR]   = { .valid = 1 },
814 		[IB_QPS_RTS]   = {
815 			.valid = 1,
816 			.opt_param = IB_QP_CUR_STATE |
817 				     IB_QP_QKEY,
818 		},
819 		[IB_QPS_SQD] = {
820 			.valid = 1,
821 			.opt_param = IB_QP_EN_SQD_ASYNC_NOTIFY,
822 		},
823 	},
824 	[IB_QPS_SQD] = {
825 		[IB_QPS_RESET] = { .valid = 1 },
826 		[IB_QPS_ERR]   = { .valid = 1 },
827 		[IB_QPS_RTS]   = {
828 			.valid = 1,
829 			.opt_param = IB_QP_CUR_STATE |
830 				     IB_QP_QKEY,
831 		},
832 		[IB_QPS_SQD] = {
833 			.valid = 1,
834 			.opt_param = IB_QP_PKEY_INDEX |
835 				     IB_QP_QKEY,
836 		}
837 	},
838 	[IB_QPS_SQE] = {
839 		[IB_QPS_RESET] = { .valid = 1 },
840 		[IB_QPS_ERR]   = { .valid = 1 },
841 		[IB_QPS_RTS]   = {
842 			.valid = 1,
843 			.opt_param = IB_QP_CUR_STATE |
844 				     IB_QP_QKEY,
845 		}
846 	},
847 	[IB_QPS_ERR] = {
848 		[IB_QPS_RESET] = { .valid = 1 },
849 		[IB_QPS_ERR]   = { .valid = 1 },
850 	}
851 };
852 
853 static bool efa_modify_srd_qp_is_ok(enum ib_qp_state cur_state,
854 				    enum ib_qp_state next_state,
855 				    enum ib_qp_attr_mask mask)
856 {
857 	enum ib_qp_attr_mask req_param, opt_param;
858 
859 	if (mask & IB_QP_CUR_STATE  &&
860 	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
861 	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
862 		return false;
863 
864 	if (!srd_qp_state_table[cur_state][next_state].valid)
865 		return false;
866 
867 	req_param = srd_qp_state_table[cur_state][next_state].req_param;
868 	opt_param = srd_qp_state_table[cur_state][next_state].opt_param;
869 
870 	if ((mask & req_param) != req_param)
871 		return false;
872 
873 	if (mask & ~(req_param | opt_param | IB_QP_STATE))
874 		return false;
875 
876 	return true;
877 }
878 
879 static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp,
880 				  struct ib_qp_attr *qp_attr, int qp_attr_mask,
881 				  enum ib_qp_state cur_state,
882 				  enum ib_qp_state new_state)
883 {
884 	int err;
885 
886 #define EFA_MODIFY_QP_SUPP_MASK \
887 	(IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \
888 	 IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN | \
889 	 IB_QP_RNR_RETRY)
890 
891 	if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) {
892 		ibdev_dbg(&dev->ibdev,
893 			  "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
894 			  qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK);
895 		return -EOPNOTSUPP;
896 	}
897 
898 	if (qp->ibqp.qp_type == IB_QPT_DRIVER)
899 		err = !efa_modify_srd_qp_is_ok(cur_state, new_state,
900 					       qp_attr_mask);
901 	else
902 		err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD,
903 					  qp_attr_mask);
904 
905 	if (err) {
906 		ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n");
907 		return -EINVAL;
908 	}
909 
910 	if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) {
911 		ibdev_dbg(&dev->ibdev, "Can't change port num\n");
912 		return -EOPNOTSUPP;
913 	}
914 
915 	if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) {
916 		ibdev_dbg(&dev->ibdev, "Can't change pkey index\n");
917 		return -EOPNOTSUPP;
918 	}
919 
920 	return 0;
921 }
922 
923 int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
924 		  int qp_attr_mask, struct ib_udata *udata)
925 {
926 	struct efa_dev *dev = to_edev(ibqp->device);
927 	struct efa_com_modify_qp_params params = {};
928 	struct efa_qp *qp = to_eqp(ibqp);
929 	enum ib_qp_state cur_state;
930 	enum ib_qp_state new_state;
931 	int err;
932 
933 	if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
934 		return -EOPNOTSUPP;
935 
936 	if (udata->inlen &&
937 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
938 		ibdev_dbg(&dev->ibdev,
939 			  "Incompatible ABI params, udata not cleared\n");
940 		return -EINVAL;
941 	}
942 
943 	cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state :
944 						     qp->state;
945 	new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state;
946 
947 	err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state,
948 				     new_state);
949 	if (err)
950 		return err;
951 
952 	params.qp_handle = qp->qp_handle;
953 
954 	if (qp_attr_mask & IB_QP_STATE) {
955 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QP_STATE,
956 			1);
957 		EFA_SET(&params.modify_mask,
958 			EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE, 1);
959 		params.cur_qp_state = cur_state;
960 		params.qp_state = new_state;
961 	}
962 
963 	if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
964 		EFA_SET(&params.modify_mask,
965 			EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY, 1);
966 		params.sq_drained_async_notify = qp_attr->en_sqd_async_notify;
967 	}
968 
969 	if (qp_attr_mask & IB_QP_QKEY) {
970 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QKEY, 1);
971 		params.qkey = qp_attr->qkey;
972 	}
973 
974 	if (qp_attr_mask & IB_QP_SQ_PSN) {
975 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN, 1);
976 		params.sq_psn = qp_attr->sq_psn;
977 	}
978 
979 	if (qp_attr_mask & IB_QP_RNR_RETRY) {
980 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY,
981 			1);
982 		params.rnr_retry = qp_attr->rnr_retry;
983 	}
984 
985 	err = efa_com_modify_qp(&dev->edev, &params);
986 	if (err)
987 		return err;
988 
989 	qp->state = new_state;
990 
991 	return 0;
992 }
993 
994 static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx)
995 {
996 	struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx };
997 
998 	return efa_com_destroy_cq(&dev->edev, &params);
999 }
1000 
1001 static void efa_cq_user_mmap_entries_remove(struct efa_cq *cq)
1002 {
1003 	rdma_user_mmap_entry_remove(cq->db_mmap_entry);
1004 	rdma_user_mmap_entry_remove(cq->mmap_entry);
1005 }
1006 
1007 int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
1008 {
1009 	struct efa_dev *dev = to_edev(ibcq->device);
1010 	struct efa_cq *cq = to_ecq(ibcq);
1011 
1012 	ibdev_dbg(&dev->ibdev,
1013 		  "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
1014 		  cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
1015 
1016 	efa_cq_user_mmap_entries_remove(cq);
1017 	efa_destroy_cq_idx(dev, cq->cq_idx);
1018 	if (cq->eq) {
1019 		xa_erase(&dev->cqs_xa, cq->cq_idx);
1020 		synchronize_irq(cq->eq->irq.irqn);
1021 	}
1022 	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
1023 			DMA_FROM_DEVICE);
1024 	return 0;
1025 }
1026 
1027 static struct efa_eq *efa_vec2eq(struct efa_dev *dev, int vec)
1028 {
1029 	return &dev->eqs[vec];
1030 }
1031 
1032 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
1033 				 struct efa_ibv_create_cq_resp *resp,
1034 				 bool db_valid)
1035 {
1036 	resp->q_mmap_size = cq->size;
1037 	cq->mmap_entry = efa_user_mmap_entry_insert(&cq->ucontext->ibucontext,
1038 						    virt_to_phys(cq->cpu_addr),
1039 						    cq->size, EFA_MMAP_DMA_PAGE,
1040 						    &resp->q_mmap_key);
1041 	if (!cq->mmap_entry)
1042 		return -ENOMEM;
1043 
1044 	if (db_valid) {
1045 		cq->db_mmap_entry =
1046 			efa_user_mmap_entry_insert(&cq->ucontext->ibucontext,
1047 						   dev->db_bar_addr + resp->db_off,
1048 						   PAGE_SIZE, EFA_MMAP_IO_NC,
1049 						   &resp->db_mmap_key);
1050 		if (!cq->db_mmap_entry) {
1051 			rdma_user_mmap_entry_remove(cq->mmap_entry);
1052 			return -ENOMEM;
1053 		}
1054 
1055 		resp->db_off &= ~PAGE_MASK;
1056 		resp->comp_mask |= EFA_CREATE_CQ_RESP_DB_OFF;
1057 	}
1058 
1059 	return 0;
1060 }
1061 
1062 int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
1063 		  struct ib_udata *udata)
1064 {
1065 	struct efa_ucontext *ucontext = rdma_udata_to_drv_context(
1066 		udata, struct efa_ucontext, ibucontext);
1067 	struct efa_com_create_cq_params params = {};
1068 	struct efa_ibv_create_cq_resp resp = {};
1069 	struct efa_com_create_cq_result result;
1070 	struct ib_device *ibdev = ibcq->device;
1071 	struct efa_dev *dev = to_edev(ibdev);
1072 	struct efa_ibv_create_cq cmd = {};
1073 	struct efa_cq *cq = to_ecq(ibcq);
1074 	int entries = attr->cqe;
1075 	bool set_src_addr;
1076 	int err;
1077 
1078 	ibdev_dbg(ibdev, "create_cq entries %d\n", entries);
1079 
1080 	if (attr->flags)
1081 		return -EOPNOTSUPP;
1082 
1083 	if (entries < 1 || entries > dev->dev_attr.max_cq_depth) {
1084 		ibdev_dbg(ibdev,
1085 			  "cq: requested entries[%u] non-positive or greater than max[%u]\n",
1086 			  entries, dev->dev_attr.max_cq_depth);
1087 		err = -EINVAL;
1088 		goto err_out;
1089 	}
1090 
1091 	if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) {
1092 		ibdev_dbg(ibdev,
1093 			  "Incompatible ABI params, no input udata\n");
1094 		err = -EINVAL;
1095 		goto err_out;
1096 	}
1097 
1098 	if (udata->inlen > sizeof(cmd) &&
1099 	    !ib_is_udata_cleared(udata, sizeof(cmd),
1100 				 udata->inlen - sizeof(cmd))) {
1101 		ibdev_dbg(ibdev,
1102 			  "Incompatible ABI params, unknown fields in udata\n");
1103 		err = -EINVAL;
1104 		goto err_out;
1105 	}
1106 
1107 	err = ib_copy_from_udata(&cmd, udata,
1108 				 min(sizeof(cmd), udata->inlen));
1109 	if (err) {
1110 		ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n");
1111 		goto err_out;
1112 	}
1113 
1114 	if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_58)) {
1115 		ibdev_dbg(ibdev,
1116 			  "Incompatible ABI params, unknown fields in udata\n");
1117 		err = -EINVAL;
1118 		goto err_out;
1119 	}
1120 
1121 	set_src_addr = !!(cmd.flags & EFA_CREATE_CQ_WITH_SGID);
1122 	if ((cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc_ex)) &&
1123 	    (set_src_addr ||
1124 	     cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc))) {
1125 		ibdev_dbg(ibdev,
1126 			  "Invalid entry size [%u]\n", cmd.cq_entry_size);
1127 		err = -EINVAL;
1128 		goto err_out;
1129 	}
1130 
1131 	if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) {
1132 		ibdev_dbg(ibdev,
1133 			  "Invalid number of sub cqs[%u] expected[%u]\n",
1134 			  cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq);
1135 		err = -EINVAL;
1136 		goto err_out;
1137 	}
1138 
1139 	cq->ucontext = ucontext;
1140 	cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
1141 	cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
1142 					 DMA_FROM_DEVICE);
1143 	if (!cq->cpu_addr) {
1144 		err = -ENOMEM;
1145 		goto err_out;
1146 	}
1147 
1148 	params.uarn = cq->ucontext->uarn;
1149 	params.cq_depth = entries;
1150 	params.dma_addr = cq->dma_addr;
1151 	params.entry_size_in_bytes = cmd.cq_entry_size;
1152 	params.num_sub_cqs = cmd.num_sub_cqs;
1153 	params.set_src_addr = set_src_addr;
1154 	if (cmd.flags & EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL) {
1155 		cq->eq = efa_vec2eq(dev, attr->comp_vector);
1156 		params.eqn = cq->eq->eeq.eqn;
1157 		params.interrupt_mode_enabled = true;
1158 	}
1159 
1160 	err = efa_com_create_cq(&dev->edev, &params, &result);
1161 	if (err)
1162 		goto err_free_mapped;
1163 
1164 	resp.db_off = result.db_off;
1165 	resp.cq_idx = result.cq_idx;
1166 	cq->cq_idx = result.cq_idx;
1167 	cq->ibcq.cqe = result.actual_depth;
1168 	WARN_ON_ONCE(entries != result.actual_depth);
1169 
1170 	err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid);
1171 	if (err) {
1172 		ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n",
1173 			  cq->cq_idx);
1174 		goto err_destroy_cq;
1175 	}
1176 
1177 	if (cq->eq) {
1178 		err = xa_err(xa_store(&dev->cqs_xa, cq->cq_idx, cq, GFP_KERNEL));
1179 		if (err) {
1180 			ibdev_dbg(ibdev, "Failed to store cq[%u] in xarray\n",
1181 				  cq->cq_idx);
1182 			goto err_remove_mmap;
1183 		}
1184 	}
1185 
1186 	if (udata->outlen) {
1187 		err = ib_copy_to_udata(udata, &resp,
1188 				       min(sizeof(resp), udata->outlen));
1189 		if (err) {
1190 			ibdev_dbg(ibdev,
1191 				  "Failed to copy udata for create_cq\n");
1192 			goto err_xa_erase;
1193 		}
1194 	}
1195 
1196 	ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
1197 		  cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr);
1198 
1199 	return 0;
1200 
1201 err_xa_erase:
1202 	if (cq->eq)
1203 		xa_erase(&dev->cqs_xa, cq->cq_idx);
1204 err_remove_mmap:
1205 	efa_cq_user_mmap_entries_remove(cq);
1206 err_destroy_cq:
1207 	efa_destroy_cq_idx(dev, cq->cq_idx);
1208 err_free_mapped:
1209 	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
1210 			DMA_FROM_DEVICE);
1211 
1212 err_out:
1213 	atomic64_inc(&dev->stats.create_cq_err);
1214 	return err;
1215 }
1216 
1217 static int umem_to_page_list(struct efa_dev *dev,
1218 			     struct ib_umem *umem,
1219 			     u64 *page_list,
1220 			     u32 hp_cnt,
1221 			     u8 hp_shift)
1222 {
1223 	u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
1224 	struct ib_block_iter biter;
1225 	unsigned int hp_idx = 0;
1226 
1227 	ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
1228 		  hp_cnt, pages_in_hp);
1229 
1230 	rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift))
1231 		page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
1232 
1233 	return 0;
1234 }
1235 
1236 static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt)
1237 {
1238 	struct scatterlist *sglist;
1239 	struct page *pg;
1240 	int i;
1241 
1242 	sglist = kmalloc_array(page_cnt, sizeof(*sglist), GFP_KERNEL);
1243 	if (!sglist)
1244 		return NULL;
1245 	sg_init_table(sglist, page_cnt);
1246 	for (i = 0; i < page_cnt; i++) {
1247 		pg = vmalloc_to_page(buf);
1248 		if (!pg)
1249 			goto err;
1250 		sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);
1251 		buf += PAGE_SIZE / sizeof(*buf);
1252 	}
1253 	return sglist;
1254 
1255 err:
1256 	kfree(sglist);
1257 	return NULL;
1258 }
1259 
1260 /*
1261  * create a chunk list of physical pages dma addresses from the supplied
1262  * scatter gather list
1263  */
1264 static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl)
1265 {
1266 	struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
1267 	int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages;
1268 	struct scatterlist *pages_sgl = pbl->phys.indirect.sgl;
1269 	unsigned int chunk_list_size, chunk_idx, payload_idx;
1270 	int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt;
1271 	struct efa_com_ctrl_buff_info *ctrl_buf;
1272 	u64 *cur_chunk_buf, *prev_chunk_buf;
1273 	struct ib_block_iter biter;
1274 	dma_addr_t dma_addr;
1275 	int i;
1276 
1277 	/* allocate a chunk list that consists of 4KB chunks */
1278 	chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK);
1279 
1280 	chunk_list->size = chunk_list_size;
1281 	chunk_list->chunks = kcalloc(chunk_list_size,
1282 				     sizeof(*chunk_list->chunks),
1283 				     GFP_KERNEL);
1284 	if (!chunk_list->chunks)
1285 		return -ENOMEM;
1286 
1287 	ibdev_dbg(&dev->ibdev,
1288 		  "chunk_list_size[%u] - pages[%u]\n", chunk_list_size,
1289 		  page_cnt);
1290 
1291 	/* allocate chunk buffers: */
1292 	for (i = 0; i < chunk_list_size; i++) {
1293 		chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL);
1294 		if (!chunk_list->chunks[i].buf)
1295 			goto chunk_list_dealloc;
1296 
1297 		chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE;
1298 	}
1299 	chunk_list->chunks[chunk_list_size - 1].length =
1300 		((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) +
1301 			EFA_CHUNK_PTR_SIZE;
1302 
1303 	/* fill the dma addresses of sg list pages to chunks: */
1304 	chunk_idx = 0;
1305 	payload_idx = 0;
1306 	cur_chunk_buf = chunk_list->chunks[0].buf;
1307 	rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt,
1308 			    EFA_CHUNK_PAYLOAD_SIZE) {
1309 		cur_chunk_buf[payload_idx++] =
1310 			rdma_block_iter_dma_address(&biter);
1311 
1312 		if (payload_idx == EFA_PTRS_PER_CHUNK) {
1313 			chunk_idx++;
1314 			cur_chunk_buf = chunk_list->chunks[chunk_idx].buf;
1315 			payload_idx = 0;
1316 		}
1317 	}
1318 
1319 	/* map chunks to dma and fill chunks next ptrs */
1320 	for (i = chunk_list_size - 1; i >= 0; i--) {
1321 		dma_addr = dma_map_single(&dev->pdev->dev,
1322 					  chunk_list->chunks[i].buf,
1323 					  chunk_list->chunks[i].length,
1324 					  DMA_TO_DEVICE);
1325 		if (dma_mapping_error(&dev->pdev->dev, dma_addr)) {
1326 			ibdev_err(&dev->ibdev,
1327 				  "chunk[%u] dma_map_failed\n", i);
1328 			goto chunk_list_unmap;
1329 		}
1330 
1331 		chunk_list->chunks[i].dma_addr = dma_addr;
1332 		ibdev_dbg(&dev->ibdev,
1333 			  "chunk[%u] mapped at [%pad]\n", i, &dma_addr);
1334 
1335 		if (!i)
1336 			break;
1337 
1338 		prev_chunk_buf = chunk_list->chunks[i - 1].buf;
1339 
1340 		ctrl_buf = (struct efa_com_ctrl_buff_info *)
1341 				&prev_chunk_buf[EFA_PTRS_PER_CHUNK];
1342 		ctrl_buf->length = chunk_list->chunks[i].length;
1343 
1344 		efa_com_set_dma_addr(dma_addr,
1345 				     &ctrl_buf->address.mem_addr_high,
1346 				     &ctrl_buf->address.mem_addr_low);
1347 	}
1348 
1349 	return 0;
1350 
1351 chunk_list_unmap:
1352 	for (; i < chunk_list_size; i++) {
1353 		dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr,
1354 				 chunk_list->chunks[i].length, DMA_TO_DEVICE);
1355 	}
1356 chunk_list_dealloc:
1357 	for (i = 0; i < chunk_list_size; i++)
1358 		kfree(chunk_list->chunks[i].buf);
1359 
1360 	kfree(chunk_list->chunks);
1361 	return -ENOMEM;
1362 }
1363 
1364 static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl)
1365 {
1366 	struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
1367 	int i;
1368 
1369 	for (i = 0; i < chunk_list->size; i++) {
1370 		dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr,
1371 				 chunk_list->chunks[i].length, DMA_TO_DEVICE);
1372 		kfree(chunk_list->chunks[i].buf);
1373 	}
1374 
1375 	kfree(chunk_list->chunks);
1376 }
1377 
1378 /* initialize pbl continuous mode: map pbl buffer to a dma address. */
1379 static int pbl_continuous_initialize(struct efa_dev *dev,
1380 				     struct pbl_context *pbl)
1381 {
1382 	dma_addr_t dma_addr;
1383 
1384 	dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf,
1385 				  pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
1386 	if (dma_mapping_error(&dev->pdev->dev, dma_addr)) {
1387 		ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n");
1388 		return -ENOMEM;
1389 	}
1390 
1391 	pbl->phys.continuous.dma_addr = dma_addr;
1392 	ibdev_dbg(&dev->ibdev,
1393 		  "pbl continuous - dma_addr = %pad, size[%u]\n",
1394 		  &dma_addr, pbl->pbl_buf_size_in_bytes);
1395 
1396 	return 0;
1397 }
1398 
1399 /*
1400  * initialize pbl indirect mode:
1401  * create a chunk list out of the dma addresses of the physical pages of
1402  * pbl buffer.
1403  */
1404 static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl)
1405 {
1406 	u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_CHUNK_PAYLOAD_SIZE);
1407 	struct scatterlist *sgl;
1408 	int sg_dma_cnt, err;
1409 
1410 	BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE);
1411 	sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages);
1412 	if (!sgl)
1413 		return -ENOMEM;
1414 
1415 	sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE);
1416 	if (!sg_dma_cnt) {
1417 		err = -EINVAL;
1418 		goto err_map;
1419 	}
1420 
1421 	pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages;
1422 	pbl->phys.indirect.sgl = sgl;
1423 	pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt;
1424 	err = pbl_chunk_list_create(dev, pbl);
1425 	if (err) {
1426 		ibdev_dbg(&dev->ibdev,
1427 			  "chunk_list creation failed[%d]\n", err);
1428 		goto err_chunk;
1429 	}
1430 
1431 	ibdev_dbg(&dev->ibdev,
1432 		  "pbl indirect - size[%u], chunks[%u]\n",
1433 		  pbl->pbl_buf_size_in_bytes,
1434 		  pbl->phys.indirect.chunk_list.size);
1435 
1436 	return 0;
1437 
1438 err_chunk:
1439 	dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE);
1440 err_map:
1441 	kfree(sgl);
1442 	return err;
1443 }
1444 
1445 static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl)
1446 {
1447 	pbl_chunk_list_destroy(dev, pbl);
1448 	dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl,
1449 		     pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE);
1450 	kfree(pbl->phys.indirect.sgl);
1451 }
1452 
1453 /* create a page buffer list from a mapped user memory region */
1454 static int pbl_create(struct efa_dev *dev,
1455 		      struct pbl_context *pbl,
1456 		      struct ib_umem *umem,
1457 		      int hp_cnt,
1458 		      u8 hp_shift)
1459 {
1460 	int err;
1461 
1462 	pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE;
1463 	pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL);
1464 	if (!pbl->pbl_buf)
1465 		return -ENOMEM;
1466 
1467 	if (is_vmalloc_addr(pbl->pbl_buf)) {
1468 		pbl->physically_continuous = 0;
1469 		err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
1470 					hp_shift);
1471 		if (err)
1472 			goto err_free;
1473 
1474 		err = pbl_indirect_initialize(dev, pbl);
1475 		if (err)
1476 			goto err_free;
1477 	} else {
1478 		pbl->physically_continuous = 1;
1479 		err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
1480 					hp_shift);
1481 		if (err)
1482 			goto err_free;
1483 
1484 		err = pbl_continuous_initialize(dev, pbl);
1485 		if (err)
1486 			goto err_free;
1487 	}
1488 
1489 	ibdev_dbg(&dev->ibdev,
1490 		  "user_pbl_created: user_pages[%u], continuous[%u]\n",
1491 		  hp_cnt, pbl->physically_continuous);
1492 
1493 	return 0;
1494 
1495 err_free:
1496 	kvfree(pbl->pbl_buf);
1497 	return err;
1498 }
1499 
1500 static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl)
1501 {
1502 	if (pbl->physically_continuous)
1503 		dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr,
1504 				 pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
1505 	else
1506 		pbl_indirect_terminate(dev, pbl);
1507 
1508 	kvfree(pbl->pbl_buf);
1509 }
1510 
1511 static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr,
1512 				 struct efa_com_reg_mr_params *params)
1513 {
1514 	int err;
1515 
1516 	params->inline_pbl = 1;
1517 	err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array,
1518 				params->page_num, params->page_shift);
1519 	if (err)
1520 		return err;
1521 
1522 	ibdev_dbg(&dev->ibdev,
1523 		  "inline_pbl_array - pages[%u]\n", params->page_num);
1524 
1525 	return 0;
1526 }
1527 
1528 static int efa_create_pbl(struct efa_dev *dev,
1529 			  struct pbl_context *pbl,
1530 			  struct efa_mr *mr,
1531 			  struct efa_com_reg_mr_params *params)
1532 {
1533 	int err;
1534 
1535 	err = pbl_create(dev, pbl, mr->umem, params->page_num,
1536 			 params->page_shift);
1537 	if (err) {
1538 		ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err);
1539 		return err;
1540 	}
1541 
1542 	params->inline_pbl = 0;
1543 	params->indirect = !pbl->physically_continuous;
1544 	if (pbl->physically_continuous) {
1545 		params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes;
1546 
1547 		efa_com_set_dma_addr(pbl->phys.continuous.dma_addr,
1548 				     &params->pbl.pbl.address.mem_addr_high,
1549 				     &params->pbl.pbl.address.mem_addr_low);
1550 	} else {
1551 		params->pbl.pbl.length =
1552 			pbl->phys.indirect.chunk_list.chunks[0].length;
1553 
1554 		efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr,
1555 				     &params->pbl.pbl.address.mem_addr_high,
1556 				     &params->pbl.pbl.address.mem_addr_low);
1557 	}
1558 
1559 	return 0;
1560 }
1561 
1562 static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
1563 				   struct ib_udata *udata)
1564 {
1565 	struct efa_dev *dev = to_edev(ibpd->device);
1566 	int supp_access_flags;
1567 	struct efa_mr *mr;
1568 
1569 	if (udata && udata->inlen &&
1570 	    !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
1571 		ibdev_dbg(&dev->ibdev,
1572 			  "Incompatible ABI params, udata not cleared\n");
1573 		return ERR_PTR(-EINVAL);
1574 	}
1575 
1576 	supp_access_flags =
1577 		IB_ACCESS_LOCAL_WRITE |
1578 		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0) |
1579 		(EFA_DEV_CAP(dev, RDMA_WRITE) ? IB_ACCESS_REMOTE_WRITE : 0);
1580 
1581 	access_flags &= ~IB_ACCESS_OPTIONAL;
1582 	if (access_flags & ~supp_access_flags) {
1583 		ibdev_dbg(&dev->ibdev,
1584 			  "Unsupported access flags[%#x], supported[%#x]\n",
1585 			  access_flags, supp_access_flags);
1586 		return ERR_PTR(-EOPNOTSUPP);
1587 	}
1588 
1589 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1590 	if (!mr)
1591 		return ERR_PTR(-ENOMEM);
1592 
1593 	return mr;
1594 }
1595 
1596 static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start,
1597 			   u64 length, u64 virt_addr, int access_flags)
1598 {
1599 	struct efa_dev *dev = to_edev(ibpd->device);
1600 	struct efa_com_reg_mr_params params = {};
1601 	struct efa_com_reg_mr_result result = {};
1602 	struct pbl_context pbl;
1603 	unsigned int pg_sz;
1604 	int inline_size;
1605 	int err;
1606 
1607 	params.pd = to_epd(ibpd)->pdn;
1608 	params.iova = virt_addr;
1609 	params.mr_length_in_bytes = length;
1610 	params.permissions = access_flags;
1611 
1612 	pg_sz = ib_umem_find_best_pgsz(mr->umem,
1613 				       dev->dev_attr.page_size_cap,
1614 				       virt_addr);
1615 	if (!pg_sz) {
1616 		ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
1617 			  dev->dev_attr.page_size_cap);
1618 		return -EOPNOTSUPP;
1619 	}
1620 
1621 	params.page_shift = order_base_2(pg_sz);
1622 	params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz);
1623 
1624 	ibdev_dbg(&dev->ibdev,
1625 		  "start %#llx length %#llx params.page_shift %u params.page_num %u\n",
1626 		  start, length, params.page_shift, params.page_num);
1627 
1628 	inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array);
1629 	if (params.page_num <= inline_size) {
1630 		err = efa_create_inline_pbl(dev, mr, &params);
1631 		if (err)
1632 			return err;
1633 
1634 		err = efa_com_register_mr(&dev->edev, &params, &result);
1635 		if (err)
1636 			return err;
1637 	} else {
1638 		err = efa_create_pbl(dev, &pbl, mr, &params);
1639 		if (err)
1640 			return err;
1641 
1642 		err = efa_com_register_mr(&dev->edev, &params, &result);
1643 		pbl_destroy(dev, &pbl);
1644 
1645 		if (err)
1646 			return err;
1647 	}
1648 
1649 	mr->ibmr.lkey = result.l_key;
1650 	mr->ibmr.rkey = result.r_key;
1651 	mr->ibmr.length = length;
1652 	ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);
1653 
1654 	return 0;
1655 }
1656 
1657 struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
1658 				     u64 length, u64 virt_addr,
1659 				     int fd, int access_flags,
1660 				     struct ib_udata *udata)
1661 {
1662 	struct efa_dev *dev = to_edev(ibpd->device);
1663 	struct ib_umem_dmabuf *umem_dmabuf;
1664 	struct efa_mr *mr;
1665 	int err;
1666 
1667 	mr = efa_alloc_mr(ibpd, access_flags, udata);
1668 	if (IS_ERR(mr)) {
1669 		err = PTR_ERR(mr);
1670 		goto err_out;
1671 	}
1672 
1673 	umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length, fd,
1674 						access_flags);
1675 	if (IS_ERR(umem_dmabuf)) {
1676 		err = PTR_ERR(umem_dmabuf);
1677 		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err);
1678 		goto err_free;
1679 	}
1680 
1681 	mr->umem = &umem_dmabuf->umem;
1682 	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
1683 	if (err)
1684 		goto err_release;
1685 
1686 	return &mr->ibmr;
1687 
1688 err_release:
1689 	ib_umem_release(mr->umem);
1690 err_free:
1691 	kfree(mr);
1692 err_out:
1693 	atomic64_inc(&dev->stats.reg_mr_err);
1694 	return ERR_PTR(err);
1695 }
1696 
1697 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
1698 			 u64 virt_addr, int access_flags,
1699 			 struct ib_udata *udata)
1700 {
1701 	struct efa_dev *dev = to_edev(ibpd->device);
1702 	struct efa_mr *mr;
1703 	int err;
1704 
1705 	mr = efa_alloc_mr(ibpd, access_flags, udata);
1706 	if (IS_ERR(mr)) {
1707 		err = PTR_ERR(mr);
1708 		goto err_out;
1709 	}
1710 
1711 	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
1712 	if (IS_ERR(mr->umem)) {
1713 		err = PTR_ERR(mr->umem);
1714 		ibdev_dbg(&dev->ibdev,
1715 			  "Failed to pin and map user space memory[%d]\n", err);
1716 		goto err_free;
1717 	}
1718 
1719 	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
1720 	if (err)
1721 		goto err_release;
1722 
1723 	return &mr->ibmr;
1724 
1725 err_release:
1726 	ib_umem_release(mr->umem);
1727 err_free:
1728 	kfree(mr);
1729 err_out:
1730 	atomic64_inc(&dev->stats.reg_mr_err);
1731 	return ERR_PTR(err);
1732 }
1733 
1734 int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1735 {
1736 	struct efa_dev *dev = to_edev(ibmr->device);
1737 	struct efa_com_dereg_mr_params params;
1738 	struct efa_mr *mr = to_emr(ibmr);
1739 	int err;
1740 
1741 	ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey);
1742 
1743 	params.l_key = mr->ibmr.lkey;
1744 	err = efa_com_dereg_mr(&dev->edev, &params);
1745 	if (err)
1746 		return err;
1747 
1748 	ib_umem_release(mr->umem);
1749 	kfree(mr);
1750 
1751 	return 0;
1752 }
1753 
1754 int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
1755 			   struct ib_port_immutable *immutable)
1756 {
1757 	struct ib_port_attr attr;
1758 	int err;
1759 
1760 	err = ib_query_port(ibdev, port_num, &attr);
1761 	if (err) {
1762 		ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err);
1763 		return err;
1764 	}
1765 
1766 	immutable->pkey_tbl_len = attr.pkey_tbl_len;
1767 	immutable->gid_tbl_len = attr.gid_tbl_len;
1768 
1769 	return 0;
1770 }
1771 
1772 static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn)
1773 {
1774 	struct efa_com_dealloc_uar_params params = {
1775 		.uarn = uarn,
1776 	};
1777 
1778 	return efa_com_dealloc_uar(&dev->edev, &params);
1779 }
1780 
/*
 * Stores in _attr_str, and evaluates to, the name of _attr when the device
 * has a non-zero dev_attr._attr but _comp_mask does not contain _mask;
 * otherwise stores and evaluates to NULL.
 */
#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \
	(_attr_str = ((_dev)->dev_attr._attr && !((_comp_mask) & (_mask))) ? \
		     #_attr : NULL)
1784 
1785 static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext,
1786 				   const struct efa_ibv_alloc_ucontext_cmd *cmd)
1787 {
1788 	struct efa_dev *dev = to_edev(ibucontext->device);
1789 	char *attr_str;
1790 
1791 	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch,
1792 				EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str))
1793 		goto err;
1794 
1795 	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth,
1796 				EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR,
1797 				attr_str))
1798 		goto err;
1799 
1800 	return 0;
1801 
1802 err:
1803 	ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n",
1804 		  attr_str);
1805 	return -EOPNOTSUPP;
1806 }
1807 
1808 int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
1809 {
1810 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
1811 	struct efa_dev *dev = to_edev(ibucontext->device);
1812 	struct efa_ibv_alloc_ucontext_resp resp = {};
1813 	struct efa_ibv_alloc_ucontext_cmd cmd = {};
1814 	struct efa_com_alloc_uar_result result;
1815 	int err;
1816 
1817 	/*
1818 	 * it's fine if the driver does not know all request fields,
1819 	 * we will ack input fields in our response.
1820 	 */
1821 
1822 	err = ib_copy_from_udata(&cmd, udata,
1823 				 min(sizeof(cmd), udata->inlen));
1824 	if (err) {
1825 		ibdev_dbg(&dev->ibdev,
1826 			  "Cannot copy udata for alloc_ucontext\n");
1827 		goto err_out;
1828 	}
1829 
1830 	err = efa_user_comp_handshake(ibucontext, &cmd);
1831 	if (err)
1832 		goto err_out;
1833 
1834 	err = efa_com_alloc_uar(&dev->edev, &result);
1835 	if (err)
1836 		goto err_out;
1837 
1838 	ucontext->uarn = result.uarn;
1839 
1840 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
1841 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
1842 	resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
1843 	resp.inline_buf_size = dev->dev_attr.inline_buf_size;
1844 	resp.max_llq_size = dev->dev_attr.max_llq_size;
1845 	resp.max_tx_batch = dev->dev_attr.max_tx_batch;
1846 	resp.min_sq_wr = dev->dev_attr.min_sq_depth;
1847 
1848 	err = ib_copy_to_udata(udata, &resp,
1849 			       min(sizeof(resp), udata->outlen));
1850 	if (err)
1851 		goto err_dealloc_uar;
1852 
1853 	return 0;
1854 
1855 err_dealloc_uar:
1856 	efa_dealloc_uar(dev, result.uarn);
1857 err_out:
1858 	atomic64_inc(&dev->stats.alloc_ucontext_err);
1859 	return err;
1860 }
1861 
1862 void efa_dealloc_ucontext(struct ib_ucontext *ibucontext)
1863 {
1864 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
1865 	struct efa_dev *dev = to_edev(ibucontext->device);
1866 
1867 	efa_dealloc_uar(dev, ucontext->uarn);
1868 }
1869 
/* Free the driver mmap entry that embeds @rdma_entry. */
void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
{
	kfree(to_emmap(rdma_entry));
}
1876 
1877 static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
1878 		      struct vm_area_struct *vma)
1879 {
1880 	struct rdma_user_mmap_entry *rdma_entry;
1881 	struct efa_user_mmap_entry *entry;
1882 	unsigned long va;
1883 	int err = 0;
1884 	u64 pfn;
1885 
1886 	rdma_entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, vma);
1887 	if (!rdma_entry) {
1888 		ibdev_dbg(&dev->ibdev,
1889 			  "pgoff[%#lx] does not have valid entry\n",
1890 			  vma->vm_pgoff);
1891 		atomic64_inc(&dev->stats.mmap_err);
1892 		return -EINVAL;
1893 	}
1894 	entry = to_emmap(rdma_entry);
1895 
1896 	ibdev_dbg(&dev->ibdev,
1897 		  "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n",
1898 		  entry->address, rdma_entry->npages * PAGE_SIZE,
1899 		  entry->mmap_flag);
1900 
1901 	pfn = entry->address >> PAGE_SHIFT;
1902 	switch (entry->mmap_flag) {
1903 	case EFA_MMAP_IO_NC:
1904 		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn,
1905 					entry->rdma_entry.npages * PAGE_SIZE,
1906 					pgprot_noncached(vma->vm_page_prot),
1907 					rdma_entry);
1908 		break;
1909 	case EFA_MMAP_IO_WC:
1910 		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn,
1911 					entry->rdma_entry.npages * PAGE_SIZE,
1912 					pgprot_writecombine(vma->vm_page_prot),
1913 					rdma_entry);
1914 		break;
1915 	case EFA_MMAP_DMA_PAGE:
1916 		for (va = vma->vm_start; va < vma->vm_end;
1917 		     va += PAGE_SIZE, pfn++) {
1918 			err = vm_insert_page(vma, va, pfn_to_page(pfn));
1919 			if (err)
1920 				break;
1921 		}
1922 		break;
1923 	default:
1924 		err = -EINVAL;
1925 	}
1926 
1927 	if (err) {
1928 		ibdev_dbg(
1929 			&dev->ibdev,
1930 			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
1931 			entry->address, rdma_entry->npages * PAGE_SIZE,
1932 			entry->mmap_flag, err);
1933 		atomic64_inc(&dev->stats.mmap_err);
1934 	}
1935 
1936 	rdma_user_mmap_entry_put(rdma_entry);
1937 	return err;
1938 }
1939 
1940 int efa_mmap(struct ib_ucontext *ibucontext,
1941 	     struct vm_area_struct *vma)
1942 {
1943 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
1944 	struct efa_dev *dev = to_edev(ibucontext->device);
1945 	size_t length = vma->vm_end - vma->vm_start;
1946 
1947 	ibdev_dbg(&dev->ibdev,
1948 		  "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n",
1949 		  vma->vm_start, vma->vm_end, length, vma->vm_pgoff);
1950 
1951 	return __efa_mmap(dev, ucontext, vma);
1952 }
1953 
1954 static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah)
1955 {
1956 	struct efa_com_destroy_ah_params params = {
1957 		.ah = ah->ah,
1958 		.pdn = to_epd(ah->ibah.pd)->pdn,
1959 	};
1960 
1961 	return efa_com_destroy_ah(&dev->edev, &params);
1962 }
1963 
1964 int efa_create_ah(struct ib_ah *ibah,
1965 		  struct rdma_ah_init_attr *init_attr,
1966 		  struct ib_udata *udata)
1967 {
1968 	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
1969 	struct efa_dev *dev = to_edev(ibah->device);
1970 	struct efa_com_create_ah_params params = {};
1971 	struct efa_ibv_create_ah_resp resp = {};
1972 	struct efa_com_create_ah_result result;
1973 	struct efa_ah *ah = to_eah(ibah);
1974 	int err;
1975 
1976 	if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) {
1977 		ibdev_dbg(&dev->ibdev,
1978 			  "Create address handle is not supported in atomic context\n");
1979 		err = -EOPNOTSUPP;
1980 		goto err_out;
1981 	}
1982 
1983 	if (udata->inlen &&
1984 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
1985 		ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
1986 		err = -EINVAL;
1987 		goto err_out;
1988 	}
1989 
1990 	memcpy(params.dest_addr, ah_attr->grh.dgid.raw,
1991 	       sizeof(params.dest_addr));
1992 	params.pdn = to_epd(ibah->pd)->pdn;
1993 	err = efa_com_create_ah(&dev->edev, &params, &result);
1994 	if (err)
1995 		goto err_out;
1996 
1997 	memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id));
1998 	ah->ah = result.ah;
1999 
2000 	resp.efa_address_handle = result.ah;
2001 
2002 	if (udata->outlen) {
2003 		err = ib_copy_to_udata(udata, &resp,
2004 				       min(sizeof(resp), udata->outlen));
2005 		if (err) {
2006 			ibdev_dbg(&dev->ibdev,
2007 				  "Failed to copy udata for create_ah response\n");
2008 			goto err_destroy_ah;
2009 		}
2010 	}
2011 	ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah);
2012 
2013 	return 0;
2014 
2015 err_destroy_ah:
2016 	efa_ah_destroy(dev, ah);
2017 err_out:
2018 	atomic64_inc(&dev->stats.create_ah_err);
2019 	return err;
2020 }
2021 
2022 int efa_destroy_ah(struct ib_ah *ibah, u32 flags)
2023 {
2024 	struct efa_dev *dev = to_edev(ibah->pd->device);
2025 	struct efa_ah *ah = to_eah(ibah);
2026 
2027 	ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah);
2028 
2029 	if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) {
2030 		ibdev_dbg(&dev->ibdev,
2031 			  "Destroy address handle is not supported in atomic context\n");
2032 		return -EOPNOTSUPP;
2033 	}
2034 
2035 	efa_ah_destroy(dev, ah);
2036 	return 0;
2037 }
2038 
2039 struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev,
2040 					      u32 port_num)
2041 {
2042 	return rdma_alloc_hw_stats_struct(efa_port_stats_descs,
2043 					  ARRAY_SIZE(efa_port_stats_descs),
2044 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
2045 }
2046 
2047 struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev)
2048 {
2049 	return rdma_alloc_hw_stats_struct(efa_device_stats_descs,
2050 					  ARRAY_SIZE(efa_device_stats_descs),
2051 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
2052 }
2053 
2054 static int efa_fill_device_stats(struct efa_dev *dev,
2055 				 struct rdma_hw_stats *stats)
2056 {
2057 	struct efa_com_stats_admin *as = &dev->edev.aq.stats;
2058 	struct efa_stats *s = &dev->stats;
2059 
2060 	stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd);
2061 	stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd);
2062 	stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err);
2063 	stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion);
2064 
2065 	stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd);
2066 	stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err);
2067 	stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err);
2068 	stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err);
2069 	stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err);
2070 	stats->value[EFA_ALLOC_UCONTEXT_ERR] =
2071 		atomic64_read(&s->alloc_ucontext_err);
2072 	stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err);
2073 	stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err);
2074 
2075 	return ARRAY_SIZE(efa_device_stats_descs);
2076 }
2077 
2078 static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats,
2079 			       u32 port_num)
2080 {
2081 	struct efa_com_get_stats_params params = {};
2082 	union efa_com_get_stats_result result;
2083 	struct efa_com_rdma_read_stats *rrs;
2084 	struct efa_com_messages_stats *ms;
2085 	struct efa_com_basic_stats *bs;
2086 	int err;
2087 
2088 	params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL;
2089 	params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC;
2090 
2091 	err = efa_com_get_stats(&dev->edev, &params, &result);
2092 	if (err)
2093 		return err;
2094 
2095 	bs = &result.basic_stats;
2096 	stats->value[EFA_TX_BYTES] = bs->tx_bytes;
2097 	stats->value[EFA_TX_PKTS] = bs->tx_pkts;
2098 	stats->value[EFA_RX_BYTES] = bs->rx_bytes;
2099 	stats->value[EFA_RX_PKTS] = bs->rx_pkts;
2100 	stats->value[EFA_RX_DROPS] = bs->rx_drops;
2101 
2102 	params.type = EFA_ADMIN_GET_STATS_TYPE_MESSAGES;
2103 	err = efa_com_get_stats(&dev->edev, &params, &result);
2104 	if (err)
2105 		return err;
2106 
2107 	ms = &result.messages_stats;
2108 	stats->value[EFA_SEND_BYTES] = ms->send_bytes;
2109 	stats->value[EFA_SEND_WRS] = ms->send_wrs;
2110 	stats->value[EFA_RECV_BYTES] = ms->recv_bytes;
2111 	stats->value[EFA_RECV_WRS] = ms->recv_wrs;
2112 
2113 	params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_READ;
2114 	err = efa_com_get_stats(&dev->edev, &params, &result);
2115 	if (err)
2116 		return err;
2117 
2118 	rrs = &result.rdma_read_stats;
2119 	stats->value[EFA_RDMA_READ_WRS] = rrs->read_wrs;
2120 	stats->value[EFA_RDMA_READ_BYTES] = rrs->read_bytes;
2121 	stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err;
2122 	stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes;
2123 
2124 	return ARRAY_SIZE(efa_port_stats_descs);
2125 }
2126 
2127 int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
2128 		     u32 port_num, int index)
2129 {
2130 	if (port_num)
2131 		return efa_fill_port_stats(to_edev(ibdev), stats, port_num);
2132 	else
2133 		return efa_fill_device_stats(to_edev(ibdev), stats);
2134 }
2135 
2136 enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
2137 					 u32 port_num)
2138 {
2139 	return IB_LINK_LAYER_UNSPECIFIED;
2140 }
2141 
2142