1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause 2 3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 4 /* Copyright (c) 2008-2019, IBM Corporation */ 5 6 #include <linux/errno.h> 7 #include <linux/types.h> 8 #include <linux/uaccess.h> 9 #include <linux/vmalloc.h> 10 #include <linux/xarray.h> 11 12 #include <rdma/iw_cm.h> 13 #include <rdma/ib_verbs.h> 14 #include <rdma/ib_user_verbs.h> 15 #include <rdma/uverbs_ioctl.h> 16 17 #include "siw.h" 18 #include "siw_verbs.h" 19 #include "siw_mem.h" 20 21 static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = { 22 [IB_QPS_RESET] = SIW_QP_STATE_IDLE, 23 [IB_QPS_INIT] = SIW_QP_STATE_IDLE, 24 [IB_QPS_RTR] = SIW_QP_STATE_RTR, 25 [IB_QPS_RTS] = SIW_QP_STATE_RTS, 26 [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, 27 [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, 28 [IB_QPS_ERR] = SIW_QP_STATE_ERROR 29 }; 30 31 static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = { 32 [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", 33 [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", 34 [IB_QPS_ERR] = "ERR" 35 }; 36 37 static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size) 38 { 39 struct siw_uobj *uobj; 40 struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY); 41 u32 key; 42 43 uobj = kzalloc(sizeof(*uobj), GFP_KERNEL); 44 if (!uobj) 45 return SIW_INVAL_UOBJ_KEY; 46 47 if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey, 48 GFP_KERNEL) < 0) { 49 kfree(uobj); 50 return SIW_INVAL_UOBJ_KEY; 51 } 52 uobj->size = PAGE_ALIGN(size); 53 uobj->addr = vaddr; 54 55 return key; 56 } 57 58 static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx, 59 unsigned long off, u32 size) 60 { 61 struct siw_uobj *uobj = xa_load(&uctx->xa, off); 62 63 if (uobj && uobj->size == size) 64 return uobj; 65 66 return NULL; 67 } 68 69 int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) 70 { 71 struct siw_ucontext *uctx = to_siw_ctx(ctx); 72 struct siw_uobj *uobj; 73 unsigned long off = vma->vm_pgoff; 74 int size = vma->vm_end - vma->vm_start; 75 int rv = -EINVAL; 76 77 /* 78 * Must be page aligned 79 */ 80 if (vma->vm_start & (PAGE_SIZE - 1)) { 81 pr_warn("siw: mmap not page aligned\n"); 82 goto out; 83 } 84 uobj = siw_get_uobj(uctx, off, size); 85 if (!uobj) { 86 siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n", 87 off, size); 88 goto out; 89 } 90 rv = remap_vmalloc_range(vma, uobj->addr, 0); 91 if (rv) 92 pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size); 93 out: 94 return rv; 95 } 96 97 int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata) 98 { 99 struct siw_device *sdev = to_siw_dev(base_ctx->device); 100 struct siw_ucontext *ctx = to_siw_ctx(base_ctx); 101 struct siw_uresp_alloc_ctx uresp = {}; 102 int rv; 103 104 if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { 105 rv = -ENOMEM; 106 goto err_out; 107 } 108 xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC); 109 ctx->uobj_nextkey = 0; 110 ctx->sdev = sdev; 111 112 uresp.dev_id = sdev->vendor_part_id; 113 114 if (udata->outlen < sizeof(uresp)) { 115 rv = -EINVAL; 116 goto err_out; 117 } 118 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 119 if (rv) 120 goto err_out; 121 122 siw_dbg(base_ctx->device, "success. now %d context(s)\n", 123 atomic_read(&sdev->num_ctx)); 124 125 return 0; 126 127 err_out: 128 atomic_dec(&sdev->num_ctx); 129 siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv, 130 atomic_read(&sdev->num_ctx)); 131 132 return rv; 133 } 134 135 void siw_dealloc_ucontext(struct ib_ucontext *base_ctx) 136 { 137 struct siw_ucontext *uctx = to_siw_ctx(base_ctx); 138 void *entry; 139 unsigned long index; 140 141 /* 142 * Make sure all user mmap objects are gone. Since QP, CQ 143 * and SRQ destroy routines destroy related objects, nothing 144 * should be found here. 145 */ 146 xa_for_each(&uctx->xa, index, entry) { 147 kfree(xa_erase(&uctx->xa, index)); 148 pr_warn("siw: dropping orphaned uobj at %lu\n", index); 149 } 150 xa_destroy(&uctx->xa); 151 atomic_dec(&uctx->sdev->num_ctx); 152 } 153 154 int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, 155 struct ib_udata *udata) 156 { 157 struct siw_device *sdev = to_siw_dev(base_dev); 158 159 if (udata->inlen || udata->outlen) 160 return -EINVAL; 161 162 memset(attr, 0, sizeof(*attr)); 163 164 /* Revisit atomic caps if RFC 7306 gets supported */ 165 attr->atomic_cap = 0; 166 attr->device_cap_flags = 167 IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG; 168 attr->max_cq = sdev->attrs.max_cq; 169 attr->max_cqe = sdev->attrs.max_cqe; 170 attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; 171 attr->max_fmr = sdev->attrs.max_fmr; 172 attr->max_mr = sdev->attrs.max_mr; 173 attr->max_mw = sdev->attrs.max_mw; 174 attr->max_mr_size = ~0ull; 175 attr->max_pd = sdev->attrs.max_pd; 176 attr->max_qp = sdev->attrs.max_qp; 177 attr->max_qp_init_rd_atom = sdev->attrs.max_ird; 178 attr->max_qp_rd_atom = sdev->attrs.max_ord; 179 attr->max_qp_wr = sdev->attrs.max_qp_wr; 180 attr->max_recv_sge = sdev->attrs.max_sge; 181 attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; 182 attr->max_send_sge = sdev->attrs.max_sge; 183 attr->max_sge_rd = sdev->attrs.max_sge_rd; 184 attr->max_srq = sdev->attrs.max_srq; 185 attr->max_srq_sge = sdev->attrs.max_srq_sge; 186 attr->max_srq_wr = sdev->attrs.max_srq_wr; 187 attr->page_size_cap = PAGE_SIZE; 188 attr->vendor_id = SIW_VENDOR_ID; 189 attr->vendor_part_id = sdev->vendor_part_id; 190 191 memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6); 192 193 return 0; 194 } 195 196 int siw_query_port(struct ib_device *base_dev, u8 port, 197 struct ib_port_attr *attr) 198 { 199 struct siw_device *sdev = to_siw_dev(base_dev); 200 201 memset(attr, 0, sizeof(*attr)); 202 203 attr->active_mtu = attr->max_mtu; 204 attr->active_speed = 2; 205 attr->active_width = 2; 206 attr->gid_tbl_len = 1; 207 attr->max_msg_sz = -1; 208 attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); 209 attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3; 210 attr->pkey_tbl_len = 1; 211 attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; 212 attr->state = sdev->state; 213 /* 214 * All zero 215 * 216 * attr->lid = 0; 217 * attr->bad_pkey_cntr = 0; 218 * attr->qkey_viol_cntr = 0; 219 * attr->sm_lid = 0; 220 * attr->lmc = 0; 221 * attr->max_vl_num = 0; 222 * attr->sm_sl = 0; 223 * attr->subnet_timeout = 0; 224 * attr->init_type_repy = 0; 225 */ 226 return 0; 227 } 228 229 int siw_get_port_immutable(struct ib_device *base_dev, u8 port, 230 struct ib_port_immutable *port_immutable) 231 { 232 struct ib_port_attr attr; 233 int rv = siw_query_port(base_dev, port, &attr); 234 235 if (rv) 236 return rv; 237 238 port_immutable->pkey_tbl_len = attr.pkey_tbl_len; 239 port_immutable->gid_tbl_len = attr.gid_tbl_len; 240 port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; 241 242 return 0; 243 } 244 245 int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey) 246 { 247 /* Report the default pkey */ 248 *pkey = 0xffff; 249 return 0; 250 } 251 252 int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, 253 union ib_gid *gid) 254 { 255 struct siw_device *sdev = to_siw_dev(base_dev); 256 257 /* subnet_prefix == interface_id == 0; */ 258 memset(gid, 0, sizeof(*gid)); 259 memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6); 260 261 return 0; 262 } 263 264 int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) 265 { 266 struct siw_device *sdev = to_siw_dev(pd->device); 267 268 if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) { 269 atomic_dec(&sdev->num_pd); 270 return -ENOMEM; 271 } 272 siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd)); 273 274 return 0; 275 } 276 277 void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) 278 { 279 struct siw_device *sdev = to_siw_dev(pd->device); 280 281 siw_dbg_pd(pd, "free PD\n"); 282 atomic_dec(&sdev->num_pd); 283 } 284 285 void siw_qp_get_ref(struct ib_qp *base_qp) 286 { 287 siw_qp_get(to_siw_qp(base_qp)); 288 } 289 290 void siw_qp_put_ref(struct ib_qp *base_qp) 291 { 292 siw_qp_put(to_siw_qp(base_qp)); 293 } 294 295 /* 296 * siw_create_qp() 297 * 298 * Create QP of requested size on given device. 299 * 300 * @pd: Protection Domain 301 * @attrs: Initial QP attributes. 302 * @udata: used to provide QP ID, SQ and RQ size back to user. 303 */ 304 305 struct ib_qp *siw_create_qp(struct ib_pd *pd, 306 struct ib_qp_init_attr *attrs, 307 struct ib_udata *udata) 308 { 309 struct siw_qp *qp = NULL; 310 struct siw_base_qp *siw_base_qp = NULL; 311 struct ib_device *base_dev = pd->device; 312 struct siw_device *sdev = to_siw_dev(base_dev); 313 struct siw_ucontext *uctx = 314 rdma_udata_to_drv_context(udata, struct siw_ucontext, 315 base_ucontext); 316 struct siw_cq *scq = NULL, *rcq = NULL; 317 unsigned long flags; 318 int num_sqe, num_rqe, rv = 0; 319 320 siw_dbg(base_dev, "create new QP\n"); 321 322 if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { 323 siw_dbg(base_dev, "too many QP's\n"); 324 rv = -ENOMEM; 325 goto err_out; 326 } 327 if (attrs->qp_type != IB_QPT_RC) { 328 siw_dbg(base_dev, "only RC QP's supported\n"); 329 rv = -EINVAL; 330 goto err_out; 331 } 332 if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || 333 (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || 334 (attrs->cap.max_send_sge > SIW_MAX_SGE) || 335 (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { 336 siw_dbg(base_dev, "QP size error\n"); 337 rv = -EINVAL; 338 goto err_out; 339 } 340 if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { 341 siw_dbg(base_dev, "max inline send: %d > %d\n", 342 attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); 343 rv = -EINVAL; 344 goto err_out; 345 } 346 /* 347 * NOTE: we allow for zero element SQ and RQ WQE's SGL's 348 * but not for a QP unable to hold any WQE (SQ + RQ) 349 */ 350 if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { 351 siw_dbg(base_dev, "QP must have send or receive queue\n"); 352 rv = -EINVAL; 353 goto err_out; 354 } 355 scq = to_siw_cq(attrs->send_cq); 356 rcq = to_siw_cq(attrs->recv_cq); 357 358 if (!scq || (!rcq && !attrs->srq)) { 359 siw_dbg(base_dev, "send CQ or receive CQ invalid\n"); 360 rv = -EINVAL; 361 goto err_out; 362 } 363 siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL); 364 if (!siw_base_qp) { 365 rv = -ENOMEM; 366 goto err_out; 367 } 368 qp = kzalloc(sizeof(*qp), GFP_KERNEL); 369 if (!qp) { 370 rv = -ENOMEM; 371 goto err_out; 372 } 373 siw_base_qp->qp = qp; 374 qp->ib_qp = &siw_base_qp->base_qp; 375 376 init_rwsem(&qp->state_lock); 377 spin_lock_init(&qp->sq_lock); 378 spin_lock_init(&qp->rq_lock); 379 spin_lock_init(&qp->orq_lock); 380 381 qp->kernel_verbs = !udata; 382 qp->xa_sq_index = SIW_INVAL_UOBJ_KEY; 383 qp->xa_rq_index = SIW_INVAL_UOBJ_KEY; 384 385 rv = siw_qp_add(sdev, qp); 386 if (rv) 387 goto err_out; 388 389 /* All queue indices are derived from modulo operations 390 * on a free running 'get' (consumer) and 'put' (producer) 391 * unsigned counter. Having queue sizes at power of two 392 * avoids handling counter wrap around. 393 */ 394 num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); 395 num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); 396 397 if (qp->kernel_verbs) 398 qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); 399 else 400 qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); 401 402 if (qp->sendq == NULL) { 403 siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe); 404 rv = -ENOMEM; 405 goto err_out_xa; 406 } 407 if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { 408 if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) 409 qp->attrs.flags |= SIW_SIGNAL_ALL_WR; 410 else { 411 rv = -EINVAL; 412 goto err_out_xa; 413 } 414 } 415 qp->pd = pd; 416 qp->scq = scq; 417 qp->rcq = rcq; 418 419 if (attrs->srq) { 420 /* 421 * SRQ support. 422 * Verbs 6.3.7: ignore RQ size, if SRQ present 423 * Verbs 6.3.5: do not check PD of SRQ against PD of QP 424 */ 425 qp->srq = to_siw_srq(attrs->srq); 426 qp->attrs.rq_size = 0; 427 siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->qp_num); 428 } else if (num_rqe) { 429 if (qp->kernel_verbs) 430 qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); 431 else 432 qp->recvq = 433 vmalloc_user(num_rqe * sizeof(struct siw_rqe)); 434 435 if (qp->recvq == NULL) { 436 siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe); 437 rv = -ENOMEM; 438 goto err_out_xa; 439 } 440 qp->attrs.rq_size = num_rqe; 441 } 442 qp->attrs.sq_size = num_sqe; 443 qp->attrs.sq_max_sges = attrs->cap.max_send_sge; 444 qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; 445 446 /* Make those two tunables fixed for now. */ 447 qp->tx_ctx.gso_seg_limit = 1; 448 qp->tx_ctx.zcopy_tx = zcopy_tx; 449 450 qp->attrs.state = SIW_QP_STATE_IDLE; 451 452 if (udata) { 453 struct siw_uresp_create_qp uresp = {}; 454 455 uresp.num_sqe = num_sqe; 456 uresp.num_rqe = num_rqe; 457 uresp.qp_id = qp_id(qp); 458 459 if (qp->sendq) { 460 qp->xa_sq_index = 461 siw_create_uobj(uctx, qp->sendq, 462 num_sqe * sizeof(struct siw_sqe)); 463 } 464 if (qp->recvq) { 465 qp->xa_rq_index = 466 siw_create_uobj(uctx, qp->recvq, 467 num_rqe * sizeof(struct siw_rqe)); 468 } 469 if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY || 470 qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) { 471 rv = -ENOMEM; 472 goto err_out_xa; 473 } 474 uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT; 475 uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT; 476 477 if (udata->outlen < sizeof(uresp)) { 478 rv = -EINVAL; 479 goto err_out_xa; 480 } 481 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 482 if (rv) 483 goto err_out_xa; 484 } 485 qp->tx_cpu = siw_get_tx_cpu(sdev); 486 if (qp->tx_cpu < 0) { 487 rv = -EINVAL; 488 goto err_out_xa; 489 } 490 INIT_LIST_HEAD(&qp->devq); 491 spin_lock_irqsave(&sdev->lock, flags); 492 list_add_tail(&qp->devq, &sdev->qp_list); 493 spin_unlock_irqrestore(&sdev->lock, flags); 494 495 return qp->ib_qp; 496 497 err_out_xa: 498 xa_erase(&sdev->qp_xa, qp_id(qp)); 499 err_out: 500 kfree(siw_base_qp); 501 502 if (qp) { 503 if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) 504 kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); 505 if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) 506 kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); 507 508 vfree(qp->sendq); 509 vfree(qp->recvq); 510 kfree(qp); 511 } 512 atomic_dec(&sdev->num_qp); 513 514 return ERR_PTR(rv); 515 } 516 517 /* 518 * Minimum siw_query_qp() verb interface. 519 * 520 * @qp_attr_mask is not used but all available information is provided 521 */ 522 int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, 523 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) 524 { 525 struct siw_qp *qp; 526 struct siw_device *sdev; 527 528 if (base_qp && qp_attr && qp_init_attr) { 529 qp = to_siw_qp(base_qp); 530 sdev = to_siw_dev(base_qp->device); 531 } else { 532 return -EINVAL; 533 } 534 qp_attr->cap.max_inline_data = SIW_MAX_INLINE; 535 qp_attr->cap.max_send_wr = qp->attrs.sq_size; 536 qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; 537 qp_attr->cap.max_recv_wr = qp->attrs.rq_size; 538 qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; 539 qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); 540 qp_attr->max_rd_atomic = qp->attrs.irq_size; 541 qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; 542 543 qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | 544 IB_ACCESS_REMOTE_WRITE | 545 IB_ACCESS_REMOTE_READ; 546 547 qp_init_attr->qp_type = base_qp->qp_type; 548 qp_init_attr->send_cq = base_qp->send_cq; 549 qp_init_attr->recv_cq = base_qp->recv_cq; 550 qp_init_attr->srq = base_qp->srq; 551 552 qp_init_attr->cap = qp_attr->cap; 553 554 return 0; 555 } 556 557 int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, 558 int attr_mask, struct ib_udata *udata) 559 { 560 struct siw_qp_attrs new_attrs; 561 enum siw_qp_attr_mask siw_attr_mask = 0; 562 struct siw_qp *qp = to_siw_qp(base_qp); 563 int rv = 0; 564 565 if (!attr_mask) 566 return 0; 567 568 memset(&new_attrs, 0, sizeof(new_attrs)); 569 570 if (attr_mask & IB_QP_ACCESS_FLAGS) { 571 siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; 572 573 if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) 574 new_attrs.flags |= SIW_RDMA_READ_ENABLED; 575 if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) 576 new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; 577 if (attr->qp_access_flags & IB_ACCESS_MW_BIND) 578 new_attrs.flags |= SIW_RDMA_BIND_ENABLED; 579 } 580 if (attr_mask & IB_QP_STATE) { 581 siw_dbg_qp(qp, "desired IB QP state: %s\n", 582 ib_qp_state_to_string[attr->qp_state]); 583 584 new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; 585 586 if (new_attrs.state > SIW_QP_STATE_RTS) 587 qp->tx_ctx.tx_suspend = 1; 588 589 siw_attr_mask |= SIW_QP_ATTR_STATE; 590 } 591 if (!siw_attr_mask) 592 goto out; 593 594 down_write(&qp->state_lock); 595 596 rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); 597 598 up_write(&qp->state_lock); 599 out: 600 return rv; 601 } 602 603 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) 604 { 605 struct siw_qp *qp = to_siw_qp(base_qp); 606 struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp); 607 struct siw_ucontext *uctx = 608 rdma_udata_to_drv_context(udata, struct siw_ucontext, 609 base_ucontext); 610 struct siw_qp_attrs qp_attrs; 611 612 siw_dbg_qp(qp, "state %d\n", qp->attrs.state); 613 614 /* 615 * Mark QP as in process of destruction to prevent from 616 * any async callbacks to RDMA core 617 */ 618 qp->attrs.flags |= SIW_QP_IN_DESTROY; 619 qp->rx_stream.rx_suspend = 1; 620 621 if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) 622 kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); 623 if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) 624 kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); 625 626 down_write(&qp->state_lock); 627 628 qp_attrs.state = SIW_QP_STATE_ERROR; 629 siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); 630 631 if (qp->cep) { 632 siw_cep_put(qp->cep); 633 qp->cep = NULL; 634 } 635 up_write(&qp->state_lock); 636 637 kfree(qp->tx_ctx.mpa_crc_hd); 638 kfree(qp->rx_stream.mpa_crc_hd); 639 640 qp->scq = qp->rcq = NULL; 641 642 siw_qp_put(qp); 643 kfree(siw_base_qp); 644 645 return 0; 646 } 647 648 /* 649 * siw_copy_inline_sgl() 650 * 651 * Prepare sgl of inlined data for sending. For userland callers 652 * function checks if given buffer addresses and len's are within 653 * process context bounds. 654 * Data from all provided sge's are copied together into the wqe, 655 * referenced by a single sge. 656 */ 657 static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, 658 struct siw_sqe *sqe) 659 { 660 struct ib_sge *core_sge = core_wr->sg_list; 661 void *kbuf = &sqe->sge[1]; 662 int num_sge = core_wr->num_sge, bytes = 0; 663 664 sqe->sge[0].laddr = (uintptr_t)kbuf; 665 sqe->sge[0].lkey = 0; 666 667 while (num_sge--) { 668 if (!core_sge->length) { 669 core_sge++; 670 continue; 671 } 672 bytes += core_sge->length; 673 if (bytes > SIW_MAX_INLINE) { 674 bytes = -EINVAL; 675 break; 676 } 677 memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, 678 core_sge->length); 679 680 kbuf += core_sge->length; 681 core_sge++; 682 } 683 sqe->sge[0].length = bytes > 0 ? bytes : 0; 684 sqe->num_sge = bytes > 0 ? 1 : 0; 685 686 return bytes; 687 } 688 689 /* 690 * siw_post_send() 691 * 692 * Post a list of S-WR's to a SQ. 693 * 694 * @base_qp: Base QP contained in siw QP 695 * @wr: Null terminated list of user WR's 696 * @bad_wr: Points to failing WR in case of synchronous failure. 697 */ 698 int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, 699 const struct ib_send_wr **bad_wr) 700 { 701 struct siw_qp *qp = to_siw_qp(base_qp); 702 struct siw_wqe *wqe = tx_wqe(qp); 703 704 unsigned long flags; 705 int rv = 0; 706 707 /* 708 * Try to acquire QP state lock. Must be non-blocking 709 * to accommodate kernel clients needs. 710 */ 711 if (!down_read_trylock(&qp->state_lock)) { 712 *bad_wr = wr; 713 siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); 714 return -ENOTCONN; 715 } 716 if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { 717 up_read(&qp->state_lock); 718 *bad_wr = wr; 719 siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); 720 return -ENOTCONN; 721 } 722 if (wr && !qp->kernel_verbs) { 723 siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); 724 up_read(&qp->state_lock); 725 *bad_wr = wr; 726 return -EINVAL; 727 } 728 spin_lock_irqsave(&qp->sq_lock, flags); 729 730 while (wr) { 731 u32 idx = qp->sq_put % qp->attrs.sq_size; 732 struct siw_sqe *sqe = &qp->sendq[idx]; 733 734 if (sqe->flags) { 735 siw_dbg_qp(qp, "sq full\n"); 736 rv = -ENOMEM; 737 break; 738 } 739 if (wr->num_sge > qp->attrs.sq_max_sges) { 740 siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); 741 rv = -EINVAL; 742 break; 743 } 744 sqe->id = wr->wr_id; 745 746 if ((wr->send_flags & IB_SEND_SIGNALED) || 747 (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) 748 sqe->flags |= SIW_WQE_SIGNALLED; 749 750 if (wr->send_flags & IB_SEND_FENCE) 751 sqe->flags |= SIW_WQE_READ_FENCE; 752 753 switch (wr->opcode) { 754 case IB_WR_SEND: 755 case IB_WR_SEND_WITH_INV: 756 if (wr->send_flags & IB_SEND_SOLICITED) 757 sqe->flags |= SIW_WQE_SOLICITED; 758 759 if (!(wr->send_flags & IB_SEND_INLINE)) { 760 siw_copy_sgl(wr->sg_list, sqe->sge, 761 wr->num_sge); 762 sqe->num_sge = wr->num_sge; 763 } else { 764 rv = siw_copy_inline_sgl(wr, sqe); 765 if (rv <= 0) { 766 rv = -EINVAL; 767 break; 768 } 769 sqe->flags |= SIW_WQE_INLINE; 770 sqe->num_sge = 1; 771 } 772 if (wr->opcode == IB_WR_SEND) 773 sqe->opcode = SIW_OP_SEND; 774 else { 775 sqe->opcode = SIW_OP_SEND_REMOTE_INV; 776 sqe->rkey = wr->ex.invalidate_rkey; 777 } 778 break; 779 780 case IB_WR_RDMA_READ_WITH_INV: 781 case IB_WR_RDMA_READ: 782 /* 783 * iWarp restricts RREAD sink to SGL containing 784 * 1 SGE only. we could relax to SGL with multiple 785 * elements referring the SAME ltag or even sending 786 * a private per-rreq tag referring to a checked 787 * local sgl with MULTIPLE ltag's. 788 */ 789 if (unlikely(wr->num_sge != 1)) { 790 rv = -EINVAL; 791 break; 792 } 793 siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); 794 /* 795 * NOTE: zero length RREAD is allowed! 796 */ 797 sqe->raddr = rdma_wr(wr)->remote_addr; 798 sqe->rkey = rdma_wr(wr)->rkey; 799 sqe->num_sge = 1; 800 801 if (wr->opcode == IB_WR_RDMA_READ) 802 sqe->opcode = SIW_OP_READ; 803 else 804 sqe->opcode = SIW_OP_READ_LOCAL_INV; 805 break; 806 807 case IB_WR_RDMA_WRITE: 808 if (!(wr->send_flags & IB_SEND_INLINE)) { 809 siw_copy_sgl(wr->sg_list, &sqe->sge[0], 810 wr->num_sge); 811 sqe->num_sge = wr->num_sge; 812 } else { 813 rv = siw_copy_inline_sgl(wr, sqe); 814 if (unlikely(rv < 0)) { 815 rv = -EINVAL; 816 break; 817 } 818 sqe->flags |= SIW_WQE_INLINE; 819 sqe->num_sge = 1; 820 } 821 sqe->raddr = rdma_wr(wr)->remote_addr; 822 sqe->rkey = rdma_wr(wr)->rkey; 823 sqe->opcode = SIW_OP_WRITE; 824 break; 825 826 case IB_WR_REG_MR: 827 sqe->base_mr = (uintptr_t)reg_wr(wr)->mr; 828 sqe->rkey = reg_wr(wr)->key; 829 sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; 830 sqe->opcode = SIW_OP_REG_MR; 831 break; 832 833 case IB_WR_LOCAL_INV: 834 sqe->rkey = wr->ex.invalidate_rkey; 835 sqe->opcode = SIW_OP_INVAL_STAG; 836 break; 837 838 default: 839 siw_dbg_qp(qp, "ib wr type %d unsupported\n", 840 wr->opcode); 841 rv = -EINVAL; 842 break; 843 } 844 siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n", 845 sqe->opcode, sqe->flags, 846 (void *)(uintptr_t)sqe->id); 847 848 if (unlikely(rv < 0)) 849 break; 850 851 /* make SQE only valid after completely written */ 852 smp_wmb(); 853 sqe->flags |= SIW_WQE_VALID; 854 855 qp->sq_put++; 856 wr = wr->next; 857 } 858 859 /* 860 * Send directly if SQ processing is not in progress. 861 * Eventual immediate errors (rv < 0) do not affect the involved 862 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ 863 * processing, if new work is already pending. But rv must be passed 864 * to caller. 865 */ 866 if (wqe->wr_status != SIW_WR_IDLE) { 867 spin_unlock_irqrestore(&qp->sq_lock, flags); 868 goto skip_direct_sending; 869 } 870 rv = siw_activate_tx(qp); 871 spin_unlock_irqrestore(&qp->sq_lock, flags); 872 873 if (rv <= 0) 874 goto skip_direct_sending; 875 876 if (qp->kernel_verbs) { 877 rv = siw_sq_start(qp); 878 } else { 879 qp->tx_ctx.in_syscall = 1; 880 881 if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) 882 siw_qp_cm_drop(qp, 0); 883 884 qp->tx_ctx.in_syscall = 0; 885 } 886 skip_direct_sending: 887 888 up_read(&qp->state_lock); 889 890 if (rv >= 0) 891 return 0; 892 /* 893 * Immediate error 894 */ 895 siw_dbg_qp(qp, "error %d\n", rv); 896 897 *bad_wr = wr; 898 return rv; 899 } 900 901 /* 902 * siw_post_receive() 903 * 904 * Post a list of R-WR's to a RQ. 905 * 906 * @base_qp: Base QP contained in siw QP 907 * @wr: Null terminated list of user WR's 908 * @bad_wr: Points to failing WR in case of synchronous failure. 909 */ 910 int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, 911 const struct ib_recv_wr **bad_wr) 912 { 913 struct siw_qp *qp = to_siw_qp(base_qp); 914 unsigned long flags; 915 int rv = 0; 916 917 if (qp->srq) { 918 *bad_wr = wr; 919 return -EOPNOTSUPP; /* what else from errno.h? */ 920 } 921 /* 922 * Try to acquire QP state lock. Must be non-blocking 923 * to accommodate kernel clients needs. 924 */ 925 if (!down_read_trylock(&qp->state_lock)) { 926 *bad_wr = wr; 927 return -ENOTCONN; 928 } 929 if (!qp->kernel_verbs) { 930 siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); 931 up_read(&qp->state_lock); 932 *bad_wr = wr; 933 return -EINVAL; 934 } 935 if (qp->attrs.state > SIW_QP_STATE_RTS) { 936 up_read(&qp->state_lock); 937 *bad_wr = wr; 938 return -EINVAL; 939 } 940 /* 941 * Serialize potentially multiple producers. 942 * Not needed for single threaded consumer side. 943 */ 944 spin_lock_irqsave(&qp->rq_lock, flags); 945 946 while (wr) { 947 u32 idx = qp->rq_put % qp->attrs.rq_size; 948 struct siw_rqe *rqe = &qp->recvq[idx]; 949 950 if (rqe->flags) { 951 siw_dbg_qp(qp, "RQ full\n"); 952 rv = -ENOMEM; 953 break; 954 } 955 if (wr->num_sge > qp->attrs.rq_max_sges) { 956 siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); 957 rv = -EINVAL; 958 break; 959 } 960 rqe->id = wr->wr_id; 961 rqe->num_sge = wr->num_sge; 962 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); 963 964 /* make sure RQE is completely written before valid */ 965 smp_wmb(); 966 967 rqe->flags = SIW_WQE_VALID; 968 969 qp->rq_put++; 970 wr = wr->next; 971 } 972 spin_unlock_irqrestore(&qp->rq_lock, flags); 973 974 up_read(&qp->state_lock); 975 976 if (rv < 0) { 977 siw_dbg_qp(qp, "error %d\n", rv); 978 *bad_wr = wr; 979 } 980 return rv > 0 ? 0 : rv; 981 } 982 983 void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) 984 { 985 struct siw_cq *cq = to_siw_cq(base_cq); 986 struct siw_device *sdev = to_siw_dev(base_cq->device); 987 struct siw_ucontext *ctx = 988 rdma_udata_to_drv_context(udata, struct siw_ucontext, 989 base_ucontext); 990 991 siw_dbg_cq(cq, "free CQ resources\n"); 992 993 siw_cq_flush(cq); 994 995 if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) 996 kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); 997 998 atomic_dec(&sdev->num_cq); 999 1000 vfree(cq->queue); 1001 } 1002 1003 /* 1004 * siw_create_cq() 1005 * 1006 * Populate CQ of requested size 1007 * 1008 * @base_cq: CQ as allocated by RDMA midlayer 1009 * @attr: Initial CQ attributes 1010 * @udata: relates to user context 1011 */ 1012 1013 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, 1014 struct ib_udata *udata) 1015 { 1016 struct siw_device *sdev = to_siw_dev(base_cq->device); 1017 struct siw_cq *cq = to_siw_cq(base_cq); 1018 int rv, size = attr->cqe; 1019 1020 if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { 1021 siw_dbg(base_cq->device, "too many CQ's\n"); 1022 rv = -ENOMEM; 1023 goto err_out; 1024 } 1025 if (size < 1 || size > sdev->attrs.max_cqe) { 1026 siw_dbg(base_cq->device, "CQ size error: %d\n", size); 1027 rv = -EINVAL; 1028 goto err_out; 1029 } 1030 size = roundup_pow_of_two(size); 1031 cq->base_cq.cqe = size; 1032 cq->num_cqe = size; 1033 cq->xa_cq_index = SIW_INVAL_UOBJ_KEY; 1034 1035 if (!udata) { 1036 cq->kernel_verbs = 1; 1037 cq->queue = vzalloc(size * sizeof(struct siw_cqe) + 1038 sizeof(struct siw_cq_ctrl)); 1039 } else { 1040 cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + 1041 sizeof(struct siw_cq_ctrl)); 1042 } 1043 if (cq->queue == NULL) { 1044 rv = -ENOMEM; 1045 goto err_out; 1046 } 1047 get_random_bytes(&cq->id, 4); 1048 siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); 1049 1050 spin_lock_init(&cq->lock); 1051 1052 cq->notify = (struct siw_cq_ctrl *)&cq->queue[size]; 1053 1054 if (udata) { 1055 struct siw_uresp_create_cq uresp = {}; 1056 struct siw_ucontext *ctx = 1057 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1058 base_ucontext); 1059 1060 cq->xa_cq_index = 1061 siw_create_uobj(ctx, cq->queue, 1062 size * sizeof(struct siw_cqe) + 1063 sizeof(struct siw_cq_ctrl)); 1064 if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) { 1065 rv = -ENOMEM; 1066 goto err_out; 1067 } 1068 uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT; 1069 uresp.cq_id = cq->id; 1070 uresp.num_cqe = size; 1071 1072 if (udata->outlen < sizeof(uresp)) { 1073 rv = -EINVAL; 1074 goto err_out; 1075 } 1076 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1077 if (rv) 1078 goto err_out; 1079 } 1080 return 0; 1081 1082 err_out: 1083 siw_dbg(base_cq->device, "CQ creation failed: %d", rv); 1084 1085 if (cq && cq->queue) { 1086 struct siw_ucontext *ctx = 1087 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1088 base_ucontext); 1089 if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) 1090 kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); 1091 vfree(cq->queue); 1092 } 1093 atomic_dec(&sdev->num_cq); 1094 1095 return rv; 1096 } 1097 1098 /* 1099 * siw_poll_cq() 1100 * 1101 * Reap CQ entries if available and copy work completion status into 1102 * array of WC's provided by caller. Returns number of reaped CQE's. 1103 * 1104 * @base_cq: Base CQ contained in siw CQ. 1105 * @num_cqe: Maximum number of CQE's to reap. 1106 * @wc: Array of work completions to be filled by siw. 1107 */ 1108 int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) 1109 { 1110 struct siw_cq *cq = to_siw_cq(base_cq); 1111 int i; 1112 1113 for (i = 0; i < num_cqe; i++) { 1114 if (!siw_reap_cqe(cq, wc)) 1115 break; 1116 wc++; 1117 } 1118 return i; 1119 } 1120 1121 /* 1122 * siw_req_notify_cq() 1123 * 1124 * Request notification for new CQE's added to that CQ. 1125 * Defined flags: 1126 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification 1127 * event if a WQE with notification flag set enters the CQ 1128 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification 1129 * event if a WQE enters the CQ. 1130 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the 1131 * number of not reaped CQE's regardless of its notification 1132 * type and current or new CQ notification settings. 1133 * 1134 * @base_cq: Base CQ contained in siw CQ. 1135 * @flags: Requested notification flags. 1136 */ 1137 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) 1138 { 1139 struct siw_cq *cq = to_siw_cq(base_cq); 1140 1141 siw_dbg_cq(cq, "flags: 0x%02x\n", flags); 1142 1143 if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) 1144 /* 1145 * Enable CQ event for next solicited completion. 1146 * and make it visible to all associated producers. 1147 */ 1148 smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED); 1149 else 1150 /* 1151 * Enable CQ event for any signalled completion. 1152 * and make it visible to all associated producers. 1153 */ 1154 smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL); 1155 1156 if (flags & IB_CQ_REPORT_MISSED_EVENTS) 1157 return cq->cq_put - cq->cq_get; 1158 1159 return 0; 1160 } 1161 1162 /* 1163 * siw_dereg_mr() 1164 * 1165 * Release Memory Region. 1166 * 1167 * @base_mr: Base MR contained in siw MR. 1168 * @udata: points to user context, unused. 1169 */ 1170 int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) 1171 { 1172 struct siw_mr *mr = to_siw_mr(base_mr); 1173 struct siw_device *sdev = to_siw_dev(base_mr->device); 1174 1175 siw_dbg_mem(mr->mem, "deregister MR\n"); 1176 1177 atomic_dec(&sdev->num_mr); 1178 1179 siw_mr_drop_mem(mr); 1180 kfree_rcu(mr, rcu); 1181 1182 return 0; 1183 } 1184 1185 /* 1186 * siw_reg_user_mr() 1187 * 1188 * Register Memory Region. 1189 * 1190 * @pd: Protection Domain 1191 * @start: starting address of MR (virtual address) 1192 * @len: len of MR 1193 * @rnic_va: not used by siw 1194 * @rights: MR access rights 1195 * @udata: user buffer to communicate STag and Key. 1196 */ 1197 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, 1198 u64 rnic_va, int rights, struct ib_udata *udata) 1199 { 1200 struct siw_mr *mr = NULL; 1201 struct siw_umem *umem = NULL; 1202 struct siw_ureq_reg_mr ureq; 1203 struct siw_device *sdev = to_siw_dev(pd->device); 1204 1205 unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); 1206 int rv; 1207 1208 siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n", 1209 (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, 1210 (unsigned long long)len); 1211 1212 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1213 siw_dbg_pd(pd, "too many mr's\n"); 1214 rv = -ENOMEM; 1215 goto err_out; 1216 } 1217 if (!len) { 1218 rv = -EINVAL; 1219 goto err_out; 1220 } 1221 if (mem_limit != RLIM_INFINITY) { 1222 unsigned long num_pages = 1223 (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; 1224 mem_limit >>= PAGE_SHIFT; 1225 1226 if (num_pages > mem_limit - current->mm->locked_vm) { 1227 siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n", 1228 num_pages, mem_limit, 1229 current->mm->locked_vm); 1230 rv = -ENOMEM; 1231 goto err_out; 1232 } 1233 } 1234 umem = siw_umem_get(start, len, ib_access_writable(rights)); 1235 if (IS_ERR(umem)) { 1236 rv = PTR_ERR(umem); 1237 siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); 1238 umem = NULL; 1239 goto err_out; 1240 } 1241 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1242 if (!mr) { 1243 rv = -ENOMEM; 1244 goto err_out; 1245 } 1246 rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); 1247 if (rv) 1248 goto err_out; 1249 1250 if (udata) { 1251 struct siw_uresp_reg_mr uresp = {}; 1252 struct siw_mem *mem = mr->mem; 1253 1254 if (udata->inlen < sizeof(ureq)) { 1255 rv = -EINVAL; 1256 goto err_out; 1257 } 1258 rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); 1259 if (rv) 1260 goto err_out; 1261 1262 mr->base_mr.lkey |= ureq.stag_key; 1263 mr->base_mr.rkey |= ureq.stag_key; 1264 mem->stag |= ureq.stag_key; 1265 uresp.stag = mem->stag; 1266 1267 if (udata->outlen < sizeof(uresp)) { 1268 rv = -EINVAL; 1269 goto err_out; 1270 } 1271 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1272 if (rv) 1273 goto err_out; 1274 } 1275 mr->mem->stag_valid = 1; 1276 1277 return &mr->base_mr; 1278 1279 err_out: 1280 atomic_dec(&sdev->num_mr); 1281 if (mr) { 1282 if (mr->mem) 1283 siw_mr_drop_mem(mr); 1284 kfree_rcu(mr, rcu); 1285 } else { 1286 if (umem) 1287 siw_umem_release(umem, false); 1288 } 1289 return ERR_PTR(rv); 1290 } 1291 1292 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 1293 u32 max_sge, struct ib_udata *udata) 1294 { 1295 struct siw_device *sdev = to_siw_dev(pd->device); 1296 struct siw_mr *mr = NULL; 1297 struct siw_pbl *pbl = NULL; 1298 int rv; 1299 1300 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1301 siw_dbg_pd(pd, "too many mr's\n"); 1302 rv = -ENOMEM; 1303 goto err_out; 1304 } 1305 if (mr_type != IB_MR_TYPE_MEM_REG) { 1306 siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); 1307 rv = -EOPNOTSUPP; 1308 goto err_out; 1309 } 1310 if (max_sge > SIW_MAX_SGE_PBL) { 1311 siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); 1312 rv = -ENOMEM; 1313 goto err_out; 1314 } 1315 pbl = siw_pbl_alloc(max_sge); 1316 if (IS_ERR(pbl)) { 1317 rv = PTR_ERR(pbl); 1318 siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); 1319 pbl = NULL; 1320 goto err_out; 1321 } 1322 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1323 if (!mr) { 1324 rv = -ENOMEM; 1325 goto err_out; 1326 } 1327 rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); 1328 if (rv) 1329 goto err_out; 1330 1331 mr->mem->is_pbl = 1; 1332 1333 siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); 1334 1335 return &mr->base_mr; 1336 1337 err_out: 1338 atomic_dec(&sdev->num_mr); 1339 1340 if (!mr) { 1341 kfree(pbl); 1342 } else { 1343 if (mr->mem) 1344 siw_mr_drop_mem(mr); 1345 kfree_rcu(mr, rcu); 1346 } 1347 siw_dbg_pd(pd, "failed: %d\n", rv); 1348 1349 return ERR_PTR(rv); 1350 } 1351 1352 /* Just used to count number of pages being mapped */ 1353 static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) 1354 { 1355 return 0; 1356 } 1357 1358 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, 1359 unsigned int *sg_off) 1360 { 1361 struct scatterlist *slp; 1362 struct siw_mr *mr = to_siw_mr(base_mr); 1363 struct siw_mem *mem = mr->mem; 1364 struct siw_pbl *pbl = mem->pbl; 1365 struct siw_pble *pble; 1366 unsigned long pbl_size; 1367 int i, rv; 1368 1369 if (!pbl) { 1370 siw_dbg_mem(mem, "no PBL allocated\n"); 1371 return -EINVAL; 1372 } 1373 pble = pbl->pbe; 1374 1375 if (pbl->max_buf < num_sle) { 1376 siw_dbg_mem(mem, "too many SGE's: %d > %d\n", 1377 mem->pbl->max_buf, num_sle); 1378 return -ENOMEM; 1379 } 1380 for_each_sg(sl, slp, num_sle, i) { 1381 if (sg_dma_len(slp) == 0) { 1382 siw_dbg_mem(mem, "empty SGE\n"); 1383 return -EINVAL; 1384 } 1385 if (i == 0) { 1386 pble->addr = sg_dma_address(slp); 1387 pble->size = sg_dma_len(slp); 1388 pble->pbl_off = 0; 1389 pbl_size = pble->size; 1390 pbl->num_buf = 1; 1391 } else { 1392 /* Merge PBL entries if adjacent */ 1393 if (pble->addr + pble->size == sg_dma_address(slp)) { 1394 pble->size += sg_dma_len(slp); 1395 } else { 1396 pble++; 1397 pbl->num_buf++; 1398 pble->addr = sg_dma_address(slp); 1399 pble->size = sg_dma_len(slp); 1400 pble->pbl_off = pbl_size; 1401 } 1402 pbl_size += sg_dma_len(slp); 1403 } 1404 siw_dbg_mem(mem, 1405 "sge[%d], size %u, addr 0x%p, total %lu\n", 1406 i, pble->size, (void *)(uintptr_t)pble->addr, 1407 pbl_size); 1408 } 1409 rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); 1410 if (rv > 0) { 1411 mem->len = base_mr->length; 1412 mem->va = base_mr->iova; 1413 siw_dbg_mem(mem, 1414 "%llu bytes, start 0x%pK, %u SLE to %u entries\n", 1415 mem->len, (void *)(uintptr_t)mem->va, num_sle, 1416 pbl->num_buf); 1417 } 1418 return rv; 1419 } 1420 1421 /* 1422 * siw_get_dma_mr() 1423 * 1424 * Create a (empty) DMA memory region, where no umem is attached. 1425 */ 1426 struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) 1427 { 1428 struct siw_device *sdev = to_siw_dev(pd->device); 1429 struct siw_mr *mr = NULL; 1430 int rv; 1431 1432 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1433 siw_dbg_pd(pd, "too many mr's\n"); 1434 rv = -ENOMEM; 1435 goto err_out; 1436 } 1437 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1438 if (!mr) { 1439 rv = -ENOMEM; 1440 goto err_out; 1441 } 1442 rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); 1443 if (rv) 1444 goto err_out; 1445 1446 mr->mem->stag_valid = 1; 1447 1448 siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); 1449 1450 return &mr->base_mr; 1451 1452 err_out: 1453 if (rv) 1454 kfree(mr); 1455 1456 atomic_dec(&sdev->num_mr); 1457 1458 return ERR_PTR(rv); 1459 } 1460 1461 /* 1462 * siw_create_srq() 1463 * 1464 * Create Shared Receive Queue of attributes @init_attrs 1465 * within protection domain given by @pd. 1466 * 1467 * @base_srq: Base SRQ contained in siw SRQ. 1468 * @init_attrs: SRQ init attributes. 1469 * @udata: points to user context 1470 */ 1471 int siw_create_srq(struct ib_srq *base_srq, 1472 struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) 1473 { 1474 struct siw_srq *srq = to_siw_srq(base_srq); 1475 struct ib_srq_attr *attrs = &init_attrs->attr; 1476 struct siw_device *sdev = to_siw_dev(base_srq->device); 1477 struct siw_ucontext *ctx = 1478 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1479 base_ucontext); 1480 int rv; 1481 1482 if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { 1483 siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); 1484 rv = -ENOMEM; 1485 goto err_out; 1486 } 1487 if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || 1488 attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { 1489 rv = -EINVAL; 1490 goto err_out; 1491 } 1492 srq->max_sge = attrs->max_sge; 1493 srq->num_rqe = roundup_pow_of_two(attrs->max_wr); 1494 srq->xa_srq_index = SIW_INVAL_UOBJ_KEY; 1495 srq->limit = attrs->srq_limit; 1496 if (srq->limit) 1497 srq->armed = 1; 1498 1499 srq->kernel_verbs = !udata; 1500 1501 if (udata) 1502 srq->recvq = 1503 vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); 1504 else 1505 srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe)); 1506 1507 if (srq->recvq == NULL) { 1508 rv = -ENOMEM; 1509 goto err_out; 1510 } 1511 if (udata) { 1512 struct siw_uresp_create_srq uresp = {}; 1513 1514 srq->xa_srq_index = siw_create_uobj( 1515 ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe)); 1516 1517 if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) { 1518 rv = -ENOMEM; 1519 goto err_out; 1520 } 1521 uresp.srq_key = srq->xa_srq_index; 1522 uresp.num_rqe = srq->num_rqe; 1523 1524 if (udata->outlen < sizeof(uresp)) { 1525 rv = -EINVAL; 1526 goto err_out; 1527 } 1528 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1529 if (rv) 1530 goto err_out; 1531 } 1532 spin_lock_init(&srq->lock); 1533 1534 siw_dbg_pd(base_srq->pd, "[SRQ]: success\n"); 1535 1536 return 0; 1537 1538 err_out: 1539 if (srq->recvq) { 1540 if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) 1541 kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); 1542 vfree(srq->recvq); 1543 } 1544 atomic_dec(&sdev->num_srq); 1545 1546 return rv; 1547 } 1548 1549 /* 1550 * siw_modify_srq() 1551 * 1552 * Modify SRQ. The caller may resize SRQ and/or set/reset notification 1553 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. 1554 * 1555 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE 1556 * parameter. siw_modify_srq() does not check the attrs->max_sge param. 1557 */ 1558 int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, 1559 enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) 1560 { 1561 struct siw_srq *srq = to_siw_srq(base_srq); 1562 unsigned long flags; 1563 int rv = 0; 1564 1565 spin_lock_irqsave(&srq->lock, flags); 1566 1567 if (attr_mask & IB_SRQ_MAX_WR) { 1568 /* resize request not yet supported */ 1569 rv = -EOPNOTSUPP; 1570 goto out; 1571 } 1572 if (attr_mask & IB_SRQ_LIMIT) { 1573 if (attrs->srq_limit) { 1574 if (unlikely(attrs->srq_limit > srq->num_rqe)) { 1575 rv = -EINVAL; 1576 goto out; 1577 } 1578 srq->armed = 1; 1579 } else { 1580 srq->armed = 0; 1581 } 1582 srq->limit = attrs->srq_limit; 1583 } 1584 out: 1585 spin_unlock_irqrestore(&srq->lock, flags); 1586 1587 return rv; 1588 } 1589 1590 /* 1591 * siw_query_srq() 1592 * 1593 * Query SRQ attributes. 1594 */ 1595 int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) 1596 { 1597 struct siw_srq *srq = to_siw_srq(base_srq); 1598 unsigned long flags; 1599 1600 spin_lock_irqsave(&srq->lock, flags); 1601 1602 attrs->max_wr = srq->num_rqe; 1603 attrs->max_sge = srq->max_sge; 1604 attrs->srq_limit = srq->limit; 1605 1606 spin_unlock_irqrestore(&srq->lock, flags); 1607 1608 return 0; 1609 } 1610 1611 /* 1612 * siw_destroy_srq() 1613 * 1614 * Destroy SRQ. 1615 * It is assumed that the SRQ is not referenced by any 1616 * QP anymore - the code trusts the RDMA core environment to keep track 1617 * of QP references. 1618 */ 1619 void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) 1620 { 1621 struct siw_srq *srq = to_siw_srq(base_srq); 1622 struct siw_device *sdev = to_siw_dev(base_srq->device); 1623 struct siw_ucontext *ctx = 1624 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1625 base_ucontext); 1626 1627 if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) 1628 kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); 1629 1630 vfree(srq->recvq); 1631 atomic_dec(&sdev->num_srq); 1632 } 1633 1634 /* 1635 * siw_post_srq_recv() 1636 * 1637 * Post a list of receive queue elements to SRQ. 1638 * NOTE: The function does not check or lock a certain SRQ state 1639 * during the post operation. The code simply trusts the 1640 * RDMA core environment. 1641 * 1642 * @base_srq: Base SRQ contained in siw SRQ 1643 * @wr: List of R-WR's 1644 * @bad_wr: Updated to failing WR if posting fails. 1645 */ 1646 int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, 1647 const struct ib_recv_wr **bad_wr) 1648 { 1649 struct siw_srq *srq = to_siw_srq(base_srq); 1650 unsigned long flags; 1651 int rv = 0; 1652 1653 if (unlikely(!srq->kernel_verbs)) { 1654 siw_dbg_pd(base_srq->pd, 1655 "[SRQ]: no kernel post_recv for mapped srq\n"); 1656 rv = -EINVAL; 1657 goto out; 1658 } 1659 /* 1660 * Serialize potentially multiple producers. 1661 * Also needed to serialize potentially multiple 1662 * consumers. 1663 */ 1664 spin_lock_irqsave(&srq->lock, flags); 1665 1666 while (wr) { 1667 u32 idx = srq->rq_put % srq->num_rqe; 1668 struct siw_rqe *rqe = &srq->recvq[idx]; 1669 1670 if (rqe->flags) { 1671 siw_dbg_pd(base_srq->pd, "SRQ full\n"); 1672 rv = -ENOMEM; 1673 break; 1674 } 1675 if (unlikely(wr->num_sge > srq->max_sge)) { 1676 siw_dbg_pd(base_srq->pd, 1677 "[SRQ]: too many sge's: %d\n", wr->num_sge); 1678 rv = -EINVAL; 1679 break; 1680 } 1681 rqe->id = wr->wr_id; 1682 rqe->num_sge = wr->num_sge; 1683 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); 1684 1685 /* Make sure S-RQE is completely written before valid */ 1686 smp_wmb(); 1687 1688 rqe->flags = SIW_WQE_VALID; 1689 1690 srq->rq_put++; 1691 wr = wr->next; 1692 } 1693 spin_unlock_irqrestore(&srq->lock, flags); 1694 out: 1695 if (unlikely(rv < 0)) { 1696 siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv); 1697 *bad_wr = wr; 1698 } 1699 return rv; 1700 } 1701 1702 void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) 1703 { 1704 struct ib_event event; 1705 struct ib_qp *base_qp = qp->ib_qp; 1706 1707 /* 1708 * Do not report asynchronous errors on QP which gets 1709 * destroyed via verbs interface (siw_destroy_qp()) 1710 */ 1711 if (qp->attrs.flags & SIW_QP_IN_DESTROY) 1712 return; 1713 1714 event.event = etype; 1715 event.device = base_qp->device; 1716 event.element.qp = base_qp; 1717 1718 if (base_qp->event_handler) { 1719 siw_dbg_qp(qp, "reporting event %d\n", etype); 1720 base_qp->event_handler(&event, base_qp->qp_context); 1721 } 1722 } 1723 1724 void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) 1725 { 1726 struct ib_event event; 1727 struct ib_cq *base_cq = &cq->base_cq; 1728 1729 event.event = etype; 1730 event.device = base_cq->device; 1731 event.element.cq = base_cq; 1732 1733 if (base_cq->event_handler) { 1734 siw_dbg_cq(cq, "reporting CQ event %d\n", etype); 1735 base_cq->event_handler(&event, base_cq->cq_context); 1736 } 1737 } 1738 1739 void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) 1740 { 1741 struct ib_event event; 1742 struct ib_srq *base_srq = &srq->base_srq; 1743 1744 event.event = etype; 1745 event.device = base_srq->device; 1746 event.element.srq = base_srq; 1747 1748 if (base_srq->event_handler) { 1749 siw_dbg_pd(srq->base_srq.pd, 1750 "reporting SRQ event %d\n", etype); 1751 base_srq->event_handler(&event, base_srq->srq_context); 1752 } 1753 } 1754 1755 void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype) 1756 { 1757 struct ib_event event; 1758 1759 event.event = etype; 1760 event.device = &sdev->base_dev; 1761 event.element.port_num = port; 1762 1763 siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); 1764 1765 ib_dispatch_event(&event); 1766 } 1767