// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/xarray.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/uverbs_ioctl.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
};

static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
	[IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE",
	[IB_QPS_ERR] = "ERR"
};

static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
{
	struct siw_uobj *uobj;
	struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
	u32 key;

	uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
	if (!uobj)
		return SIW_INVAL_UOBJ_KEY;

	if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
			    GFP_KERNEL) < 0) {
		kfree(uobj);
		return SIW_INVAL_UOBJ_KEY;
	}
	uobj->size = PAGE_ALIGN(size);
	uobj->addr = vaddr;

	return key;
}

static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
				     unsigned long off, u32 size)
{
	struct siw_uobj *uobj = xa_load(&uctx->xa, off);

	if (uobj && uobj->size == size)
		return uobj;

	return NULL;
}

int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
{
	struct siw_ucontext *uctx = to_siw_ctx(ctx);
	struct siw_uobj *uobj;
	unsigned long off = vma->vm_pgoff;
	int size = vma->vm_end - vma->vm_start;
	int rv = -EINVAL;

	/*
	 * Must be page aligned
	 */
	if (vma->vm_start & (PAGE_SIZE - 1)) {
		pr_warn("siw: mmap not page aligned\n");
		goto out;
	}
	uobj = siw_get_uobj(uctx, off, size);
	if (!uobj) {
		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
			off, size);
		goto out;
	}
	rv = remap_vmalloc_range(vma, uobj->addr, 0);
	if (rv)
		pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
out:
	return rv;
}

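/*
 * Note on the mmap key scheme implemented by siw_create_uobj() and
 * siw_mmap() above: siw_create_uobj() stores the user object in the
 * per-context xarray and returns its xarray index as key. The key is
 * handed back to user space shifted by PAGE_SHIFT (see the sq_key/
 * rq_key/cq_key setup in siw_create_qp() and siw_create_cq() below),
 * so a later mmap() of that offset arrives here as vma->vm_pgoff and
 * can be used directly for the xa_load() lookup in siw_get_uobj().
 */
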
int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_ctx->device);
	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
	struct siw_uresp_alloc_ctx uresp = {};
	int rv;

	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
		rv = -ENOMEM;
		goto err_out;
	}
	xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
	ctx->uobj_nextkey = 0;
	ctx->sdev = sdev;

	uresp.dev_id = sdev->vendor_part_id;

	if (udata->outlen < sizeof(uresp)) {
		rv = -EINVAL;
		goto err_out;
	}
	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
	if (rv)
		goto err_out;

	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
		atomic_read(&sdev->num_ctx));

	return 0;

err_out:
	atomic_dec(&sdev->num_ctx);
	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
		atomic_read(&sdev->num_ctx));

	return rv;
}

void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
{
	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
	void *entry;
	unsigned long index;

	/*
	 * Make sure all user mmap objects are gone. Since QP, CQ
	 * and SRQ destroy routines destroy related objects, nothing
	 * should be found here.
	 */
	xa_for_each(&uctx->xa, index, entry) {
		kfree(xa_erase(&uctx->xa, index));
		pr_warn("siw: dropping orphaned uobj at %lu\n", index);
	}
	xa_destroy(&uctx->xa);
	atomic_dec(&uctx->sdev->num_ctx);
}

int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
		     struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	if (udata->inlen || udata->outlen)
		return -EINVAL;

	memset(attr, 0, sizeof(*attr));

	/* Revisit atomic caps if RFC 7306 gets supported */
	attr->atomic_cap = 0;
	attr->device_cap_flags =
		IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
	attr->max_cq = sdev->attrs.max_cq;
	attr->max_cqe = sdev->attrs.max_cqe;
	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
	attr->max_fmr = sdev->attrs.max_fmr;
	attr->max_mr = sdev->attrs.max_mr;
	attr->max_mw = sdev->attrs.max_mw;
	attr->max_mr_size = ~0ull;
	attr->max_pd = sdev->attrs.max_pd;
	attr->max_qp = sdev->attrs.max_qp;
	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
	attr->max_qp_rd_atom = sdev->attrs.max_ord;
	attr->max_qp_wr = sdev->attrs.max_qp_wr;
	attr->max_recv_sge = sdev->attrs.max_sge;
	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
	attr->max_send_sge = sdev->attrs.max_sge;
	attr->max_sge_rd = sdev->attrs.max_sge_rd;
	attr->max_srq = sdev->attrs.max_srq;
	attr->max_srq_sge = sdev->attrs.max_srq_sge;
	attr->max_srq_wr = sdev->attrs.max_srq_wr;
	attr->page_size_cap = PAGE_SIZE;
	attr->vendor_id = SIW_VENDOR_ID;
	attr->vendor_part_id = sdev->vendor_part_id;

	memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);

	return 0;
}

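/*
 * Note: both the system image GUID reported above and the single GID
 * returned by siw_query_gid() below are derived from the 6-byte MAC
 * address of the attached netdev; the remaining bytes stay zero.
 */
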
int siw_query_port(struct ib_device *base_dev, u8 port,
		   struct ib_port_attr *attr)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	memset(attr, 0, sizeof(*attr));

	attr->active_speed = 2;
	attr->active_width = 2;
	attr->gid_tbl_len = 1;
	attr->max_msg_sz = -1;
	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	attr->active_mtu = attr->max_mtu;
	attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
	attr->pkey_tbl_len = 1;
	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
	attr->state = sdev->state;
	/*
	 * All zero
	 *
	 * attr->lid = 0;
	 * attr->bad_pkey_cntr = 0;
	 * attr->qkey_viol_cntr = 0;
	 * attr->sm_lid = 0;
	 * attr->lmc = 0;
	 * attr->max_vl_num = 0;
	 * attr->sm_sl = 0;
	 * attr->subnet_timeout = 0;
	 * attr->init_type_reply = 0;
	 */
	return 0;
}

int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
			   struct ib_port_immutable *port_immutable)
{
	struct ib_port_attr attr;
	int rv = siw_query_port(base_dev, port, &attr);

	if (rv)
		return rv;

	port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
	port_immutable->gid_tbl_len = attr.gid_tbl_len;
	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;

	return 0;
}

int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
{
	/* Report the default pkey */
	*pkey = 0xffff;
	return 0;
}

int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
		  union ib_gid *gid)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	/* subnet_prefix == interface_id == 0; */
	memset(gid, 0, sizeof(*gid));
	memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);

	return 0;
}

int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
		atomic_dec(&sdev->num_pd);
		return -ENOMEM;
	}
	siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));

	return 0;
}

void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	siw_dbg_pd(pd, "free PD\n");
	atomic_dec(&sdev->num_pd);
}

void siw_qp_get_ref(struct ib_qp *base_qp)
{
	siw_qp_get(to_siw_qp(base_qp));
}

void siw_qp_put_ref(struct ib_qp *base_qp)
{
	siw_qp_put(to_siw_qp(base_qp));
}

/*
 * siw_create_qp()
 *
 * Create QP of requested size on given device.
 *
 * @pd: Protection Domain
 * @attrs: Initial QP attributes.
 * @udata: used to provide QP ID, SQ and RQ size back to user.
 */

struct ib_qp *siw_create_qp(struct ib_pd *pd,
			    struct ib_qp_init_attr *attrs,
			    struct ib_udata *udata)
{
	struct siw_qp *qp = NULL;
	struct siw_base_qp *siw_base_qp = NULL;
	struct ib_device *base_dev = pd->device;
	struct siw_device *sdev = to_siw_dev(base_dev);
	struct siw_ucontext *uctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	struct siw_cq *scq = NULL, *rcq = NULL;
	unsigned long flags;
	int num_sqe, num_rqe, rv = 0;

	siw_dbg(base_dev, "create new QP\n");

	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
		siw_dbg(base_dev, "too many QP's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (attrs->qp_type != IB_QPT_RC) {
		siw_dbg(base_dev, "only RC QP's supported\n");
		rv = -EINVAL;
		goto err_out;
	}
	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
		siw_dbg(base_dev, "QP size error\n");
		rv = -EINVAL;
		goto err_out;
	}
	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
		siw_dbg(base_dev, "max inline send: %d > %d\n",
			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
		rv = -EINVAL;
		goto err_out;
	}
	/*
	 * NOTE: we allow for zero element SQ and RQ WQE's SGL's
	 * but not for a QP unable to hold any WQE (SQ + RQ)
	 */
	if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
		siw_dbg(base_dev, "QP must have send or receive queue\n");
		rv = -EINVAL;
		goto err_out;
	}
	scq = to_siw_cq(attrs->send_cq);
	rcq = to_siw_cq(attrs->recv_cq);

	if (!scq || (!rcq && !attrs->srq)) {
		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
		rv = -EINVAL;
		goto err_out;
	}
	siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
	if (!siw_base_qp) {
		rv = -ENOMEM;
		goto err_out;
	}
	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
	if (!qp) {
		rv = -ENOMEM;
		goto err_out;
	}
	siw_base_qp->qp = qp;
	qp->ib_qp = &siw_base_qp->base_qp;

	init_rwsem(&qp->state_lock);
	spin_lock_init(&qp->sq_lock);
	spin_lock_init(&qp->rq_lock);
	spin_lock_init(&qp->orq_lock);

	qp->kernel_verbs = !udata;
	qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
	qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;

	rv = siw_qp_add(sdev, qp);
	if (rv)
		goto err_out;

	/* All queue indices are derived from modulo operations
	 * on a free running 'get' (consumer) and 'put' (producer)
	 * unsigned counter. Having queue sizes at power of two
	 * avoids handling counter wrap around.
	 */
	num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
	num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);

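	/*
	 * Example: with a queue size of 8 and a free running unsigned
	 * 32-bit 'put' counter, 'put % 8' equals 'put & 7', so the ring
	 * index stays consistent across counter wrap around:
	 * put = 0xfffffffe -> 6, 0xffffffff -> 7, 0x0 (wrapped) -> 0.
	 */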
	if (qp->kernel_verbs)
		qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
	else
		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));

	if (qp->sendq == NULL) {
		siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
		rv = -ENOMEM;
		goto err_out_xa;
	}
	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
		else {
			rv = -EINVAL;
			goto err_out_xa;
		}
	}
	qp->pd = pd;
	qp->scq = scq;
	qp->rcq = rcq;

	if (attrs->srq) {
		/*
		 * SRQ support.
		 * Verbs 6.3.7: ignore RQ size, if SRQ present
		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
		 */
		qp->srq = to_siw_srq(attrs->srq);
		qp->attrs.rq_size = 0;
		siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->qp_num);
	} else if (num_rqe) {
		if (qp->kernel_verbs)
			qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
		else
			qp->recvq =
				vmalloc_user(num_rqe * sizeof(struct siw_rqe));

		if (qp->recvq == NULL) {
			siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
			rv = -ENOMEM;
			goto err_out_xa;
		}
		qp->attrs.rq_size = num_rqe;
	}
	qp->attrs.sq_size = num_sqe;
	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;

	/* Make those two tunables fixed for now. */
	qp->tx_ctx.gso_seg_limit = 1;
	qp->tx_ctx.zcopy_tx = zcopy_tx;

	qp->attrs.state = SIW_QP_STATE_IDLE;

	if (udata) {
		struct siw_uresp_create_qp uresp = {};

		uresp.num_sqe = num_sqe;
		uresp.num_rqe = num_rqe;
		uresp.qp_id = qp_id(qp);

		if (qp->sendq) {
			qp->xa_sq_index =
				siw_create_uobj(uctx, qp->sendq,
					num_sqe * sizeof(struct siw_sqe));
		}
		if (qp->recvq) {
			qp->xa_rq_index =
				siw_create_uobj(uctx, qp->recvq,
					num_rqe * sizeof(struct siw_rqe));
		}
		if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
		    qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out_xa;
		}
		uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
		uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out_xa;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out_xa;
	}
	qp->tx_cpu = siw_get_tx_cpu(sdev);
	if (qp->tx_cpu < 0) {
		rv = -EINVAL;
		goto err_out_xa;
	}
	INIT_LIST_HEAD(&qp->devq);
	spin_lock_irqsave(&sdev->lock, flags);
	list_add_tail(&qp->devq, &sdev->qp_list);
	spin_unlock_irqrestore(&sdev->lock, flags);

	return qp->ib_qp;

err_out_xa:
	xa_erase(&sdev->qp_xa, qp_id(qp));
err_out:
	kfree(siw_base_qp);

	if (qp) {
		if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
		if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&uctx->xa, qp->xa_rq_index));

		vfree(qp->sendq);
		vfree(qp->recvq);
		kfree(qp);
	}
	atomic_dec(&sdev->num_qp);

	return ERR_PTR(rv);
}

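/*
 * Illustrative example (not part of the driver API): a kernel ULP
 * creating a QP on a siw device via ib_create_qp() must stay within
 * the limits checked in siw_create_qp() above, e.g.
 *
 *	struct ib_qp_init_attr init = {
 *		.qp_type	= IB_QPT_RC,
 *		.sq_sig_type	= IB_SIGNAL_REQ_WR,
 *		.send_cq	= scq,
 *		.recv_cq	= rcq,
 *		.cap		= { .max_send_wr = 64, .max_recv_wr = 64,
 *				    .max_send_sge = 2, .max_recv_sge = 2 },
 *	};
 *
 * Requested queue sizes are rounded up to the next power of two.
 */
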
/*
 * Minimum siw_query_qp() verb interface.
 *
 * @qp_attr_mask is not used but all available information is provided
 */
int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
{
	struct siw_qp *qp;
	struct siw_device *sdev;

	if (base_qp && qp_attr && qp_init_attr) {
		qp = to_siw_qp(base_qp);
		sdev = to_siw_dev(base_qp->device);
	} else {
		return -EINVAL;
	}
	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
	qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
	qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
	qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	qp_attr->max_rd_atomic = qp->attrs.irq_size;
	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;

	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
				   IB_ACCESS_REMOTE_WRITE |
				   IB_ACCESS_REMOTE_READ;

	qp_init_attr->qp_type = base_qp->qp_type;
	qp_init_attr->send_cq = base_qp->send_cq;
	qp_init_attr->recv_cq = base_qp->recv_cq;
	qp_init_attr->srq = base_qp->srq;

	qp_init_attr->cap = qp_attr->cap;

	return 0;
}

int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
			int attr_mask, struct ib_udata *udata)
{
	struct siw_qp_attrs new_attrs;
	enum siw_qp_attr_mask siw_attr_mask = 0;
	struct siw_qp *qp = to_siw_qp(base_qp);
	int rv = 0;

	if (!attr_mask)
		return 0;

	memset(&new_attrs, 0, sizeof(new_attrs));

	if (attr_mask & IB_QP_ACCESS_FLAGS) {
		siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;

		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
	}
	if (attr_mask & IB_QP_STATE) {
		siw_dbg_qp(qp, "desired IB QP state: %s\n",
			   ib_qp_state_to_string[attr->qp_state]);

		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];

		if (new_attrs.state > SIW_QP_STATE_RTS)
			qp->tx_ctx.tx_suspend = 1;

		siw_attr_mask |= SIW_QP_ATTR_STATE;
	}
	if (!siw_attr_mask)
		goto out;

	down_write(&qp->state_lock);

	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);

	up_write(&qp->state_lock);
out:
	return rv;
}

int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
	struct siw_ucontext *uctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	struct siw_qp_attrs qp_attrs;

	siw_dbg_qp(qp, "state %d\n", qp->attrs.state);

	/*
	 * Mark QP as in process of destruction to prevent from
	 * any async callbacks to RDMA core
	 */
	qp->attrs.flags |= SIW_QP_IN_DESTROY;
	qp->rx_stream.rx_suspend = 1;

	if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
	if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&uctx->xa, qp->xa_rq_index));

	down_write(&qp->state_lock);

	qp_attrs.state = SIW_QP_STATE_ERROR;
	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);

	if (qp->cep) {
		siw_cep_put(qp->cep);
		qp->cep = NULL;
	}
	up_write(&qp->state_lock);

	kfree(qp->tx_ctx.mpa_crc_hd);
	kfree(qp->rx_stream.mpa_crc_hd);

	qp->scq = qp->rcq = NULL;

	siw_qp_put(qp);
	kfree(siw_base_qp);

	return 0;
}

/*
 * siw_copy_inline_sgl()
 *
 * Prepare sgl of inlined data for sending. For userland callers
 * function checks if given buffer addresses and len's are within
 * process context bounds.
 * Data from all provided sge's are copied together into the wqe,
 * referenced by a single sge.
 */
static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
			       struct siw_sqe *sqe)
{
	struct ib_sge *core_sge = core_wr->sg_list;
	void *kbuf = &sqe->sge[1];
	int num_sge = core_wr->num_sge, bytes = 0;

	sqe->sge[0].laddr = (uintptr_t)kbuf;
	sqe->sge[0].lkey = 0;

	while (num_sge--) {
		if (!core_sge->length) {
			core_sge++;
			continue;
		}
		bytes += core_sge->length;
		if (bytes > SIW_MAX_INLINE) {
			bytes = -EINVAL;
			break;
		}
		memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
		       core_sge->length);

		kbuf += core_sge->length;
		core_sge++;
	}
	sqe->sge[0].length = bytes > 0 ? bytes : 0;
	sqe->num_sge = bytes > 0 ? 1 : 0;

	return bytes;
}

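/*
 * Example for siw_copy_inline_sgl() above: posting two SGEs of 16 and
 * 32 bytes with IB_SEND_INLINE set copies 48 bytes into the SQE
 * itself, starting at &sqe->sge[1]. sge[0] then describes that kernel
 * buffer (laddr pointing at &sqe->sge[1], lkey 0, length 48) and the
 * WQE is sent with a single SGE.
 */
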
/*
 * siw_post_send()
 *
 * Post a list of S-WR's to a SQ.
 *
 * @base_qp: Base QP contained in siw QP
 * @wr: Null terminated list of user WR's
 * @bad_wr: Points to failing WR in case of synchronous failure.
 */
int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
		  const struct ib_send_wr **bad_wr)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	struct siw_wqe *wqe = tx_wqe(qp);

	unsigned long flags;
	int rv = 0;

	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients' needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
		return -ENOTCONN;
	}
	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
		up_read(&qp->state_lock);
		*bad_wr = wr;
		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
		return -ENOTCONN;
	}
	if (wr && !qp->kernel_verbs) {
		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

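	/*
	 * SQE handover protocol: a slot whose flags field is zero is
	 * owned by this producer. All SQE fields are written first;
	 * only after the smp_wmb() below is SIW_WQE_VALID set, which
	 * passes ownership to SQ processing. A non-zero flags field
	 * means the slot was not yet consumed, i.e. the SQ is full.
	 */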
	while (wr) {
		u32 idx = qp->sq_put % qp->attrs.sq_size;
		struct siw_sqe *sqe = &qp->sendq[idx];

		if (sqe->flags) {
			siw_dbg_qp(qp, "sq full\n");
			rv = -ENOMEM;
			break;
		}
		if (wr->num_sge > qp->attrs.sq_max_sges) {
			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		sqe->id = wr->wr_id;

		if ((wr->send_flags & IB_SEND_SIGNALED) ||
		    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
			sqe->flags |= SIW_WQE_SIGNALLED;

		if (wr->send_flags & IB_SEND_FENCE)
			sqe->flags |= SIW_WQE_READ_FENCE;

		switch (wr->opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_INV:
			if (wr->send_flags & IB_SEND_SOLICITED)
				sqe->flags |= SIW_WQE_SOLICITED;

			if (!(wr->send_flags & IB_SEND_INLINE)) {
				siw_copy_sgl(wr->sg_list, sqe->sge,
					     wr->num_sge);
				sqe->num_sge = wr->num_sge;
			} else {
				rv = siw_copy_inline_sgl(wr, sqe);
				if (rv <= 0) {
					rv = -EINVAL;
					break;
				}
				sqe->flags |= SIW_WQE_INLINE;
				sqe->num_sge = 1;
			}
			if (wr->opcode == IB_WR_SEND)
				sqe->opcode = SIW_OP_SEND;
			else {
				sqe->opcode = SIW_OP_SEND_REMOTE_INV;
				sqe->rkey = wr->ex.invalidate_rkey;
			}
			break;

		case IB_WR_RDMA_READ_WITH_INV:
		case IB_WR_RDMA_READ:
			/*
			 * iWarp restricts RREAD sink to SGL containing
			 * 1 SGE only. we could relax to SGL with multiple
			 * elements referring the SAME ltag or even sending
			 * a private per-rreq tag referring to a checked
			 * local sgl with MULTIPLE ltag's.
			 */
			if (unlikely(wr->num_sge != 1)) {
				rv = -EINVAL;
				break;
			}
			siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
			/*
			 * NOTE: zero length RREAD is allowed!
			 */
			sqe->raddr = rdma_wr(wr)->remote_addr;
			sqe->rkey = rdma_wr(wr)->rkey;
			sqe->num_sge = 1;

			if (wr->opcode == IB_WR_RDMA_READ)
				sqe->opcode = SIW_OP_READ;
			else
				sqe->opcode = SIW_OP_READ_LOCAL_INV;
			break;

		case IB_WR_RDMA_WRITE:
			if (!(wr->send_flags & IB_SEND_INLINE)) {
				siw_copy_sgl(wr->sg_list, &sqe->sge[0],
					     wr->num_sge);
				sqe->num_sge = wr->num_sge;
			} else {
				rv = siw_copy_inline_sgl(wr, sqe);
				if (unlikely(rv < 0)) {
					rv = -EINVAL;
					break;
				}
				sqe->flags |= SIW_WQE_INLINE;
				sqe->num_sge = 1;
			}
			sqe->raddr = rdma_wr(wr)->remote_addr;
			sqe->rkey = rdma_wr(wr)->rkey;
			sqe->opcode = SIW_OP_WRITE;
			break;

		case IB_WR_REG_MR:
			sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
			sqe->rkey = reg_wr(wr)->key;
			sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
			sqe->opcode = SIW_OP_REG_MR;
			break;

		case IB_WR_LOCAL_INV:
			sqe->rkey = wr->ex.invalidate_rkey;
			sqe->opcode = SIW_OP_INVAL_STAG;
			break;

		default:
			siw_dbg_qp(qp, "ib wr type %d unsupported\n",
				   wr->opcode);
			rv = -EINVAL;
			break;
		}
		siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
			   sqe->opcode, sqe->flags,
			   (void *)(uintptr_t)sqe->id);

		if (unlikely(rv < 0))
			break;

		/* make SQE only valid after completely written */
		smp_wmb();
		sqe->flags |= SIW_WQE_VALID;

		qp->sq_put++;
		wr = wr->next;
	}

	/*
	 * Send directly if SQ processing is not in progress.
	 * Eventual immediate errors (rv < 0) do not affect the involved
	 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
	 * processing, if new work is already pending. But rv must be passed
	 * to caller.
	 */
	if (wqe->wr_status != SIW_WR_IDLE) {
		spin_unlock_irqrestore(&qp->sq_lock, flags);
		goto skip_direct_sending;
	}
	rv = siw_activate_tx(qp);
	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (rv <= 0)
		goto skip_direct_sending;

	if (qp->kernel_verbs) {
		rv = siw_sq_start(qp);
	} else {
		qp->tx_ctx.in_syscall = 1;

		if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
			siw_qp_cm_drop(qp, 0);

		qp->tx_ctx.in_syscall = 0;
	}
skip_direct_sending:

	up_read(&qp->state_lock);

	if (rv >= 0)
		return 0;
	/*
	 * Immediate error
	 */
	siw_dbg_qp(qp, "error %d\n", rv);

	*bad_wr = wr;
	return rv;
}

/*
 * siw_post_receive()
 *
 * Post a list of R-WR's to a RQ.
 *
 * @base_qp: Base QP contained in siw QP
 * @wr: Null terminated list of user WR's
 * @bad_wr: Points to failing WR in case of synchronous failure.
 */
int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
		     const struct ib_recv_wr **bad_wr)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	unsigned long flags;
	int rv = 0;

	if (qp->srq) {
		*bad_wr = wr;
		return -EOPNOTSUPP; /* what else from errno.h? */
	}
	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients' needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		return -ENOTCONN;
	}
	if (!qp->kernel_verbs) {
		siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	if (qp->attrs.state > SIW_QP_STATE_RTS) {
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	/*
	 * Serialize potentially multiple producers.
	 * Not needed for single threaded consumer side.
	 */
	spin_lock_irqsave(&qp->rq_lock, flags);

	while (wr) {
		u32 idx = qp->rq_put % qp->attrs.rq_size;
		struct siw_rqe *rqe = &qp->recvq[idx];

		if (rqe->flags) {
			siw_dbg_qp(qp, "RQ full\n");
			rv = -ENOMEM;
			break;
		}
		if (wr->num_sge > qp->attrs.rq_max_sges) {
			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		rqe->id = wr->wr_id;
		rqe->num_sge = wr->num_sge;
		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);

		/* make sure RQE is completely written before valid */
		smp_wmb();

		rqe->flags = SIW_WQE_VALID;

		qp->rq_put++;
		wr = wr->next;
	}
	spin_unlock_irqrestore(&qp->rq_lock, flags);

	up_read(&qp->state_lock);

	if (rv < 0) {
		siw_dbg_qp(qp, "error %d\n", rv);
		*bad_wr = wr;
	}
	return rv > 0 ? 0 : rv;
}

void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
{
	struct siw_cq *cq = to_siw_cq(base_cq);
	struct siw_device *sdev = to_siw_dev(base_cq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);

	siw_dbg_cq(cq, "free CQ resources\n");

	siw_cq_flush(cq);

	if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&ctx->xa, cq->xa_cq_index));

	atomic_dec(&sdev->num_cq);

	vfree(cq->queue);
}

/*
 * siw_create_cq()
 *
 * Populate CQ of requested size
 *
 * @base_cq: CQ as allocated by RDMA midlayer
 * @attr: Initial CQ attributes
 * @udata: relates to user context
 */

int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
		  struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_cq->device);
	struct siw_cq *cq = to_siw_cq(base_cq);
	int rv, size = attr->cqe;

	if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
		siw_dbg(base_cq->device, "too many CQ's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (size < 1 || size > sdev->attrs.max_cqe) {
		siw_dbg(base_cq->device, "CQ size error: %d\n", size);
		rv = -EINVAL;
		goto err_out;
	}
	size = roundup_pow_of_two(size);
	cq->base_cq.cqe = size;
	cq->num_cqe = size;
	cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;

	if (!udata) {
		cq->kernel_verbs = 1;
		cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
				    sizeof(struct siw_cq_ctrl));
	} else {
		cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
					 sizeof(struct siw_cq_ctrl));
	}
	if (cq->queue == NULL) {
		rv = -ENOMEM;
		goto err_out;
	}
	get_random_bytes(&cq->id, 4);
	siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);

	spin_lock_init(&cq->lock);

	cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];

	if (udata) {
		struct siw_uresp_create_cq uresp = {};
		struct siw_ucontext *ctx =
			rdma_udata_to_drv_context(udata, struct siw_ucontext,
						  base_ucontext);

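		/*
		 * Expose the CQE array together with the trailing
		 * siw_cq_ctrl notification word (cq->notify, set up
		 * above) to user space as a single mmap-able object.
		 */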
		cq->xa_cq_index =
			siw_create_uobj(ctx, cq->queue,
					size * sizeof(struct siw_cqe) +
						sizeof(struct siw_cq_ctrl));
		if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out;
		}
		uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
		uresp.cq_id = cq->id;
		uresp.num_cqe = size;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	return 0;

err_out:
	siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);

	if (cq && cq->queue) {
		struct siw_ucontext *ctx =
			rdma_udata_to_drv_context(udata, struct siw_ucontext,
						  base_ucontext);
		if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
		vfree(cq->queue);
	}
	atomic_dec(&sdev->num_cq);

	return rv;
}

/*
 * siw_poll_cq()
 *
 * Reap CQ entries if available and copy work completion status into
 * array of WC's provided by caller. Returns number of reaped CQE's.
 *
 * @base_cq: Base CQ contained in siw CQ.
 * @num_cqe: Maximum number of CQE's to reap.
 * @wc: Array of work completions to be filled by siw.
 */
int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
{
	struct siw_cq *cq = to_siw_cq(base_cq);
	int i;

	for (i = 0; i < num_cqe; i++) {
		if (!siw_reap_cqe(cq, wc))
			break;
		wc++;
	}
	return i;
}

/*
 * siw_req_notify_cq()
 *
 * Request notification for new CQE's added to that CQ.
 * Defined flags:
 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
 *   event if a WQE with notification flag set enters the CQ
 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
 *   event if a WQE enters the CQ.
 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
 *   number of not reaped CQE's regardless of its notification
 *   type and current or new CQ notification settings.
 *
 * @base_cq: Base CQ contained in siw CQ.
 * @flags: Requested notification flags.
 */
int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
{
	struct siw_cq *cq = to_siw_cq(base_cq);

	siw_dbg_cq(cq, "flags: 0x%02x\n", flags);

	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
		/*
		 * Enable CQ event for next solicited completion
		 * and make it visible to all associated producers.
		 */
		smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
	else
		/*
		 * Enable CQ event for any signalled completion
		 * and make it visible to all associated producers.
		 */
		smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);

	if (flags & IB_CQ_REPORT_MISSED_EVENTS)
		return cq->cq_put - cq->cq_get;

	return 0;
}

/*
 * siw_dereg_mr()
 *
 * Release Memory Region.
 *
 * @base_mr: Base MR contained in siw MR.
 * @udata: points to user context, unused.
 */
int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
{
	struct siw_mr *mr = to_siw_mr(base_mr);
	struct siw_device *sdev = to_siw_dev(base_mr->device);

	siw_dbg_mem(mr->mem, "deregister MR\n");

	atomic_dec(&sdev->num_mr);

	siw_mr_drop_mem(mr);
	kfree_rcu(mr, rcu);

	return 0;
}

/*
 * siw_reg_user_mr()
 *
 * Register Memory Region.
 *
 * @pd: Protection Domain
 * @start: starting address of MR (virtual address)
 * @len: len of MR
 * @rnic_va: not used by siw
 * @rights: MR access rights
 * @udata: user buffer to communicate STag and Key.
 */
struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
			      u64 rnic_va, int rights, struct ib_udata *udata)
{
	struct siw_mr *mr = NULL;
	struct siw_umem *umem = NULL;
	struct siw_ureq_reg_mr ureq;
	struct siw_device *sdev = to_siw_dev(pd->device);

	unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
	int rv;

	siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
		   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
		   (unsigned long long)len);

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (!len) {
		rv = -EINVAL;
		goto err_out;
	}
	if (mem_limit != RLIM_INFINITY) {
		unsigned long num_pages =
			(PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
		mem_limit >>= PAGE_SHIFT;

		if (num_pages > mem_limit - current->mm->locked_vm) {
			siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
				   num_pages, mem_limit,
				   current->mm->locked_vm);
			rv = -ENOMEM;
			goto err_out;
		}
	}
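	/*
	 * Illustrative example for the RLIMIT_MEMLOCK check above:
	 * with a 64 MiB limit and 4 KiB pages, mem_limit becomes 16384
	 * pages; registering 1 MiB starting at a non page aligned
	 * address needs 257 pages, which must still fit after
	 * subtracting the pages already accounted in
	 * current->mm->locked_vm.
	 */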
	umem = siw_umem_get(start, len, ib_access_writable(rights));
	if (IS_ERR(umem)) {
		rv = PTR_ERR(umem);
		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
		umem = NULL;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
	if (rv)
		goto err_out;

	if (udata) {
		struct siw_uresp_reg_mr uresp = {};
		struct siw_mem *mem = mr->mem;

		if (udata->inlen < sizeof(ureq)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
		if (rv)
			goto err_out;

		mr->base_mr.lkey |= ureq.stag_key;
		mr->base_mr.rkey |= ureq.stag_key;
		mem->stag |= ureq.stag_key;
		uresp.stag = mem->stag;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	mr->mem->stag_valid = 1;

	return &mr->base_mr;

err_out:
	atomic_dec(&sdev->num_mr);
	if (mr) {
		if (mr->mem)
			siw_mr_drop_mem(mr);
		kfree_rcu(mr, rcu);
	} else {
		if (umem)
			siw_umem_release(umem, false);
	}
	return ERR_PTR(rv);
}

struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			   u32 max_sge, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mr *mr = NULL;
	struct siw_pbl *pbl = NULL;
	int rv;

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (mr_type != IB_MR_TYPE_MEM_REG) {
		siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
		rv = -EOPNOTSUPP;
		goto err_out;
	}
	if (max_sge > SIW_MAX_SGE_PBL) {
		siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
		rv = -ENOMEM;
		goto err_out;
	}
	pbl = siw_pbl_alloc(max_sge);
	if (IS_ERR(pbl)) {
		rv = PTR_ERR(pbl);
		siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
		pbl = NULL;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
	if (rv)
		goto err_out;

	mr->mem->is_pbl = 1;

	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);

	return &mr->base_mr;

err_out:
	atomic_dec(&sdev->num_mr);

	if (!mr) {
		kfree(pbl);
	} else {
		if (mr->mem)
			siw_mr_drop_mem(mr);
		kfree_rcu(mr, rcu);
	}
	siw_dbg_pd(pd, "failed: %d\n", rv);

	return ERR_PTR(rv);
}

/* Just used to count number of pages being mapped */
static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
{
	return 0;
}

int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
		  unsigned int *sg_off)
{
	struct scatterlist *slp;
	struct siw_mr *mr = to_siw_mr(base_mr);
	struct siw_mem *mem = mr->mem;
	struct siw_pbl *pbl = mem->pbl;
	struct siw_pble *pble;
	unsigned long pbl_size;
	int i, rv;

	if (!pbl) {
		siw_dbg_mem(mem, "no PBL allocated\n");
		return -EINVAL;
	}
	pble = pbl->pbe;

	if (pbl->max_buf < num_sle) {
		siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
			    mem->pbl->max_buf, num_sle);
		return -ENOMEM;
	}
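	/*
	 * Build the PBL from the scatterlist, merging physically
	 * adjacent segments below. Example: two 4 KiB segments at DMA
	 * addresses 0x10000 and 0x11000 collapse into one 8 KiB pble
	 * starting at 0x10000.
	 */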
	for_each_sg(sl, slp, num_sle, i) {
		if (sg_dma_len(slp) == 0) {
			siw_dbg_mem(mem, "empty SGE\n");
			return -EINVAL;
		}
		if (i == 0) {
			pble->addr = sg_dma_address(slp);
			pble->size = sg_dma_len(slp);
			pble->pbl_off = 0;
			pbl_size = pble->size;
			pbl->num_buf = 1;
		} else {
			/* Merge PBL entries if adjacent */
			if (pble->addr + pble->size == sg_dma_address(slp)) {
				pble->size += sg_dma_len(slp);
			} else {
				pble++;
				pbl->num_buf++;
				pble->addr = sg_dma_address(slp);
				pble->size = sg_dma_len(slp);
				pble->pbl_off = pbl_size;
			}
			pbl_size += sg_dma_len(slp);
		}
		siw_dbg_mem(mem,
			    "sge[%d], size %u, addr 0x%p, total %lu\n",
			    i, pble->size, (void *)(uintptr_t)pble->addr,
			    pbl_size);
	}
	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
	if (rv > 0) {
		mem->len = base_mr->length;
		mem->va = base_mr->iova;
		siw_dbg_mem(mem,
			    "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
			    mem->len, (void *)(uintptr_t)mem->va, num_sle,
			    pbl->num_buf);
	}
	return rv;
}

/*
 * siw_get_dma_mr()
 *
 * Create a (empty) DMA memory region, where no umem is attached.
 */
struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mr *mr = NULL;
	int rv;

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
	if (rv)
		goto err_out;

	mr->mem->stag_valid = 1;

	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);

	return &mr->base_mr;

err_out:
	if (rv)
		kfree(mr);

	atomic_dec(&sdev->num_mr);

	return ERR_PTR(rv);
}

/*
 * siw_create_srq()
 *
 * Create Shared Receive Queue of attributes @init_attrs
 * within protection domain given by @pd.
 *
 * @base_srq: Base SRQ contained in siw SRQ.
 * @init_attrs: SRQ init attributes.
 * @udata: points to user context
 */
int siw_create_srq(struct ib_srq *base_srq,
		   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	struct ib_srq_attr *attrs = &init_attrs->attr;
	struct siw_device *sdev = to_siw_dev(base_srq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	int rv;

	if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
		siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
	    attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
		rv = -EINVAL;
		goto err_out;
	}
	srq->max_sge = attrs->max_sge;
	srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
	srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
	srq->limit = attrs->srq_limit;
	if (srq->limit)
		srq->armed = 1;

	srq->kernel_verbs = !udata;

	if (udata)
		srq->recvq =
			vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
	else
		srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));

	if (srq->recvq == NULL) {
		rv = -ENOMEM;
		goto err_out;
	}
	if (udata) {
		struct siw_uresp_create_srq uresp = {};

		srq->xa_srq_index = siw_create_uobj(
			ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));

		if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out;
		}
		uresp.srq_key = srq->xa_srq_index;
		uresp.num_rqe = srq->num_rqe;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	spin_lock_init(&srq->lock);

	siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");

	return 0;

err_out:
	if (srq->recvq) {
		if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
		vfree(srq->recvq);
	}
	atomic_dec(&sdev->num_srq);

	return rv;
}

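/*
 * Note on SRQ limit arming (siw_create_srq() above, siw_modify_srq()
 * below): a non-zero srq_limit arms the SRQ (srq->armed). The receive
 * path is then expected to raise an IB_EVENT_SRQ_LIMIT_REACHED event
 * via siw_srq_event() once the SRQ fill level drops below that limit;
 * a limit of zero disarms the SRQ.
 */
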
/*
 * siw_modify_srq()
 *
 * Modify SRQ. The caller may resize SRQ and/or set/reset notification
 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
 *
 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
 * parameter. siw_modify_srq() does not check the attrs->max_sge param.
 */
int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;
	int rv = 0;

	spin_lock_irqsave(&srq->lock, flags);

	if (attr_mask & IB_SRQ_MAX_WR) {
		/* resize request not yet supported */
		rv = -EOPNOTSUPP;
		goto out;
	}
	if (attr_mask & IB_SRQ_LIMIT) {
		if (attrs->srq_limit) {
			if (unlikely(attrs->srq_limit > srq->num_rqe)) {
				rv = -EINVAL;
				goto out;
			}
			srq->armed = 1;
		} else {
			srq->armed = 0;
		}
		srq->limit = attrs->srq_limit;
	}
out:
	spin_unlock_irqrestore(&srq->lock, flags);

	return rv;
}

/*
 * siw_query_srq()
 *
 * Query SRQ attributes.
 */
int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;

	spin_lock_irqsave(&srq->lock, flags);

	attrs->max_wr = srq->num_rqe;
	attrs->max_sge = srq->max_sge;
	attrs->srq_limit = srq->limit;

	spin_unlock_irqrestore(&srq->lock, flags);

	return 0;
}

/*
 * siw_destroy_srq()
 *
 * Destroy SRQ.
 * It is assumed that the SRQ is not referenced by any
 * QP anymore - the code trusts the RDMA core environment to keep track
 * of QP references.
 */
void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	struct siw_device *sdev = to_siw_dev(base_srq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);

	if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&ctx->xa, srq->xa_srq_index));

	vfree(srq->recvq);
	atomic_dec(&sdev->num_srq);
}

/*
 * siw_post_srq_recv()
 *
 * Post a list of receive queue elements to SRQ.
 * NOTE: The function does not check or lock a certain SRQ state
 *       during the post operation. The code simply trusts the
 *       RDMA core environment.
 *
 * @base_srq: Base SRQ contained in siw SRQ
 * @wr: List of R-WR's
 * @bad_wr: Updated to failing WR if posting fails.
 */
int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
		      const struct ib_recv_wr **bad_wr)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;
	int rv = 0;

	if (unlikely(!srq->kernel_verbs)) {
		siw_dbg_pd(base_srq->pd,
			   "[SRQ]: no kernel post_recv for mapped srq\n");
		rv = -EINVAL;
		goto out;
	}
	/*
	 * Serialize potentially multiple producers.
	 * Also needed to serialize potentially multiple
	 * consumers.
	 */
	spin_lock_irqsave(&srq->lock, flags);

	while (wr) {
		u32 idx = srq->rq_put % srq->num_rqe;
		struct siw_rqe *rqe = &srq->recvq[idx];

		if (rqe->flags) {
			siw_dbg_pd(base_srq->pd, "SRQ full\n");
			rv = -ENOMEM;
			break;
		}
		if (unlikely(wr->num_sge > srq->max_sge)) {
			siw_dbg_pd(base_srq->pd,
				   "[SRQ]: too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		rqe->id = wr->wr_id;
		rqe->num_sge = wr->num_sge;
		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);

		/* Make sure S-RQE is completely written before valid */
		smp_wmb();

		rqe->flags = SIW_WQE_VALID;

		srq->rq_put++;
		wr = wr->next;
	}
	spin_unlock_irqrestore(&srq->lock, flags);
out:
	if (unlikely(rv < 0)) {
		siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
		*bad_wr = wr;
	}
	return rv;
}

void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_qp *base_qp = qp->ib_qp;

	/*
	 * Do not report asynchronous errors on QP which gets
	 * destroyed via verbs interface (siw_destroy_qp())
	 */
	if (qp->attrs.flags & SIW_QP_IN_DESTROY)
		return;

	event.event = etype;
	event.device = base_qp->device;
	event.element.qp = base_qp;

	if (base_qp->event_handler) {
		siw_dbg_qp(qp, "reporting event %d\n", etype);
		base_qp->event_handler(&event, base_qp->qp_context);
	}
}

void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_cq *base_cq = &cq->base_cq;

	event.event = etype;
	event.device = base_cq->device;
	event.element.cq = base_cq;

	if (base_cq->event_handler) {
		siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
		base_cq->event_handler(&event, base_cq->cq_context);
	}
}

void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_srq *base_srq = &srq->base_srq;

	event.event = etype;
	event.device = base_srq->device;
	event.element.srq = base_srq;

	if (base_srq->event_handler) {
		siw_dbg_pd(srq->base_srq.pd,
			   "reporting SRQ event %d\n", etype);
		base_srq->event_handler(&event, base_srq->srq_context);
	}
}

void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
{
	struct ib_event event;

	event.event = etype;
	event.device = &sdev->base_dev;
	event.element.port_num = port;

	siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);

	ib_dispatch_event(&event);
}