// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/xarray.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/uverbs_ioctl.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
};

static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
	[IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE",
	[IB_QPS_ERR] = "ERR"
};
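/*
 * siw_create_uobj()
 *
 * Register a queue array which is to be mmap()'d by user space.
 * The object is stored in the per-context xarray under a cyclically
 * allocated key; that key, shifted by PAGE_SHIFT, is reported back
 * to user space and serves as the mmap offset (see siw_mmap()).
 */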
static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
{
	struct siw_uobj *uobj;
	struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
	u32 key;

	uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
	if (!uobj)
		return SIW_INVAL_UOBJ_KEY;

	if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
			    GFP_KERNEL) < 0) {
		kfree(uobj);
		return SIW_INVAL_UOBJ_KEY;
	}
	uobj->size = PAGE_ALIGN(size);
	uobj->addr = vaddr;

	return key;
}

static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
				     unsigned long off, u32 size)
{
	struct siw_uobj *uobj = xa_load(&uctx->xa, off);

	if (uobj && uobj->size == size)
		return uobj;

	return NULL;
}
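/*
 * siw_mmap()
 *
 * Map a queue array (SQ, RQ, SRQ or CQ) previously registered via
 * siw_create_uobj() into user space. The mmap offset, seen here as
 * vm_pgoff, is the xarray key of the object; the requested length
 * must match the registered (page aligned) object size.
 *
 * Minimal user space sketch (illustrative only, not part of this file;
 * 'cmd_fd' is the open uverbs command FD, 'uresp' the response of the
 * corresponding create verb):
 *
 *	sq = mmap(NULL, num_sqe * sizeof(struct siw_sqe),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED,
 *		  cmd_fd, uresp.sq_key);
 */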
int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
{
	struct siw_ucontext *uctx = to_siw_ctx(ctx);
	struct siw_uobj *uobj;
	unsigned long off = vma->vm_pgoff;
	int size = vma->vm_end - vma->vm_start;
	int rv = -EINVAL;

	/*
	 * Must be page aligned
	 */
	if (vma->vm_start & (PAGE_SIZE - 1)) {
		pr_warn("siw: mmap not page aligned\n");
		goto out;
	}
	uobj = siw_get_uobj(uctx, off, size);
	if (!uobj) {
		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
			off, size);
		goto out;
	}
	rv = remap_vmalloc_range(vma, uobj->addr, 0);
	if (rv)
		pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
out:
	return rv;
}

int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_ctx->device);
	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
	struct siw_uresp_alloc_ctx uresp = {};
	int rv;

	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
		rv = -ENOMEM;
		goto err_out;
	}
	xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
	ctx->uobj_nextkey = 0;
	ctx->sdev = sdev;

	uresp.dev_id = sdev->vendor_part_id;

	if (udata->outlen < sizeof(uresp)) {
		rv = -EINVAL;
		goto err_out;
	}
	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
	if (rv)
		goto err_out;

	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
		atomic_read(&sdev->num_ctx));

	return 0;

err_out:
	atomic_dec(&sdev->num_ctx);
	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
		atomic_read(&sdev->num_ctx));

	return rv;
}

void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
{
	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
	void *entry;
	unsigned long index;

	/*
	 * Make sure all user mmap objects are gone. Since QP, CQ
	 * and SRQ destroy routines destroy related objects, nothing
	 * should be found here.
	 */
	xa_for_each(&uctx->xa, index, entry) {
		kfree(xa_erase(&uctx->xa, index));
		pr_warn("siw: dropping orphaned uobj at %lu\n", index);
	}
	xa_destroy(&uctx->xa);
	atomic_dec(&uctx->sdev->num_ctx);
}
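/*
 * siw_query_device()
 *
 * Report device capabilities and limits as configured in sdev->attrs.
 * siw neither provides nor expects private data for this verb, so any
 * udata in- or out-buffer is rejected.
 */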
int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
		     struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	if (udata->inlen || udata->outlen)
		return -EINVAL;

	memset(attr, 0, sizeof(*attr));

	/* Revisit atomic caps if RFC 7306 gets supported */
	attr->atomic_cap = 0;
	attr->device_cap_flags =
		IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
	attr->max_cq = sdev->attrs.max_cq;
	attr->max_cqe = sdev->attrs.max_cqe;
	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
	attr->max_fmr = sdev->attrs.max_fmr;
	attr->max_mr = sdev->attrs.max_mr;
	attr->max_mw = sdev->attrs.max_mw;
	attr->max_mr_size = ~0ull;
	attr->max_pd = sdev->attrs.max_pd;
	attr->max_qp = sdev->attrs.max_qp;
	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
	attr->max_qp_rd_atom = sdev->attrs.max_ord;
	attr->max_qp_wr = sdev->attrs.max_qp_wr;
	attr->max_recv_sge = sdev->attrs.max_sge;
	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
	attr->max_send_sge = sdev->attrs.max_sge;
	attr->max_sge_rd = sdev->attrs.max_sge_rd;
	attr->max_srq = sdev->attrs.max_srq;
	attr->max_srq_sge = sdev->attrs.max_srq_sge;
	attr->max_srq_wr = sdev->attrs.max_srq_wr;
	attr->page_size_cap = PAGE_SIZE;
	attr->vendor_id = SIW_VENDOR_ID;
	attr->vendor_part_id = sdev->vendor_part_id;

	memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);

	return 0;
}

int siw_query_port(struct ib_device *base_dev, u8 port,
		   struct ib_port_attr *attr)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	memset(attr, 0, sizeof(*attr));

	attr->active_speed = 2;
	attr->active_width = 2;
	attr->gid_tbl_len = 1;
	attr->max_msg_sz = -1;
	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	attr->active_mtu = attr->max_mtu;
	attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3;
	attr->pkey_tbl_len = 1;
	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
	attr->state = sdev->state;
	/*
	 * All zero
	 *
	 * attr->lid = 0;
	 * attr->bad_pkey_cntr = 0;
	 * attr->qkey_viol_cntr = 0;
	 * attr->sm_lid = 0;
	 * attr->lmc = 0;
	 * attr->max_vl_num = 0;
	 * attr->sm_sl = 0;
	 * attr->subnet_timeout = 0;
	 * attr->init_type_reply = 0;
	 */
	return 0;
}

int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
			   struct ib_port_immutable *port_immutable)
{
	struct ib_port_attr attr;
	int rv = siw_query_port(base_dev, port, &attr);

	if (rv)
		return rv;

	port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
	port_immutable->gid_tbl_len = attr.gid_tbl_len;
	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;

	return 0;
}

int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
{
	/* Report the default pkey */
	*pkey = 0xffff;
	return 0;
}

int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
		  union ib_gid *gid)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	/* subnet_prefix == interface_id == 0; */
	memset(gid, 0, sizeof(*gid));
	memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);

	return 0;
}
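/*
 * siw_alloc_pd() / siw_dealloc_pd()
 *
 * Protection domains are pure software bookkeeping in siw: allocation
 * only enforces the SIW_MAX_PD limit via a per-device counter; there
 * is no hardware object behind a PD.
 */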
int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
		atomic_dec(&sdev->num_pd);
		return -ENOMEM;
	}
	siw_dbg_pd(pd, "now %d PD's\n", atomic_read(&sdev->num_pd));

	return 0;
}

void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	siw_dbg_pd(pd, "free PD\n");
	atomic_dec(&sdev->num_pd);
}

void siw_qp_get_ref(struct ib_qp *base_qp)
{
	siw_qp_get(to_siw_qp(base_qp));
}

void siw_qp_put_ref(struct ib_qp *base_qp)
{
	siw_qp_put(to_siw_qp(base_qp));
}

/*
 * siw_create_qp()
 *
 * Create QP of requested size on given device.
 *
 * @pd:		Protection Domain
 * @attrs:	Initial QP attributes.
 * @udata:	used to provide QP ID, SQ and RQ size back to user.
 */

struct ib_qp *siw_create_qp(struct ib_pd *pd,
			    struct ib_qp_init_attr *attrs,
			    struct ib_udata *udata)
{
	struct siw_qp *qp = NULL;
	struct siw_base_qp *siw_base_qp = NULL;
	struct ib_device *base_dev = pd->device;
	struct siw_device *sdev = to_siw_dev(base_dev);
	struct siw_ucontext *uctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	struct siw_cq *scq = NULL, *rcq = NULL;
	unsigned long flags;
	int num_sqe, num_rqe, rv = 0;

	siw_dbg(base_dev, "create new QP\n");

	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
		siw_dbg(base_dev, "too many QP's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (attrs->qp_type != IB_QPT_RC) {
		siw_dbg(base_dev, "only RC QP's supported\n");
		rv = -EINVAL;
		goto err_out;
	}
	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
		siw_dbg(base_dev, "QP size error\n");
		rv = -EINVAL;
		goto err_out;
	}
	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
		siw_dbg(base_dev, "max inline send: %d > %d\n",
			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
		rv = -EINVAL;
		goto err_out;
	}
	/*
	 * NOTE: we allow for zero element SQ and RQ WQE's SGL's
	 * but not for a QP unable to hold any WQE (SQ + RQ)
	 */
	if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
		siw_dbg(base_dev, "QP must have send or receive queue\n");
		rv = -EINVAL;
		goto err_out;
	}
	scq = to_siw_cq(attrs->send_cq);
	rcq = to_siw_cq(attrs->recv_cq);

	if (!scq || (!rcq && !attrs->srq)) {
		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
		rv = -EINVAL;
		goto err_out;
	}
	siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
	if (!siw_base_qp) {
		rv = -ENOMEM;
		goto err_out;
	}
	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
	if (!qp) {
		rv = -ENOMEM;
		goto err_out;
	}
	siw_base_qp->qp = qp;
	qp->ib_qp = &siw_base_qp->base_qp;

	init_rwsem(&qp->state_lock);
	spin_lock_init(&qp->sq_lock);
	spin_lock_init(&qp->rq_lock);
	spin_lock_init(&qp->orq_lock);

	qp->kernel_verbs = !udata;
	qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
	qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;

	rv = siw_qp_add(sdev, qp);
	if (rv)
		goto err_out;

	/* All queue indices are derived from modulo operations
	 * on a free running 'get' (consumer) and 'put' (producer)
	 * unsigned counter. Having queue sizes at power of two
	 * avoids handling counter wrap around.
	 */
	num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
	num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);

	if (qp->kernel_verbs)
		qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
	else
		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));

	if (qp->sendq == NULL) {
		siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
		rv = -ENOMEM;
		goto err_out_xa;
	}
	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
		else {
			rv = -EINVAL;
			goto err_out_xa;
		}
	}
	qp->pd = pd;
	qp->scq = scq;
	qp->rcq = rcq;

	if (attrs->srq) {
		/*
		 * SRQ support.
		 * Verbs 6.3.7: ignore RQ size, if SRQ present
		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
		 */
		qp->srq = to_siw_srq(attrs->srq);
		qp->attrs.rq_size = 0;
		siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n",
			qp->qp_num, qp->srq);
	} else if (num_rqe) {
		if (qp->kernel_verbs)
			qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
		else
			qp->recvq =
				vmalloc_user(num_rqe * sizeof(struct siw_rqe));

		if (qp->recvq == NULL) {
			siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
			rv = -ENOMEM;
			goto err_out_xa;
		}
		qp->attrs.rq_size = num_rqe;
	}
	qp->attrs.sq_size = num_sqe;
	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;

	/* Make those two tunables fixed for now. */
	qp->tx_ctx.gso_seg_limit = 1;
	qp->tx_ctx.zcopy_tx = zcopy_tx;

	qp->attrs.state = SIW_QP_STATE_IDLE;

	if (udata) {
		struct siw_uresp_create_qp uresp = {};

		uresp.num_sqe = num_sqe;
		uresp.num_rqe = num_rqe;
		uresp.qp_id = qp_id(qp);

		if (qp->sendq) {
			qp->xa_sq_index =
				siw_create_uobj(uctx, qp->sendq,
					num_sqe * sizeof(struct siw_sqe));
		}
		if (qp->recvq) {
			qp->xa_rq_index =
				siw_create_uobj(uctx, qp->recvq,
					num_rqe * sizeof(struct siw_rqe));
		}
		if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
		    qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out_xa;
		}
		uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
		uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out_xa;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out_xa;
	}
	qp->tx_cpu = siw_get_tx_cpu(sdev);
	if (qp->tx_cpu < 0) {
		rv = -EINVAL;
		goto err_out_xa;
	}
	INIT_LIST_HEAD(&qp->devq);
	spin_lock_irqsave(&sdev->lock, flags);
	list_add_tail(&qp->devq, &sdev->qp_list);
	spin_unlock_irqrestore(&sdev->lock, flags);

	return qp->ib_qp;

err_out_xa:
	xa_erase(&sdev->qp_xa, qp_id(qp));
err_out:
	kfree(siw_base_qp);

	if (qp) {
		if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
		if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&uctx->xa, qp->xa_rq_index));

		vfree(qp->sendq);
		vfree(qp->recvq);
		kfree(qp);
	}
	atomic_dec(&sdev->num_qp);

	return ERR_PTR(rv);
}
/*
 * Minimum siw_query_qp() verb interface.
 *
 * @qp_attr_mask is not used but all available information is provided
 */
int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
{
	struct siw_qp *qp;
	struct siw_device *sdev;

	if (base_qp && qp_attr && qp_init_attr) {
		qp = to_siw_qp(base_qp);
		sdev = to_siw_dev(base_qp->device);
	} else {
		return -EINVAL;
	}
	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
	qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
	qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
	qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	qp_attr->max_rd_atomic = qp->attrs.irq_size;
	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;

	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
				   IB_ACCESS_REMOTE_WRITE |
				   IB_ACCESS_REMOTE_READ;

	qp_init_attr->qp_type = base_qp->qp_type;
	qp_init_attr->send_cq = base_qp->send_cq;
	qp_init_attr->recv_cq = base_qp->recv_cq;
	qp_init_attr->srq = base_qp->srq;

	qp_init_attr->cap = qp_attr->cap;

	return 0;
}
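/*
 * siw_verbs_modify_qp()
 *
 * Translate IB_QP_ACCESS_FLAGS and IB_QP_STATE from @attr into siw QP
 * attributes (using ib_qp_state_to_siw_qp_state for the state) and
 * apply them via siw_qp_modify() under the QP state lock. All other
 * attribute mask bits are silently ignored.
 */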
int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
			int attr_mask, struct ib_udata *udata)
{
	struct siw_qp_attrs new_attrs;
	enum siw_qp_attr_mask siw_attr_mask = 0;
	struct siw_qp *qp = to_siw_qp(base_qp);
	int rv = 0;

	if (!attr_mask)
		return 0;

	memset(&new_attrs, 0, sizeof(new_attrs));

	if (attr_mask & IB_QP_ACCESS_FLAGS) {
		siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;

		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
	}
	if (attr_mask & IB_QP_STATE) {
		siw_dbg_qp(qp, "desired IB QP state: %s\n",
			   ib_qp_state_to_string[attr->qp_state]);

		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];

		if (new_attrs.state > SIW_QP_STATE_RTS)
			qp->tx_ctx.tx_suspend = 1;

		siw_attr_mask |= SIW_QP_ATTR_STATE;
	}
	if (!siw_attr_mask)
		goto out;

	down_write(&qp->state_lock);

	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);

	up_write(&qp->state_lock);
out:
	return rv;
}
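/*
 * siw_destroy_qp()
 *
 * Free a QP: suspend RX processing, drop user space mmap objects of
 * SQ and RQ, move the QP to ERROR state, release an attached CEP and
 * finally drop the QP reference taken at creation time.
 */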
int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
	struct siw_ucontext *uctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	struct siw_qp_attrs qp_attrs;

	siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep);

	/*
	 * Mark QP as in process of destruction to prevent from
	 * any async callbacks to RDMA core
	 */
	qp->attrs.flags |= SIW_QP_IN_DESTROY;
	qp->rx_stream.rx_suspend = 1;

	if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
	if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&uctx->xa, qp->xa_rq_index));

	down_write(&qp->state_lock);

	qp_attrs.state = SIW_QP_STATE_ERROR;
	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);

	if (qp->cep) {
		siw_cep_put(qp->cep);
		qp->cep = NULL;
	}
	up_write(&qp->state_lock);

	kfree(qp->tx_ctx.mpa_crc_hd);
	kfree(qp->rx_stream.mpa_crc_hd);

	qp->scq = qp->rcq = NULL;

	siw_qp_put(qp);
	kfree(siw_base_qp);

	return 0;
}

/*
 * siw_copy_inline_sgl()
 *
 * Prepare the SGL of inlined data for sending. For userland callers,
 * the function checks if the given buffer addresses and lengths are
 * within process context bounds.
 * Data from all provided sge's is copied together into the wqe,
 * referenced by a single sge.
 */
static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
			       struct siw_sqe *sqe)
{
	struct ib_sge *core_sge = core_wr->sg_list;
	void *kbuf = &sqe->sge[1];
	int num_sge = core_wr->num_sge, bytes = 0;

	sqe->sge[0].laddr = (u64)kbuf;
	sqe->sge[0].lkey = 0;

	while (num_sge--) {
		if (!core_sge->length) {
			core_sge++;
			continue;
		}
		bytes += core_sge->length;
		if (bytes > SIW_MAX_INLINE) {
			bytes = -EINVAL;
			break;
		}
		memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
		       core_sge->length);

		kbuf += core_sge->length;
		core_sge++;
	}
	sqe->sge[0].length = bytes > 0 ? bytes : 0;
	sqe->num_sge = bytes > 0 ? 1 : 0;

	return bytes;
}
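/*
 * Illustrative kernel client usage of the send path below (a sketch
 * only, not code from this driver): a signalled RDMA WRITE would be
 * posted through the common ib_post_send() entry point, e.g.
 *
 *	struct ib_sge sge = {
 *		.addr = dma_addr, .length = len, .lkey = mr->lkey };
 *	struct ib_rdma_wr wr = {
 *		.wr = { .opcode = IB_WR_RDMA_WRITE, .sg_list = &sge,
 *			.num_sge = 1, .send_flags = IB_SEND_SIGNALED },
 *		.remote_addr = raddr, .rkey = rkey };
 *	const struct ib_send_wr *bad_wr;
 *	int ret = ib_post_send(qp, &wr.wr, &bad_wr);
 */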
/*
 * siw_post_send()
 *
 * Post a list of S-WR's to an SQ.
 *
 * @base_qp:	Base QP contained in siw QP
 * @wr:		Null terminated list of user WR's
 * @bad_wr:	Points to failing WR in case of synchronous failure.
 */
int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
		  const struct ib_send_wr **bad_wr)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	struct siw_wqe *wqe = tx_wqe(qp);

	unsigned long flags;
	int rv = 0;

	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients' needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
		return -ENOTCONN;
	}
	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
		up_read(&qp->state_lock);
		*bad_wr = wr;
		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
		return -ENOTCONN;
	}
	if (wr && !qp->kernel_verbs) {
		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	while (wr) {
		u32 idx = qp->sq_put % qp->attrs.sq_size;
		struct siw_sqe *sqe = &qp->sendq[idx];

		if (sqe->flags) {
			siw_dbg_qp(qp, "sq full\n");
			rv = -ENOMEM;
			break;
		}
		if (wr->num_sge > qp->attrs.sq_max_sges) {
			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		sqe->id = wr->wr_id;

		if ((wr->send_flags & IB_SEND_SIGNALED) ||
		    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
			sqe->flags |= SIW_WQE_SIGNALLED;

		if (wr->send_flags & IB_SEND_FENCE)
			sqe->flags |= SIW_WQE_READ_FENCE;

		switch (wr->opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_INV:
			if (wr->send_flags & IB_SEND_SOLICITED)
				sqe->flags |= SIW_WQE_SOLICITED;

			if (!(wr->send_flags & IB_SEND_INLINE)) {
				siw_copy_sgl(wr->sg_list, sqe->sge,
					     wr->num_sge);
				sqe->num_sge = wr->num_sge;
			} else {
				rv = siw_copy_inline_sgl(wr, sqe);
				if (rv <= 0) {
					rv = -EINVAL;
					break;
				}
				sqe->flags |= SIW_WQE_INLINE;
				sqe->num_sge = 1;
			}
			if (wr->opcode == IB_WR_SEND)
				sqe->opcode = SIW_OP_SEND;
			else {
				sqe->opcode = SIW_OP_SEND_REMOTE_INV;
				sqe->rkey = wr->ex.invalidate_rkey;
			}
			break;

		case IB_WR_RDMA_READ_WITH_INV:
		case IB_WR_RDMA_READ:
			/*
			 * iWarp restricts RREAD sink to SGL containing
			 * 1 SGE only. we could relax to SGL with multiple
			 * elements referring the SAME ltag or even sending
			 * a private per-rreq tag referring to a checked
			 * local sgl with MULTIPLE ltag's.
			 */
			if (unlikely(wr->num_sge != 1)) {
				rv = -EINVAL;
				break;
			}
			siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
			/*
			 * NOTE: zero length RREAD is allowed!
			 */
			sqe->raddr = rdma_wr(wr)->remote_addr;
			sqe->rkey = rdma_wr(wr)->rkey;
			sqe->num_sge = 1;

			if (wr->opcode == IB_WR_RDMA_READ)
				sqe->opcode = SIW_OP_READ;
			else
				sqe->opcode = SIW_OP_READ_LOCAL_INV;
			break;

		case IB_WR_RDMA_WRITE:
			if (!(wr->send_flags & IB_SEND_INLINE)) {
				siw_copy_sgl(wr->sg_list, &sqe->sge[0],
					     wr->num_sge);
				sqe->num_sge = wr->num_sge;
			} else {
				rv = siw_copy_inline_sgl(wr, sqe);
				if (unlikely(rv < 0)) {
					rv = -EINVAL;
					break;
				}
				sqe->flags |= SIW_WQE_INLINE;
				sqe->num_sge = 1;
			}
			sqe->raddr = rdma_wr(wr)->remote_addr;
			sqe->rkey = rdma_wr(wr)->rkey;
			sqe->opcode = SIW_OP_WRITE;
			break;

		case IB_WR_REG_MR:
			sqe->base_mr = (uint64_t)reg_wr(wr)->mr;
			sqe->rkey = reg_wr(wr)->key;
			sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
			sqe->opcode = SIW_OP_REG_MR;
			break;

		case IB_WR_LOCAL_INV:
			sqe->rkey = wr->ex.invalidate_rkey;
			sqe->opcode = SIW_OP_INVAL_STAG;
			break;

		default:
			siw_dbg_qp(qp, "ib wr type %d unsupported\n",
				   wr->opcode);
			rv = -EINVAL;
			break;
		}
		siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n",
			   sqe->opcode, sqe->flags, (void *)sqe->id);

		if (unlikely(rv < 0))
			break;

		/* make SQE only valid after completely written */
		smp_wmb();
		sqe->flags |= SIW_WQE_VALID;

		qp->sq_put++;
		wr = wr->next;
	}

	/*
	 * Send directly if SQ processing is not in progress.
	 * Eventual immediate errors (rv < 0) do not affect the involved
	 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
	 * processing, if new work is already pending. But rv must be passed
	 * to caller.
	 */
	if (wqe->wr_status != SIW_WR_IDLE) {
		spin_unlock_irqrestore(&qp->sq_lock, flags);
		goto skip_direct_sending;
	}
	rv = siw_activate_tx(qp);
	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (rv <= 0)
		goto skip_direct_sending;

	if (qp->kernel_verbs) {
		rv = siw_sq_start(qp);
	} else {
		qp->tx_ctx.in_syscall = 1;

		if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
			siw_qp_cm_drop(qp, 0);

		qp->tx_ctx.in_syscall = 0;
	}
skip_direct_sending:

	up_read(&qp->state_lock);

	if (rv >= 0)
		return 0;
	/*
	 * Immediate error
	 */
	siw_dbg_qp(qp, "error %d\n", rv);

	*bad_wr = wr;
	return rv;
}
/*
 * siw_post_receive()
 *
 * Post a list of R-WR's to an RQ.
 *
 * @base_qp:	Base QP contained in siw QP
 * @wr:		Null terminated list of user WR's
 * @bad_wr:	Points to failing WR in case of synchronous failure.
 */
int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
		     const struct ib_recv_wr **bad_wr)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	unsigned long flags;
	int rv = 0;

	if (qp->srq) {
		*bad_wr = wr;
		return -EOPNOTSUPP; /* what else from errno.h? */
	}
	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients' needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		return -ENOTCONN;
	}
	if (!qp->kernel_verbs) {
		siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	if (qp->attrs.state > SIW_QP_STATE_RTS) {
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	/*
	 * Serialize potentially multiple producers.
	 * Not needed for single threaded consumer side.
	 */
	spin_lock_irqsave(&qp->rq_lock, flags);

	while (wr) {
		u32 idx = qp->rq_put % qp->attrs.rq_size;
		struct siw_rqe *rqe = &qp->recvq[idx];

		if (rqe->flags) {
			siw_dbg_qp(qp, "RQ full\n");
			rv = -ENOMEM;
			break;
		}
		if (wr->num_sge > qp->attrs.rq_max_sges) {
			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		rqe->id = wr->wr_id;
		rqe->num_sge = wr->num_sge;
		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);

		/* make sure RQE is completely written before valid */
		smp_wmb();

		rqe->flags = SIW_WQE_VALID;

		qp->rq_put++;
		wr = wr->next;
	}
	spin_unlock_irqrestore(&qp->rq_lock, flags);

	up_read(&qp->state_lock);

	if (rv < 0) {
		siw_dbg_qp(qp, "error %d\n", rv);
		*bad_wr = wr;
	}
	return rv > 0 ? 0 : rv;
}
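/*
 * siw_destroy_cq()
 *
 * Flush pending completions, remove a user space mapping of the CQ
 * array if present, and free the queue memory.
 */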
void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
{
	struct siw_cq *cq = to_siw_cq(base_cq);
	struct siw_device *sdev = to_siw_dev(base_cq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);

	siw_dbg_cq(cq, "free CQ resources\n");

	siw_cq_flush(cq);

	if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&ctx->xa, cq->xa_cq_index));

	atomic_dec(&sdev->num_cq);

	vfree(cq->queue);
}

/*
 * siw_create_cq()
 *
 * Populate CQ of requested size
 *
 * @base_cq: CQ as allocated by RDMA midlayer
 * @attr: Initial CQ attributes
 * @udata: relates to user context
 */

int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
		  struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_cq->device);
	struct siw_cq *cq = to_siw_cq(base_cq);
	int rv, size = attr->cqe;

	if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
		siw_dbg(base_cq->device, "too many CQ's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (size < 1 || size > sdev->attrs.max_cqe) {
		siw_dbg(base_cq->device, "CQ size error: %d\n", size);
		rv = -EINVAL;
		goto err_out;
	}
	size = roundup_pow_of_two(size);
	cq->base_cq.cqe = size;
	cq->num_cqe = size;
	cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;

	if (!udata) {
		cq->kernel_verbs = 1;
		cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
				    sizeof(struct siw_cq_ctrl));
	} else {
		cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
					 sizeof(struct siw_cq_ctrl));
	}
	if (cq->queue == NULL) {
		rv = -ENOMEM;
		goto err_out;
	}
	get_random_bytes(&cq->id, 4);
	siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);

	spin_lock_init(&cq->lock);

	cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify;

	if (udata) {
		struct siw_uresp_create_cq uresp = {};
		struct siw_ucontext *ctx =
			rdma_udata_to_drv_context(udata, struct siw_ucontext,
						  base_ucontext);

		cq->xa_cq_index =
			siw_create_uobj(ctx, cq->queue,
					size * sizeof(struct siw_cqe) +
						sizeof(struct siw_cq_ctrl));
		if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out;
		}
		uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
		uresp.cq_id = cq->id;
		uresp.num_cqe = size;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	return 0;

err_out:
	siw_dbg(base_cq->device, "CQ creation failed: %d", rv);

	if (cq && cq->queue) {
		struct siw_ucontext *ctx =
			rdma_udata_to_drv_context(udata, struct siw_ucontext,
						  base_ucontext);
		if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
		vfree(cq->queue);
	}
	atomic_dec(&sdev->num_cq);

	return rv;
}

/*
 * siw_poll_cq()
 *
 * Reap CQ entries if available and copy work completion status into
 * array of WC's provided by caller. Returns number of reaped CQE's.
 *
 * @base_cq:	Base CQ contained in siw CQ.
 * @num_cqe:	Maximum number of CQE's to reap.
 * @wc:		Array of work completions to be filled by siw.
 */
int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
{
	struct siw_cq *cq = to_siw_cq(base_cq);
	int i;

	for (i = 0; i < num_cqe; i++) {
		if (!siw_reap_cqe(cq, wc))
			break;
		wc++;
	}
	return i;
}

/*
 * siw_req_notify_cq()
 *
 * Request notification for new CQE's added to that CQ.
 * Defined flags:
 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
 *   event if a WQE with notification flag set enters the CQ
 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
 *   event if a WQE enters the CQ.
 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
 *   number of not reaped CQE's regardless of its notification
 *   type and current or new CQ notification settings.
 *
 * @base_cq:	Base CQ contained in siw CQ.
 * @flags:	Requested notification flags.
 */
int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
{
	struct siw_cq *cq = to_siw_cq(base_cq);

	siw_dbg_cq(cq, "flags: 0x%02x\n", flags);

	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
		/* CQ event for next solicited completion */
		smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED);
	else
		/* CQ event for any signalled completion */
		smp_store_mb(*cq->notify, SIW_NOTIFY_ALL);

	if (flags & IB_CQ_REPORT_MISSED_EVENTS)
		return cq->cq_put - cq->cq_get;

	return 0;
}

/*
 * siw_dereg_mr()
 *
 * Release Memory Region.
 *
 * @base_mr: Base MR contained in siw MR.
 * @udata: points to user context, unused.
 */
int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
{
	struct siw_mr *mr = to_siw_mr(base_mr);
	struct siw_device *sdev = to_siw_dev(base_mr->device);

	siw_dbg_mem(mr->mem, "deregister MR\n");

	atomic_dec(&sdev->num_mr);

	siw_mr_drop_mem(mr);
	kfree_rcu(mr, rcu);

	return 0;
}
/*
 * siw_reg_user_mr()
 *
 * Register Memory Region.
 *
 * @pd:		Protection Domain
 * @start:	starting address of MR (virtual address)
 * @len:	len of MR
 * @rnic_va:	not used by siw
 * @rights:	MR access rights
 * @udata:	user buffer to communicate STag and Key.
 */
struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
			      u64 rnic_va, int rights, struct ib_udata *udata)
{
	struct siw_mr *mr = NULL;
	struct siw_umem *umem = NULL;
	struct siw_ureq_reg_mr ureq;
	struct siw_device *sdev = to_siw_dev(pd->device);

	unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
	int rv;

	siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n",
		   (unsigned long long)start, (unsigned long long)rnic_va,
		   (unsigned long long)len);

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (!len) {
		rv = -EINVAL;
		goto err_out;
	}
	if (mem_limit != RLIM_INFINITY) {
		unsigned long num_pages =
			(PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
		mem_limit >>= PAGE_SHIFT;

		if (num_pages > mem_limit - current->mm->locked_vm) {
			siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
				   num_pages, mem_limit,
				   current->mm->locked_vm);
			rv = -ENOMEM;
			goto err_out;
		}
	}
	umem = siw_umem_get(start, len, ib_access_writable(rights));
	if (IS_ERR(umem)) {
		rv = PTR_ERR(umem);
		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
		umem = NULL;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
	if (rv)
		goto err_out;

	if (udata) {
		struct siw_uresp_reg_mr uresp = {};
		struct siw_mem *mem = mr->mem;

		if (udata->inlen < sizeof(ureq)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
		if (rv)
			goto err_out;

		mr->base_mr.lkey |= ureq.stag_key;
		mr->base_mr.rkey |= ureq.stag_key;
		mem->stag |= ureq.stag_key;
		uresp.stag = mem->stag;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	mr->mem->stag_valid = 1;

	return &mr->base_mr;

err_out:
	atomic_dec(&sdev->num_mr);
	if (mr) {
		if (mr->mem)
			siw_mr_drop_mem(mr);
		kfree_rcu(mr, rcu);
	} else {
		if (umem)
			siw_umem_release(umem, false);
	}
	return ERR_PTR(rv);
}
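/*
 * siw_alloc_mr()
 *
 * Allocate a memory region for fast registration (IB_MR_TYPE_MEM_REG
 * only), backed by a physical buffer list of up to @max_sge entries
 * which gets populated later via siw_map_mr_sg().
 */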
struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			   u32 max_sge, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mr *mr = NULL;
	struct siw_pbl *pbl = NULL;
	int rv;

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (mr_type != IB_MR_TYPE_MEM_REG) {
		siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
		rv = -EOPNOTSUPP;
		goto err_out;
	}
	if (max_sge > SIW_MAX_SGE_PBL) {
		siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
		rv = -ENOMEM;
		goto err_out;
	}
	pbl = siw_pbl_alloc(max_sge);
	if (IS_ERR(pbl)) {
		rv = PTR_ERR(pbl);
		siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
		pbl = NULL;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
	if (rv)
		goto err_out;

	mr->mem->is_pbl = 1;

	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);

	return &mr->base_mr;

err_out:
	atomic_dec(&sdev->num_mr);

	if (!mr) {
		kfree(pbl);
	} else {
		if (mr->mem)
			siw_mr_drop_mem(mr);
		kfree_rcu(mr, rcu);
	}
	siw_dbg_pd(pd, "failed: %d\n", rv);

	return ERR_PTR(rv);
}

/* Just used to count number of pages being mapped */
static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
{
	return 0;
}
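/*
 * siw_map_mr_sg()
 *
 * Populate the MR's physical buffer list from a DMA-mapped scatterlist.
 * Physically adjacent scatterlist elements are merged into a single PBL
 * entry; ib_sg_to_pages() derives length and IOVA of the resulting
 * region.
 */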
int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
		  unsigned int *sg_off)
{
	struct scatterlist *slp;
	struct siw_mr *mr = to_siw_mr(base_mr);
	struct siw_mem *mem = mr->mem;
	struct siw_pbl *pbl = mem->pbl;
	struct siw_pble *pble;
	u64 pbl_size;
	int i, rv;

	if (!pbl) {
		siw_dbg_mem(mem, "no PBL allocated\n");
		return -EINVAL;
	}
	pble = pbl->pbe;

	if (pbl->max_buf < num_sle) {
		siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
			    mem->pbl->max_buf, num_sle);
		return -ENOMEM;
	}
	for_each_sg(sl, slp, num_sle, i) {
		if (sg_dma_len(slp) == 0) {
			siw_dbg_mem(mem, "empty SGE\n");
			return -EINVAL;
		}
		if (i == 0) {
			pble->addr = sg_dma_address(slp);
			pble->size = sg_dma_len(slp);
			pble->pbl_off = 0;
			pbl_size = pble->size;
			pbl->num_buf = 1;
		} else {
			/* Merge PBL entries if adjacent */
			if (pble->addr + pble->size == sg_dma_address(slp)) {
				pble->size += sg_dma_len(slp);
			} else {
				pble++;
				pbl->num_buf++;
				pble->addr = sg_dma_address(slp);
				pble->size = sg_dma_len(slp);
				pble->pbl_off = pbl_size;
			}
			pbl_size += sg_dma_len(slp);
		}
		siw_dbg_mem(mem,
			    "sge[%d], size %llu, addr 0x%016llx, total %llu\n",
			    i, pble->size, pble->addr, pbl_size);
	}
	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
	if (rv > 0) {
		mem->len = base_mr->length;
		mem->va = base_mr->iova;
		siw_dbg_mem(mem,
			    "%llu bytes, start 0x%016llx, %u SLE to %u entries\n",
			    mem->len, mem->va, num_sle, pbl->num_buf);
	}
	return rv;
}

/*
 * siw_get_dma_mr()
 *
 * Create a (empty) DMA memory region, where no umem is attached.
 */
struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mr *mr = NULL;
	int rv;

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
	if (rv)
		goto err_out;

	mr->mem->stag_valid = 1;

	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);

	return &mr->base_mr;

err_out:
	if (rv)
		kfree(mr);

	atomic_dec(&sdev->num_mr);

	return ERR_PTR(rv);
}

/*
 * siw_create_srq()
 *
 * Create Shared Receive Queue of attributes @init_attrs
 * within protection domain given by @pd.
 *
 * @base_srq:	Base SRQ contained in siw SRQ.
 * @init_attrs:	SRQ init attributes.
 * @udata:	points to user context
 */
int siw_create_srq(struct ib_srq *base_srq,
		   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	struct ib_srq_attr *attrs = &init_attrs->attr;
	struct siw_device *sdev = to_siw_dev(base_srq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	int rv;

	if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
		siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
	    attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
		rv = -EINVAL;
		goto err_out;
	}
	srq->max_sge = attrs->max_sge;
	srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
	srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
	srq->limit = attrs->srq_limit;
	if (srq->limit)
		srq->armed = 1;

	srq->kernel_verbs = !udata;

	if (udata)
		srq->recvq =
			vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
	else
		srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));

	if (srq->recvq == NULL) {
		rv = -ENOMEM;
		goto err_out;
	}
	if (udata) {
		struct siw_uresp_create_srq uresp = {};

		srq->xa_srq_index = siw_create_uobj(
			ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));

		if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out;
		}
		uresp.srq_key = srq->xa_srq_index;
		uresp.num_rqe = srq->num_rqe;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	spin_lock_init(&srq->lock);

	siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq);

	return 0;

err_out:
	if (srq->recvq) {
		if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
		vfree(srq->recvq);
	}
	atomic_dec(&sdev->num_srq);

	return rv;
}

/*
 * siw_modify_srq()
 *
 * Modify SRQ. The caller may resize SRQ and/or set/reset notification
 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
 *
 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
 * parameter. siw_modify_srq() does not check the attrs->max_sge param.
 */
int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;
	int rv = 0;

	spin_lock_irqsave(&srq->lock, flags);

	if (attr_mask & IB_SRQ_MAX_WR) {
		/* resize request not yet supported */
		rv = -EOPNOTSUPP;
		goto out;
	}
	if (attr_mask & IB_SRQ_LIMIT) {
		if (attrs->srq_limit) {
			if (unlikely(attrs->srq_limit > srq->num_rqe)) {
				rv = -EINVAL;
				goto out;
			}
			srq->armed = 1;
		} else {
			srq->armed = 0;
		}
		srq->limit = attrs->srq_limit;
	}
out:
	spin_unlock_irqrestore(&srq->lock, flags);

	return rv;
}
/*
 * siw_query_srq()
 *
 * Query SRQ attributes.
 */
int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;

	spin_lock_irqsave(&srq->lock, flags);

	attrs->max_wr = srq->num_rqe;
	attrs->max_sge = srq->max_sge;
	attrs->srq_limit = srq->limit;

	spin_unlock_irqrestore(&srq->lock, flags);

	return 0;
}

/*
 * siw_destroy_srq()
 *
 * Destroy SRQ.
 * It is assumed that the SRQ is not referenced by any
 * QP anymore - the code trusts the RDMA core environment to keep track
 * of QP references.
 */
void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	struct siw_device *sdev = to_siw_dev(base_srq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);

	if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&ctx->xa, srq->xa_srq_index));

	vfree(srq->recvq);
	atomic_dec(&sdev->num_srq);
}

/*
 * siw_post_srq_recv()
 *
 * Post a list of receive queue elements to SRQ.
 * NOTE: The function does not check or lock a certain SRQ state
 *       during the post operation. The code simply trusts the
 *       RDMA core environment.
 *
 * @base_srq:	Base SRQ contained in siw SRQ
 * @wr:		List of R-WR's
 * @bad_wr:	Updated to failing WR if posting fails.
 */
int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
		      const struct ib_recv_wr **bad_wr)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;
	int rv = 0;

	if (unlikely(!srq->kernel_verbs)) {
		siw_dbg_pd(base_srq->pd,
			   "[SRQ 0x%p]: no kernel post_recv for mapped srq\n",
			   srq);
		rv = -EINVAL;
		goto out;
	}
	/*
	 * Serialize potentially multiple producers.
	 * Also needed to serialize potentially multiple
	 * consumers.
	 */
	spin_lock_irqsave(&srq->lock, flags);

	while (wr) {
		u32 idx = srq->rq_put % srq->num_rqe;
		struct siw_rqe *rqe = &srq->recvq[idx];

		if (rqe->flags) {
			siw_dbg_pd(base_srq->pd, "SRQ full\n");
			rv = -ENOMEM;
			break;
		}
		if (unlikely(wr->num_sge > srq->max_sge)) {
			siw_dbg_pd(base_srq->pd,
				   "[SRQ 0x%p]: too many sge's: %d\n", srq,
				   wr->num_sge);
			rv = -EINVAL;
			break;
		}
		rqe->id = wr->wr_id;
		rqe->num_sge = wr->num_sge;
		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);

		/* Make sure S-RQE is completely written before valid */
		smp_wmb();

		rqe->flags = SIW_WQE_VALID;

		srq->rq_put++;
		wr = wr->next;
	}
	spin_unlock_irqrestore(&srq->lock, flags);
out:
	if (unlikely(rv < 0)) {
		siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv);
		*bad_wr = wr;
	}
	return rv;
}
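/*
 * Asynchronous event helpers: construct an ib_event for the affected
 * QP, CQ, SRQ or port and deliver it to the consumer's registered
 * event handler (or dispatch it globally for port events).
 */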
void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_qp *base_qp = qp->ib_qp;

	/*
	 * Do not report asynchronous errors on QP which gets
	 * destroyed via verbs interface (siw_destroy_qp())
	 */
	if (qp->attrs.flags & SIW_QP_IN_DESTROY)
		return;

	event.event = etype;
	event.device = base_qp->device;
	event.element.qp = base_qp;

	if (base_qp->event_handler) {
		siw_dbg_qp(qp, "reporting event %d\n", etype);
		base_qp->event_handler(&event, base_qp->qp_context);
	}
}

void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_cq *base_cq = &cq->base_cq;

	event.event = etype;
	event.device = base_cq->device;
	event.element.cq = base_cq;

	if (base_cq->event_handler) {
		siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
		base_cq->event_handler(&event, base_cq->cq_context);
	}
}

void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_srq *base_srq = &srq->base_srq;

	event.event = etype;
	event.device = base_srq->device;
	event.element.srq = base_srq;

	if (base_srq->event_handler) {
		siw_dbg_pd(srq->base_srq.pd,
			   "reporting SRQ event %d\n", etype);
		base_srq->event_handler(&event, base_srq->srq_context);
	}
}

void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
{
	struct ib_event event;

	event.event = etype;
	event.device = &sdev->base_dev;
	event.element.port_num = port;

	siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);

	ib_dispatch_event(&event);
}