1 /* 2 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. 3 * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. 4 * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 */ 34 #include <linux/kernel.h> 35 #include <linux/module.h> 36 #include <linux/slab.h> 37 #include <linux/delay.h> 38 39 #include "iscsi_iser.h" 40 41 #define ISCSI_ISER_MAX_CONN 8 42 #define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) 43 #define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) 44 45 static void iser_cq_tasklet_fn(unsigned long data); 46 static void iser_cq_callback(struct ib_cq *cq, void *cq_context); 47 48 static void iser_cq_event_callback(struct ib_event *cause, void *context) 49 { 50 iser_err("got cq event %d \n", cause->event); 51 } 52 53 static void iser_qp_event_callback(struct ib_event *cause, void *context) 54 { 55 iser_err("got qp event %d\n",cause->event); 56 } 57 58 static void iser_event_handler(struct ib_event_handler *handler, 59 struct ib_event *event) 60 { 61 iser_err("async event %d on device %s port %d\n", event->event, 62 event->device->name, event->element.port_num); 63 } 64 65 /** 66 * iser_create_device_ib_res - creates Protection Domain (PD), Completion 67 * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with 68 * the adapator. 69 * 70 * returns 0 on success, -1 on failure 71 */ 72 static int iser_create_device_ib_res(struct iser_device *device) 73 { 74 struct iser_cq_desc *cq_desc; 75 struct ib_device_attr *dev_attr = &device->dev_attr; 76 int ret, i, j; 77 78 ret = ib_query_device(device->ib_device, dev_attr); 79 if (ret) { 80 pr_warn("Query device failed for %s\n", device->ib_device->name); 81 return ret; 82 } 83 84 /* Assign function handles - based on FMR support */ 85 if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && 86 device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { 87 iser_info("FMR supported, using FMR for registration\n"); 88 device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; 89 device->iser_free_rdma_reg_res = iser_free_fmr_pool; 90 device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; 91 device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; 92 } else 93 if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { 94 iser_info("FastReg supported, using FastReg for registration\n"); 95 device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool; 96 device->iser_free_rdma_reg_res = iser_free_fastreg_pool; 97 device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg; 98 device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg; 99 } else { 100 iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); 101 return -1; 102 } 103 104 device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); 105 iser_info("using %d CQs, device %s supports %d vectors\n", 106 device->cqs_used, device->ib_device->name, 107 device->ib_device->num_comp_vectors); 108 109 device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used, 110 GFP_KERNEL); 111 if (device->cq_desc == NULL) 112 goto cq_desc_err; 113 cq_desc = device->cq_desc; 114 115 device->pd = ib_alloc_pd(device->ib_device); 116 if (IS_ERR(device->pd)) 117 goto pd_err; 118 119 for (i = 0; i < device->cqs_used; i++) { 120 cq_desc[i].device = device; 121 cq_desc[i].cq_index = i; 122 123 device->rx_cq[i] = ib_create_cq(device->ib_device, 124 iser_cq_callback, 125 iser_cq_event_callback, 126 (void *)&cq_desc[i], 127 ISER_MAX_RX_CQ_LEN, i); 128 if (IS_ERR(device->rx_cq[i])) 129 goto cq_err; 130 131 device->tx_cq[i] = ib_create_cq(device->ib_device, 132 NULL, iser_cq_event_callback, 133 (void *)&cq_desc[i], 134 ISER_MAX_TX_CQ_LEN, i); 135 136 if (IS_ERR(device->tx_cq[i])) 137 goto cq_err; 138 139 if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP)) 140 goto cq_err; 141 142 tasklet_init(&device->cq_tasklet[i], 143 iser_cq_tasklet_fn, 144 (unsigned long)&cq_desc[i]); 145 } 146 147 device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE | 148 IB_ACCESS_REMOTE_WRITE | 149 IB_ACCESS_REMOTE_READ); 150 if (IS_ERR(device->mr)) 151 goto dma_mr_err; 152 153 INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, 154 iser_event_handler); 155 if (ib_register_event_handler(&device->event_handler)) 156 goto handler_err; 157 158 return 0; 159 160 handler_err: 161 ib_dereg_mr(device->mr); 162 dma_mr_err: 163 for (j = 0; j < device->cqs_used; j++) 164 tasklet_kill(&device->cq_tasklet[j]); 165 cq_err: 166 for (j = 0; j < i; j++) { 167 if (device->tx_cq[j]) 168 ib_destroy_cq(device->tx_cq[j]); 169 if (device->rx_cq[j]) 170 ib_destroy_cq(device->rx_cq[j]); 171 } 172 ib_dealloc_pd(device->pd); 173 pd_err: 174 kfree(device->cq_desc); 175 cq_desc_err: 176 iser_err("failed to allocate an IB resource\n"); 177 return -1; 178 } 179 180 /** 181 * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, 182 * CQ and PD created with the device associated with the adapator. 183 */ 184 static void iser_free_device_ib_res(struct iser_device *device) 185 { 186 int i; 187 BUG_ON(device->mr == NULL); 188 189 for (i = 0; i < device->cqs_used; i++) { 190 tasklet_kill(&device->cq_tasklet[i]); 191 (void)ib_destroy_cq(device->tx_cq[i]); 192 (void)ib_destroy_cq(device->rx_cq[i]); 193 device->tx_cq[i] = NULL; 194 device->rx_cq[i] = NULL; 195 } 196 197 (void)ib_unregister_event_handler(&device->event_handler); 198 (void)ib_dereg_mr(device->mr); 199 (void)ib_dealloc_pd(device->pd); 200 201 kfree(device->cq_desc); 202 203 device->mr = NULL; 204 device->pd = NULL; 205 } 206 207 /** 208 * iser_create_fmr_pool - Creates FMR pool and page_vector 209 * 210 * returns 0 on success, or errno code on failure 211 */ 212 int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) 213 { 214 struct iser_device *device = ib_conn->device; 215 struct ib_fmr_pool_param params; 216 int ret = -ENOMEM; 217 218 ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + 219 (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), 220 GFP_KERNEL); 221 if (!ib_conn->fmr.page_vec) 222 return ret; 223 224 ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1); 225 226 params.page_shift = SHIFT_4K; 227 /* when the first/last SG element are not start/end * 228 * page aligned, the map whould be of N+1 pages */ 229 params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1; 230 /* make the pool size twice the max number of SCSI commands * 231 * the ML is expected to queue, watermark for unmap at 50% */ 232 params.pool_size = cmds_max * 2; 233 params.dirty_watermark = cmds_max; 234 params.cache = 0; 235 params.flush_function = NULL; 236 params.access = (IB_ACCESS_LOCAL_WRITE | 237 IB_ACCESS_REMOTE_WRITE | 238 IB_ACCESS_REMOTE_READ); 239 240 ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, ¶ms); 241 if (!IS_ERR(ib_conn->fmr.pool)) 242 return 0; 243 244 /* no FMR => no need for page_vec */ 245 kfree(ib_conn->fmr.page_vec); 246 ib_conn->fmr.page_vec = NULL; 247 248 ret = PTR_ERR(ib_conn->fmr.pool); 249 ib_conn->fmr.pool = NULL; 250 if (ret != -ENOSYS) { 251 iser_err("FMR allocation failed, err %d\n", ret); 252 return ret; 253 } else { 254 iser_warn("FMRs are not supported, using unaligned mode\n"); 255 return 0; 256 } 257 } 258 259 /** 260 * iser_free_fmr_pool - releases the FMR pool and page vec 261 */ 262 void iser_free_fmr_pool(struct iser_conn *ib_conn) 263 { 264 iser_info("freeing conn %p fmr pool %p\n", 265 ib_conn, ib_conn->fmr.pool); 266 267 if (ib_conn->fmr.pool != NULL) 268 ib_destroy_fmr_pool(ib_conn->fmr.pool); 269 270 ib_conn->fmr.pool = NULL; 271 272 kfree(ib_conn->fmr.page_vec); 273 ib_conn->fmr.page_vec = NULL; 274 } 275 276 static int 277 iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, 278 bool pi_enable, struct fast_reg_descriptor *desc) 279 { 280 int ret; 281 282 desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, 283 ISCSI_ISER_SG_TABLESIZE + 1); 284 if (IS_ERR(desc->data_frpl)) { 285 ret = PTR_ERR(desc->data_frpl); 286 iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", 287 ret); 288 return PTR_ERR(desc->data_frpl); 289 } 290 291 desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); 292 if (IS_ERR(desc->data_mr)) { 293 ret = PTR_ERR(desc->data_mr); 294 iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); 295 goto fast_reg_mr_failure; 296 } 297 desc->reg_indicators |= ISER_DATA_KEY_VALID; 298 299 if (pi_enable) { 300 struct ib_mr_init_attr mr_init_attr = {0}; 301 struct iser_pi_context *pi_ctx = NULL; 302 303 desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); 304 if (!desc->pi_ctx) { 305 iser_err("Failed to allocate pi context\n"); 306 ret = -ENOMEM; 307 goto pi_ctx_alloc_failure; 308 } 309 pi_ctx = desc->pi_ctx; 310 311 pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, 312 ISCSI_ISER_SG_TABLESIZE); 313 if (IS_ERR(pi_ctx->prot_frpl)) { 314 ret = PTR_ERR(pi_ctx->prot_frpl); 315 iser_err("Failed to allocate prot frpl ret=%d\n", 316 ret); 317 goto prot_frpl_failure; 318 } 319 320 pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, 321 ISCSI_ISER_SG_TABLESIZE + 1); 322 if (IS_ERR(pi_ctx->prot_mr)) { 323 ret = PTR_ERR(pi_ctx->prot_mr); 324 iser_err("Failed to allocate prot frmr ret=%d\n", 325 ret); 326 goto prot_mr_failure; 327 } 328 desc->reg_indicators |= ISER_PROT_KEY_VALID; 329 330 mr_init_attr.max_reg_descriptors = 2; 331 mr_init_attr.flags |= IB_MR_SIGNATURE_EN; 332 pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); 333 if (IS_ERR(pi_ctx->sig_mr)) { 334 ret = PTR_ERR(pi_ctx->sig_mr); 335 iser_err("Failed to allocate signature enabled mr err=%d\n", 336 ret); 337 goto sig_mr_failure; 338 } 339 desc->reg_indicators |= ISER_SIG_KEY_VALID; 340 } 341 desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; 342 343 iser_dbg("Create fr_desc %p page_list %p\n", 344 desc, desc->data_frpl->page_list); 345 346 return 0; 347 sig_mr_failure: 348 ib_dereg_mr(desc->pi_ctx->prot_mr); 349 prot_mr_failure: 350 ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); 351 prot_frpl_failure: 352 kfree(desc->pi_ctx); 353 pi_ctx_alloc_failure: 354 ib_dereg_mr(desc->data_mr); 355 fast_reg_mr_failure: 356 ib_free_fast_reg_page_list(desc->data_frpl); 357 358 return ret; 359 } 360 361 /** 362 * iser_create_fastreg_pool - Creates pool of fast_reg descriptors 363 * for fast registration work requests. 364 * returns 0 on success, or errno code on failure 365 */ 366 int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max) 367 { 368 struct iser_device *device = ib_conn->device; 369 struct fast_reg_descriptor *desc; 370 int i, ret; 371 372 INIT_LIST_HEAD(&ib_conn->fastreg.pool); 373 ib_conn->fastreg.pool_size = 0; 374 for (i = 0; i < cmds_max; i++) { 375 desc = kzalloc(sizeof(*desc), GFP_KERNEL); 376 if (!desc) { 377 iser_err("Failed to allocate a new fast_reg descriptor\n"); 378 ret = -ENOMEM; 379 goto err; 380 } 381 382 ret = iser_create_fastreg_desc(device->ib_device, device->pd, 383 ib_conn->pi_support, desc); 384 if (ret) { 385 iser_err("Failed to create fastreg descriptor err=%d\n", 386 ret); 387 kfree(desc); 388 goto err; 389 } 390 391 list_add_tail(&desc->list, &ib_conn->fastreg.pool); 392 ib_conn->fastreg.pool_size++; 393 } 394 395 return 0; 396 397 err: 398 iser_free_fastreg_pool(ib_conn); 399 return ret; 400 } 401 402 /** 403 * iser_free_fastreg_pool - releases the pool of fast_reg descriptors 404 */ 405 void iser_free_fastreg_pool(struct iser_conn *ib_conn) 406 { 407 struct fast_reg_descriptor *desc, *tmp; 408 int i = 0; 409 410 if (list_empty(&ib_conn->fastreg.pool)) 411 return; 412 413 iser_info("freeing conn %p fr pool\n", ib_conn); 414 415 list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { 416 list_del(&desc->list); 417 ib_free_fast_reg_page_list(desc->data_frpl); 418 ib_dereg_mr(desc->data_mr); 419 if (desc->pi_ctx) { 420 ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); 421 ib_dereg_mr(desc->pi_ctx->prot_mr); 422 ib_destroy_mr(desc->pi_ctx->sig_mr); 423 kfree(desc->pi_ctx); 424 } 425 kfree(desc); 426 ++i; 427 } 428 429 if (i < ib_conn->fastreg.pool_size) 430 iser_warn("pool still has %d regions registered\n", 431 ib_conn->fastreg.pool_size - i); 432 } 433 434 /** 435 * iser_create_ib_conn_res - Queue-Pair (QP) 436 * 437 * returns 0 on success, -1 on failure 438 */ 439 static int iser_create_ib_conn_res(struct iser_conn *ib_conn) 440 { 441 struct iser_device *device; 442 struct ib_qp_init_attr init_attr; 443 int ret = -ENOMEM; 444 int index, min_index = 0; 445 446 BUG_ON(ib_conn->device == NULL); 447 448 device = ib_conn->device; 449 450 memset(&init_attr, 0, sizeof init_attr); 451 452 mutex_lock(&ig.connlist_mutex); 453 /* select the CQ with the minimal number of usages */ 454 for (index = 0; index < device->cqs_used; index++) 455 if (device->cq_active_qps[index] < 456 device->cq_active_qps[min_index]) 457 min_index = index; 458 device->cq_active_qps[min_index]++; 459 mutex_unlock(&ig.connlist_mutex); 460 iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn); 461 462 init_attr.event_handler = iser_qp_event_callback; 463 init_attr.qp_context = (void *)ib_conn; 464 init_attr.send_cq = device->tx_cq[min_index]; 465 init_attr.recv_cq = device->rx_cq[min_index]; 466 init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; 467 init_attr.cap.max_send_sge = 2; 468 init_attr.cap.max_recv_sge = 1; 469 init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 470 init_attr.qp_type = IB_QPT_RC; 471 if (ib_conn->pi_support) { 472 init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS; 473 init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; 474 } else { 475 init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; 476 } 477 478 ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); 479 if (ret) 480 goto out_err; 481 482 ib_conn->qp = ib_conn->cma_id->qp; 483 iser_info("setting conn %p cma_id %p qp %p\n", 484 ib_conn, ib_conn->cma_id, 485 ib_conn->cma_id->qp); 486 return ret; 487 488 out_err: 489 iser_err("unable to alloc mem or create resource, err %d\n", ret); 490 return ret; 491 } 492 493 /** 494 * releases the QP object 495 */ 496 static void iser_free_ib_conn_res(struct iser_conn *ib_conn) 497 { 498 int cq_index; 499 BUG_ON(ib_conn == NULL); 500 501 iser_info("freeing conn %p cma_id %p qp %p\n", 502 ib_conn, ib_conn->cma_id, 503 ib_conn->qp); 504 505 /* qp is created only once both addr & route are resolved */ 506 507 if (ib_conn->qp != NULL) { 508 cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index; 509 ib_conn->device->cq_active_qps[cq_index]--; 510 511 rdma_destroy_qp(ib_conn->cma_id); 512 } 513 514 ib_conn->qp = NULL; 515 } 516 517 /** 518 * based on the resolved device node GUID see if there already allocated 519 * device for this device. If there's no such, create one. 520 */ 521 static 522 struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id) 523 { 524 struct iser_device *device; 525 526 mutex_lock(&ig.device_list_mutex); 527 528 list_for_each_entry(device, &ig.device_list, ig_list) 529 /* find if there's a match using the node GUID */ 530 if (device->ib_device->node_guid == cma_id->device->node_guid) 531 goto inc_refcnt; 532 533 device = kzalloc(sizeof *device, GFP_KERNEL); 534 if (device == NULL) 535 goto out; 536 537 /* assign this device to the device */ 538 device->ib_device = cma_id->device; 539 /* init the device and link it into ig device list */ 540 if (iser_create_device_ib_res(device)) { 541 kfree(device); 542 device = NULL; 543 goto out; 544 } 545 list_add(&device->ig_list, &ig.device_list); 546 547 inc_refcnt: 548 device->refcount++; 549 out: 550 mutex_unlock(&ig.device_list_mutex); 551 return device; 552 } 553 554 /* if there's no demand for this device, release it */ 555 static void iser_device_try_release(struct iser_device *device) 556 { 557 mutex_lock(&ig.device_list_mutex); 558 device->refcount--; 559 iser_info("device %p refcount %d\n", device, device->refcount); 560 if (!device->refcount) { 561 iser_free_device_ib_res(device); 562 list_del(&device->ig_list); 563 kfree(device); 564 } 565 mutex_unlock(&ig.device_list_mutex); 566 } 567 568 /** 569 * Called with state mutex held 570 **/ 571 static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, 572 enum iser_ib_conn_state comp, 573 enum iser_ib_conn_state exch) 574 { 575 int ret; 576 577 if ((ret = (ib_conn->state == comp))) 578 ib_conn->state = exch; 579 return ret; 580 } 581 582 void iser_release_work(struct work_struct *work) 583 { 584 struct iser_conn *ib_conn; 585 int rc; 586 587 ib_conn = container_of(work, struct iser_conn, release_work); 588 589 /* wait for .conn_stop callback */ 590 rc = wait_for_completion_timeout(&ib_conn->stop_completion, 30 * HZ); 591 WARN_ON(rc == 0); 592 593 /* wait for the qp`s post send and post receive buffers to empty */ 594 rc = wait_for_completion_timeout(&ib_conn->flush_completion, 30 * HZ); 595 WARN_ON(rc == 0); 596 597 ib_conn->state = ISER_CONN_DOWN; 598 599 mutex_lock(&ib_conn->state_mutex); 600 ib_conn->state = ISER_CONN_DOWN; 601 mutex_unlock(&ib_conn->state_mutex); 602 603 iser_conn_release(ib_conn); 604 } 605 606 /** 607 * Frees all conn objects and deallocs conn descriptor 608 */ 609 void iser_conn_release(struct iser_conn *ib_conn) 610 { 611 struct iser_device *device = ib_conn->device; 612 613 mutex_lock(&ig.connlist_mutex); 614 list_del(&ib_conn->conn_list); 615 mutex_unlock(&ig.connlist_mutex); 616 617 mutex_lock(&ib_conn->state_mutex); 618 BUG_ON(ib_conn->state != ISER_CONN_DOWN); 619 620 iser_free_rx_descriptors(ib_conn); 621 iser_free_ib_conn_res(ib_conn); 622 ib_conn->device = NULL; 623 /* on EVENT_ADDR_ERROR there's no device yet for this conn */ 624 if (device != NULL) 625 iser_device_try_release(device); 626 mutex_unlock(&ib_conn->state_mutex); 627 628 /* if cma handler context, the caller actually destroy the id */ 629 if (ib_conn->cma_id != NULL) { 630 rdma_destroy_id(ib_conn->cma_id); 631 ib_conn->cma_id = NULL; 632 } 633 kfree(ib_conn); 634 } 635 636 /** 637 * triggers start of the disconnect procedures and wait for them to be done 638 */ 639 void iser_conn_terminate(struct iser_conn *ib_conn) 640 { 641 int err = 0; 642 643 /* change the ib conn state only if the conn is UP, however always call 644 * rdma_disconnect since this is the only way to cause the CMA to change 645 * the QP state to ERROR 646 */ 647 648 iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING); 649 err = rdma_disconnect(ib_conn->cma_id); 650 if (err) 651 iser_err("Failed to disconnect, conn: 0x%p err %d\n", 652 ib_conn,err); 653 } 654 655 /** 656 * Called with state mutex held 657 **/ 658 static void iser_connect_error(struct rdma_cm_id *cma_id) 659 { 660 struct iser_conn *ib_conn; 661 662 ib_conn = (struct iser_conn *)cma_id->context; 663 ib_conn->state = ISER_CONN_DOWN; 664 } 665 666 /** 667 * Called with state mutex held 668 **/ 669 static void iser_addr_handler(struct rdma_cm_id *cma_id) 670 { 671 struct iser_device *device; 672 struct iser_conn *ib_conn; 673 int ret; 674 675 ib_conn = (struct iser_conn *)cma_id->context; 676 if (ib_conn->state != ISER_CONN_PENDING) 677 /* bailout */ 678 return; 679 680 device = iser_device_find_by_ib_device(cma_id); 681 if (!device) { 682 iser_err("device lookup/creation failed\n"); 683 iser_connect_error(cma_id); 684 return; 685 } 686 687 ib_conn->device = device; 688 689 /* connection T10-PI support */ 690 if (iser_pi_enable) { 691 if (!(device->dev_attr.device_cap_flags & 692 IB_DEVICE_SIGNATURE_HANDOVER)) { 693 iser_warn("T10-PI requested but not supported on %s, " 694 "continue without T10-PI\n", 695 ib_conn->device->ib_device->name); 696 ib_conn->pi_support = false; 697 } else { 698 ib_conn->pi_support = true; 699 } 700 } 701 702 ret = rdma_resolve_route(cma_id, 1000); 703 if (ret) { 704 iser_err("resolve route failed: %d\n", ret); 705 iser_connect_error(cma_id); 706 return; 707 } 708 } 709 710 /** 711 * Called with state mutex held 712 **/ 713 static void iser_route_handler(struct rdma_cm_id *cma_id) 714 { 715 struct rdma_conn_param conn_param; 716 int ret; 717 struct iser_cm_hdr req_hdr; 718 struct iser_conn *ib_conn = (struct iser_conn *)cma_id->context; 719 struct iser_device *device = ib_conn->device; 720 721 if (ib_conn->state != ISER_CONN_PENDING) 722 /* bailout */ 723 return; 724 725 ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); 726 if (ret) 727 goto failure; 728 729 memset(&conn_param, 0, sizeof conn_param); 730 conn_param.responder_resources = device->dev_attr.max_qp_rd_atom; 731 conn_param.initiator_depth = 1; 732 conn_param.retry_count = 7; 733 conn_param.rnr_retry_count = 6; 734 735 memset(&req_hdr, 0, sizeof(req_hdr)); 736 req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | 737 ISER_SEND_W_INV_NOT_SUPPORTED); 738 conn_param.private_data = (void *)&req_hdr; 739 conn_param.private_data_len = sizeof(struct iser_cm_hdr); 740 741 ret = rdma_connect(cma_id, &conn_param); 742 if (ret) { 743 iser_err("failure connecting: %d\n", ret); 744 goto failure; 745 } 746 747 return; 748 failure: 749 iser_connect_error(cma_id); 750 } 751 752 static void iser_connected_handler(struct rdma_cm_id *cma_id) 753 { 754 struct iser_conn *ib_conn; 755 struct ib_qp_attr attr; 756 struct ib_qp_init_attr init_attr; 757 758 ib_conn = (struct iser_conn *)cma_id->context; 759 if (ib_conn->state != ISER_CONN_PENDING) 760 /* bailout */ 761 return; 762 763 (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); 764 iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); 765 766 ib_conn->state = ISER_CONN_UP; 767 complete(&ib_conn->up_completion); 768 } 769 770 static void iser_disconnected_handler(struct rdma_cm_id *cma_id) 771 { 772 struct iser_conn *ib_conn; 773 774 ib_conn = (struct iser_conn *)cma_id->context; 775 776 /* getting here when the state is UP means that the conn is being * 777 * terminated asynchronously from the iSCSI layer's perspective. */ 778 if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, 779 ISER_CONN_TERMINATING)){ 780 if (ib_conn->iscsi_conn) 781 iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); 782 else 783 iser_err("iscsi_iser connection isn't bound\n"); 784 } 785 786 /* Complete the termination process if no posts are pending. This code 787 * block also exists in iser_handle_comp_error(), but it is needed here 788 * for cases of no flushes at all, e.g. discovery over rdma. 789 */ 790 if (ib_conn->post_recv_buf_count == 0 && 791 (atomic_read(&ib_conn->post_send_buf_count) == 0)) { 792 complete(&ib_conn->flush_completion); 793 } 794 } 795 796 static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) 797 { 798 struct iser_conn *ib_conn; 799 800 ib_conn = (struct iser_conn *)cma_id->context; 801 iser_info("event %d status %d conn %p id %p\n", 802 event->event, event->status, cma_id->context, cma_id); 803 804 mutex_lock(&ib_conn->state_mutex); 805 switch (event->event) { 806 case RDMA_CM_EVENT_ADDR_RESOLVED: 807 iser_addr_handler(cma_id); 808 break; 809 case RDMA_CM_EVENT_ROUTE_RESOLVED: 810 iser_route_handler(cma_id); 811 break; 812 case RDMA_CM_EVENT_ESTABLISHED: 813 iser_connected_handler(cma_id); 814 break; 815 case RDMA_CM_EVENT_ADDR_ERROR: 816 case RDMA_CM_EVENT_ROUTE_ERROR: 817 case RDMA_CM_EVENT_CONNECT_ERROR: 818 case RDMA_CM_EVENT_UNREACHABLE: 819 case RDMA_CM_EVENT_REJECTED: 820 iser_connect_error(cma_id); 821 break; 822 case RDMA_CM_EVENT_DISCONNECTED: 823 case RDMA_CM_EVENT_DEVICE_REMOVAL: 824 case RDMA_CM_EVENT_ADDR_CHANGE: 825 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 826 iser_disconnected_handler(cma_id); 827 break; 828 default: 829 iser_err("Unexpected RDMA CM event (%d)\n", event->event); 830 break; 831 } 832 mutex_unlock(&ib_conn->state_mutex); 833 return 0; 834 } 835 836 void iser_conn_init(struct iser_conn *ib_conn) 837 { 838 ib_conn->state = ISER_CONN_INIT; 839 ib_conn->post_recv_buf_count = 0; 840 atomic_set(&ib_conn->post_send_buf_count, 0); 841 init_completion(&ib_conn->stop_completion); 842 init_completion(&ib_conn->flush_completion); 843 init_completion(&ib_conn->up_completion); 844 INIT_LIST_HEAD(&ib_conn->conn_list); 845 spin_lock_init(&ib_conn->lock); 846 mutex_init(&ib_conn->state_mutex); 847 } 848 849 /** 850 * starts the process of connecting to the target 851 * sleeps until the connection is established or rejected 852 */ 853 int iser_connect(struct iser_conn *ib_conn, 854 struct sockaddr *src_addr, 855 struct sockaddr *dst_addr, 856 int non_blocking) 857 { 858 int err = 0; 859 860 mutex_lock(&ib_conn->state_mutex); 861 862 sprintf(ib_conn->name, "%pISp", dst_addr); 863 864 iser_info("connecting to: %s\n", ib_conn->name); 865 866 /* the device is known only --after-- address resolution */ 867 ib_conn->device = NULL; 868 869 ib_conn->state = ISER_CONN_PENDING; 870 871 ib_conn->cma_id = rdma_create_id(iser_cma_handler, 872 (void *)ib_conn, 873 RDMA_PS_TCP, IB_QPT_RC); 874 if (IS_ERR(ib_conn->cma_id)) { 875 err = PTR_ERR(ib_conn->cma_id); 876 iser_err("rdma_create_id failed: %d\n", err); 877 goto id_failure; 878 } 879 880 err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000); 881 if (err) { 882 iser_err("rdma_resolve_addr failed: %d\n", err); 883 goto addr_failure; 884 } 885 886 if (!non_blocking) { 887 wait_for_completion_interruptible(&ib_conn->up_completion); 888 889 if (ib_conn->state != ISER_CONN_UP) { 890 err = -EIO; 891 goto connect_failure; 892 } 893 } 894 mutex_unlock(&ib_conn->state_mutex); 895 896 mutex_lock(&ig.connlist_mutex); 897 list_add(&ib_conn->conn_list, &ig.connlist); 898 mutex_unlock(&ig.connlist_mutex); 899 return 0; 900 901 id_failure: 902 ib_conn->cma_id = NULL; 903 addr_failure: 904 ib_conn->state = ISER_CONN_DOWN; 905 connect_failure: 906 mutex_unlock(&ib_conn->state_mutex); 907 iser_conn_release(ib_conn); 908 return err; 909 } 910 911 /** 912 * iser_reg_page_vec - Register physical memory 913 * 914 * returns: 0 on success, errno code on failure 915 */ 916 int iser_reg_page_vec(struct iser_conn *ib_conn, 917 struct iser_page_vec *page_vec, 918 struct iser_mem_reg *mem_reg) 919 { 920 struct ib_pool_fmr *mem; 921 u64 io_addr; 922 u64 *page_list; 923 int status; 924 925 page_list = page_vec->pages; 926 io_addr = page_list[0]; 927 928 mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool, 929 page_list, 930 page_vec->length, 931 io_addr); 932 933 if (IS_ERR(mem)) { 934 status = (int)PTR_ERR(mem); 935 iser_err("ib_fmr_pool_map_phys failed: %d\n", status); 936 return status; 937 } 938 939 mem_reg->lkey = mem->fmr->lkey; 940 mem_reg->rkey = mem->fmr->rkey; 941 mem_reg->len = page_vec->length * SIZE_4K; 942 mem_reg->va = io_addr; 943 mem_reg->is_mr = 1; 944 mem_reg->mem_h = (void *)mem; 945 946 mem_reg->va += page_vec->offset; 947 mem_reg->len = page_vec->data_size; 948 949 iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, " 950 "entry[0]: (0x%08lx,%ld)] -> " 951 "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n", 952 page_vec, page_vec->length, 953 (unsigned long)page_vec->pages[0], 954 (unsigned long)page_vec->data_size, 955 (unsigned int)mem_reg->lkey, mem_reg->mem_h, 956 (unsigned long)mem_reg->va, (unsigned long)mem_reg->len); 957 return 0; 958 } 959 960 /** 961 * Unregister (previosuly registered using FMR) memory. 962 * If memory is non-FMR does nothing. 963 */ 964 void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, 965 enum iser_data_dir cmd_dir) 966 { 967 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; 968 int ret; 969 970 if (!reg->is_mr) 971 return; 972 973 iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); 974 975 ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); 976 if (ret) 977 iser_err("ib_fmr_pool_unmap failed %d\n", ret); 978 979 reg->mem_h = NULL; 980 } 981 982 void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, 983 enum iser_data_dir cmd_dir) 984 { 985 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; 986 struct iser_conn *ib_conn = iser_task->ib_conn; 987 struct fast_reg_descriptor *desc = reg->mem_h; 988 989 if (!reg->is_mr) 990 return; 991 992 reg->mem_h = NULL; 993 reg->is_mr = 0; 994 spin_lock_bh(&ib_conn->lock); 995 list_add_tail(&desc->list, &ib_conn->fastreg.pool); 996 spin_unlock_bh(&ib_conn->lock); 997 } 998 999 int iser_post_recvl(struct iser_conn *ib_conn) 1000 { 1001 struct ib_recv_wr rx_wr, *rx_wr_failed; 1002 struct ib_sge sge; 1003 int ib_ret; 1004 1005 sge.addr = ib_conn->login_resp_dma; 1006 sge.length = ISER_RX_LOGIN_SIZE; 1007 sge.lkey = ib_conn->device->mr->lkey; 1008 1009 rx_wr.wr_id = (unsigned long)ib_conn->login_resp_buf; 1010 rx_wr.sg_list = &sge; 1011 rx_wr.num_sge = 1; 1012 rx_wr.next = NULL; 1013 1014 ib_conn->post_recv_buf_count++; 1015 ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); 1016 if (ib_ret) { 1017 iser_err("ib_post_recv failed ret=%d\n", ib_ret); 1018 ib_conn->post_recv_buf_count--; 1019 } 1020 return ib_ret; 1021 } 1022 1023 int iser_post_recvm(struct iser_conn *ib_conn, int count) 1024 { 1025 struct ib_recv_wr *rx_wr, *rx_wr_failed; 1026 int i, ib_ret; 1027 unsigned int my_rx_head = ib_conn->rx_desc_head; 1028 struct iser_rx_desc *rx_desc; 1029 1030 for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { 1031 rx_desc = &ib_conn->rx_descs[my_rx_head]; 1032 rx_wr->wr_id = (unsigned long)rx_desc; 1033 rx_wr->sg_list = &rx_desc->rx_sg; 1034 rx_wr->num_sge = 1; 1035 rx_wr->next = rx_wr + 1; 1036 my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask; 1037 } 1038 1039 rx_wr--; 1040 rx_wr->next = NULL; /* mark end of work requests list */ 1041 1042 ib_conn->post_recv_buf_count += count; 1043 ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed); 1044 if (ib_ret) { 1045 iser_err("ib_post_recv failed ret=%d\n", ib_ret); 1046 ib_conn->post_recv_buf_count -= count; 1047 } else 1048 ib_conn->rx_desc_head = my_rx_head; 1049 return ib_ret; 1050 } 1051 1052 1053 /** 1054 * iser_start_send - Initiate a Send DTO operation 1055 * 1056 * returns 0 on success, -1 on failure 1057 */ 1058 int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc) 1059 { 1060 int ib_ret; 1061 struct ib_send_wr send_wr, *send_wr_failed; 1062 1063 ib_dma_sync_single_for_device(ib_conn->device->ib_device, 1064 tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); 1065 1066 send_wr.next = NULL; 1067 send_wr.wr_id = (unsigned long)tx_desc; 1068 send_wr.sg_list = tx_desc->tx_sg; 1069 send_wr.num_sge = tx_desc->num_sge; 1070 send_wr.opcode = IB_WR_SEND; 1071 send_wr.send_flags = IB_SEND_SIGNALED; 1072 1073 atomic_inc(&ib_conn->post_send_buf_count); 1074 1075 ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); 1076 if (ib_ret) { 1077 iser_err("ib_post_send failed, ret:%d\n", ib_ret); 1078 atomic_dec(&ib_conn->post_send_buf_count); 1079 } 1080 return ib_ret; 1081 } 1082 1083 static void iser_handle_comp_error(struct iser_tx_desc *desc, 1084 struct iser_conn *ib_conn) 1085 { 1086 if (desc && desc->type == ISCSI_TX_DATAOUT) 1087 kmem_cache_free(ig.desc_cache, desc); 1088 1089 if (ib_conn->post_recv_buf_count == 0 && 1090 atomic_read(&ib_conn->post_send_buf_count) == 0) { 1091 /** 1092 * getting here when the state is UP means that the conn is 1093 * being terminated asynchronously from the iSCSI layer's 1094 * perspective. It is safe to peek at the connection state 1095 * since iscsi_conn_failure is allowed to be called twice. 1096 **/ 1097 if (ib_conn->state == ISER_CONN_UP) 1098 iscsi_conn_failure(ib_conn->iscsi_conn, 1099 ISCSI_ERR_CONN_FAILED); 1100 1101 /* no more non completed posts to the QP, complete the 1102 * termination process w.o worrying on disconnect event */ 1103 complete(&ib_conn->flush_completion); 1104 } 1105 } 1106 1107 static int iser_drain_tx_cq(struct iser_device *device, int cq_index) 1108 { 1109 struct ib_cq *cq = device->tx_cq[cq_index]; 1110 struct ib_wc wc; 1111 struct iser_tx_desc *tx_desc; 1112 struct iser_conn *ib_conn; 1113 int completed_tx = 0; 1114 1115 while (ib_poll_cq(cq, 1, &wc) == 1) { 1116 tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id; 1117 ib_conn = wc.qp->qp_context; 1118 if (wc.status == IB_WC_SUCCESS) { 1119 if (wc.opcode == IB_WC_SEND) 1120 iser_snd_completion(tx_desc, ib_conn); 1121 else 1122 iser_err("expected opcode %d got %d\n", 1123 IB_WC_SEND, wc.opcode); 1124 } else { 1125 iser_err("tx id %llx status %d vend_err %x\n", 1126 wc.wr_id, wc.status, wc.vendor_err); 1127 if (wc.wr_id != ISER_FASTREG_LI_WRID) { 1128 atomic_dec(&ib_conn->post_send_buf_count); 1129 iser_handle_comp_error(tx_desc, ib_conn); 1130 } 1131 } 1132 completed_tx++; 1133 } 1134 return completed_tx; 1135 } 1136 1137 1138 static void iser_cq_tasklet_fn(unsigned long data) 1139 { 1140 struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)data; 1141 struct iser_device *device = cq_desc->device; 1142 int cq_index = cq_desc->cq_index; 1143 struct ib_cq *cq = device->rx_cq[cq_index]; 1144 struct ib_wc wc; 1145 struct iser_rx_desc *desc; 1146 unsigned long xfer_len; 1147 struct iser_conn *ib_conn; 1148 int completed_tx, completed_rx = 0; 1149 1150 /* First do tx drain, so in a case where we have rx flushes and a successful 1151 * tx completion we will still go through completion error handling. 1152 */ 1153 completed_tx = iser_drain_tx_cq(device, cq_index); 1154 1155 while (ib_poll_cq(cq, 1, &wc) == 1) { 1156 desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id; 1157 BUG_ON(desc == NULL); 1158 ib_conn = wc.qp->qp_context; 1159 if (wc.status == IB_WC_SUCCESS) { 1160 if (wc.opcode == IB_WC_RECV) { 1161 xfer_len = (unsigned long)wc.byte_len; 1162 iser_rcv_completion(desc, xfer_len, ib_conn); 1163 } else 1164 iser_err("expected opcode %d got %d\n", 1165 IB_WC_RECV, wc.opcode); 1166 } else { 1167 if (wc.status != IB_WC_WR_FLUSH_ERR) 1168 iser_err("rx id %llx status %d vend_err %x\n", 1169 wc.wr_id, wc.status, wc.vendor_err); 1170 ib_conn->post_recv_buf_count--; 1171 iser_handle_comp_error(NULL, ib_conn); 1172 } 1173 completed_rx++; 1174 if (!(completed_rx & 63)) 1175 completed_tx += iser_drain_tx_cq(device, cq_index); 1176 } 1177 /* #warning "it is assumed here that arming CQ only once its empty" * 1178 * " would not cause interrupts to be missed" */ 1179 ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); 1180 1181 iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx); 1182 } 1183 1184 static void iser_cq_callback(struct ib_cq *cq, void *cq_context) 1185 { 1186 struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)cq_context; 1187 struct iser_device *device = cq_desc->device; 1188 int cq_index = cq_desc->cq_index; 1189 1190 tasklet_schedule(&device->cq_tasklet[cq_index]); 1191 } 1192 1193 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, 1194 enum iser_data_dir cmd_dir, sector_t *sector) 1195 { 1196 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; 1197 struct fast_reg_descriptor *desc = reg->mem_h; 1198 unsigned long sector_size = iser_task->sc->device->sector_size; 1199 struct ib_mr_status mr_status; 1200 int ret; 1201 1202 if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { 1203 desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; 1204 ret = ib_check_mr_status(desc->pi_ctx->sig_mr, 1205 IB_MR_CHECK_SIG_STATUS, &mr_status); 1206 if (ret) { 1207 pr_err("ib_check_mr_status failed, ret %d\n", ret); 1208 goto err; 1209 } 1210 1211 if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { 1212 sector_t sector_off = mr_status.sig_err.sig_err_offset; 1213 1214 do_div(sector_off, sector_size + 8); 1215 *sector = scsi_get_lba(iser_task->sc) + sector_off; 1216 1217 pr_err("PI error found type %d at sector %llx " 1218 "expected %x vs actual %x\n", 1219 mr_status.sig_err.err_type, 1220 (unsigned long long)*sector, 1221 mr_status.sig_err.expected, 1222 mr_status.sig_err.actual); 1223 1224 switch (mr_status.sig_err.err_type) { 1225 case IB_SIG_BAD_GUARD: 1226 return 0x1; 1227 case IB_SIG_BAD_REFTAG: 1228 return 0x3; 1229 case IB_SIG_BAD_APPTAG: 1230 return 0x2; 1231 } 1232 } 1233 } 1234 1235 return 0; 1236 err: 1237 /* Not alot we can do here, return ambiguous guard error */ 1238 return 0x1; 1239 } 1240