/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2013 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>

#include "iscsi_iser.h"

#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */

/**
 * iser_start_rdma_unaligned_sg - copy an RDMA-unaligned scatterlist into a
 * single contiguous bounce buffer and DMA-map it for the given direction.
 */
static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
					enum iser_data_dir cmd_dir)
{
	int dma_nents;
	struct ib_device *dev;
	char *mem = NULL;
	struct iser_data_buf *data = &iser_task->data[cmd_dir];
	unsigned long cmd_data_len = data->data_len;

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		mem = (void *)__get_free_pages(GFP_ATOMIC,
		      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		mem = kmalloc(cmd_data_len, GFP_ATOMIC);

	if (mem == NULL) {
		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
			 data->size, (int)cmd_data_len);
		return -ENOMEM;
	}

	if (cmd_dir == ISER_DIR_OUT) {
		/* copy the unaligned sg into the buffer which is used for RDMA */
		struct scatterlist *sgl = (struct scatterlist *)data->buf;
		struct scatterlist *sg;
		int i;
		char *p, *from;

		p = mem;
		for_each_sg(sgl, sg, data->size, i) {
			from = kmap_atomic(sg_page(sg));
			memcpy(p,
			       from + sg->offset,
			       sg->length);
			kunmap_atomic(from);
			p += sg->length;
		}
	}

	sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len);
	iser_task->data_copy[cmd_dir].buf =
		&iser_task->data_copy[cmd_dir].sg_single;
	iser_task->data_copy[cmd_dir].size = 1;

	iser_task->data_copy[cmd_dir].copy_buf = mem;

	dev = iser_task->iser_conn->ib_conn->device->ib_device;
	dma_nents = ib_dma_map_sg(dev,
				  &iser_task->data_copy[cmd_dir].sg_single,
				  1,
				  (cmd_dir == ISER_DIR_OUT) ?
				  DMA_TO_DEVICE : DMA_FROM_DEVICE);
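	/*
	 * The copy buffer is a single physically contiguous allocation,
	 * so its one-entry scatterlist maps to a single DMA segment; a
	 * zero return from ib_dma_map_sg() means the mapping failed
	 * outright and is treated as a bug below.
	 */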
	BUG_ON(dma_nents == 0);

	iser_task->data_copy[cmd_dir].dma_nents = dma_nents;
	return 0;
}

/**
 * iser_finalize_rdma_unaligned_sg - unmap the bounce buffer, copy received
 * data back to the original unaligned scatterlist (reads) and free the buffer.
 */
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
				     enum iser_data_dir cmd_dir)
{
	struct ib_device *dev;
	struct iser_data_buf *mem_copy;
	unsigned long cmd_data_len;

	dev = iser_task->iser_conn->ib_conn->device->ib_device;
	mem_copy = &iser_task->data_copy[cmd_dir];

	ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1,
			(cmd_dir == ISER_DIR_OUT) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE);

	if (cmd_dir == ISER_DIR_IN) {
		char *mem;
		struct scatterlist *sgl, *sg;
		unsigned char *p, *to;
		unsigned int sg_size;
		int i;

		/* copy back read RDMA to unaligned sg */
		mem = mem_copy->copy_buf;

		sgl = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf;
		sg_size = iser_task->data[ISER_DIR_IN].size;

		p = mem;
		for_each_sg(sgl, sg, sg_size, i) {
			to = kmap_atomic(sg_page(sg));
			memcpy(to + sg->offset,
			       p,
			       sg->length);
			kunmap_atomic(to);
			p += sg->length;
		}
	}

	cmd_data_len = iser_task->data[cmd_dir].data_len;

	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
		free_pages((unsigned long)mem_copy->copy_buf,
			   ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
	else
		kfree(mem_copy->copy_buf);

	mem_copy->copy_buf = NULL;
}

#define IS_4K_ALIGNED(addr)	((((unsigned long)addr) & ~MASK_4K) == 0)
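
/*
 * Example of the compaction done by iser_sg_to_page_vec() below (4K pages
 * assumed): mapped SG entries [0x10000, len 0x2000] and [0x12000, len 0x1000]
 * form one contiguous chunk and yield the page vector
 * {0x10000, 0x11000, 0x12000}, with data_size 0x3000 and a zero first-page
 * offset when the first entry starts on a page boundary.
 */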

/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of the resulting physical address array (may be
 * less than the original due to possible compaction).
 *
 * We build a "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements.  Other than the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * dma addresses of physical pages.  The code also supports the case where
 * several fragments of the same page appear in the SG as consecutive
 * elements, as well as a single-entry SG.
 */
static int iser_sg_to_page_vec(struct iser_data_buf *data,
			       struct ib_device *ibdev, u64 *pages,
			       int *offset, int *data_size)
{
	struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
	u64 start_addr, end_addr, page, chunk_start = 0;
	unsigned long total_sz = 0;
	unsigned int dma_len;
	int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;

	/* compute the offset of first element */
	*offset = (u64) sgl[0].offset & ~MASK_4K;

	new_chunk = 1;
	cur_page = 0;
	for_each_sg(sgl, sg, data->dma_nents, i) {
		start_addr = ib_sg_dma_address(ibdev, sg);
		if (new_chunk)
			chunk_start = start_addr;
		dma_len = ib_sg_dma_len(ibdev, sg);
		end_addr = start_addr + dma_len;
		total_sz += dma_len;

		/* collect page fragments until aligned or end of SG list */
		if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
			new_chunk = 0;
			continue;
		}
		new_chunk = 1;

		/* address of the first page in the contiguous chunk;
		   masking relevant for the very first SG entry,
		   which might be unaligned */
		page = chunk_start & MASK_4K;
		do {
			pages[cur_page++] = page;
			page += SIZE_4K;
		} while (page < end_addr);
	}

	*data_size = total_sz;
	iser_dbg("page_vec->data_size:%d cur_page %d\n",
		 *data_size, cur_page);
	return cur_page;
}
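
/*
 * Example for the alignment check below (4K pages assumed): mapped entries
 * [0x10000, len 0x1000], [0x11000, len 0x1000] and [0x12000, len 0x800] all
 * count as aligned, since every interior boundary falls on a 4K boundary
 * (an unaligned start of the first entry or end of the last one is
 * tolerated).  If the first entry were only 0x800 bytes long and the next
 * entry did not continue it at 0x10800, the unaligned interior boundary
 * would terminate the aligned prefix immediately.
 */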

/**
 * iser_data_buf_aligned_len - Tries to determine the maximal sub-list of a
 * scatter-gather list of memory buffers that is correctly aligned for RDMA,
 * and returns the number of entries which are aligned correctly.  Supports
 * the case where consecutive SG elements are actually fragments of the same
 * physical page.
 */
static int iser_data_buf_aligned_len(struct iser_data_buf *data,
				     struct ib_device *ibdev)
{
	struct scatterlist *sgl, *sg, *next_sg = NULL;
	u64 start_addr, end_addr;
	int i, ret_len, start_check = 0;

	if (data->dma_nents == 1)
		return 1;

	sgl = (struct scatterlist *)data->buf;
	start_addr = ib_sg_dma_address(ibdev, sgl);

	for_each_sg(sgl, sg, data->dma_nents, i) {
		if (start_check && !IS_4K_ALIGNED(start_addr))
			break;

		next_sg = sg_next(sg);
		if (!next_sg)
			break;

		end_addr = start_addr + ib_sg_dma_len(ibdev, sg);
		start_addr = ib_sg_dma_address(ibdev, next_sg);

		if (end_addr == start_addr) {
			start_check = 0;
			continue;
		} else
			start_check = 1;

		if (!IS_4K_ALIGNED(end_addr))
			break;
	}
	ret_len = (next_sg) ? i : i+1;
	iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
		 ret_len, data->dma_nents, data);
	return ret_len;
}

static void iser_data_buf_dump(struct iser_data_buf *data,
			       struct ib_device *ibdev)
{
	struct scatterlist *sgl = (struct scatterlist *)data->buf;
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, data->dma_nents, i)
		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
			 "off:0x%x sz:0x%x dma_len:0x%x\n",
			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),
			 sg_page(sg), sg->offset,
			 sg->length, ib_sg_dma_len(ibdev, sg));
}

static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
	int i;

	iser_err("page vec length %d data size %d\n",
		 page_vec->length, page_vec->data_size);
	for (i = 0; i < page_vec->length; i++)
		iser_err("%d %lx\n", i, (unsigned long)page_vec->pages[i]);
}

static void iser_page_vec_build(struct iser_data_buf *data,
				struct iser_page_vec *page_vec,
				struct ib_device *ibdev)
{
	int page_vec_len = 0;

	page_vec->length = 0;
	page_vec->offset = 0;

	iser_dbg("Translating sg sz: %d\n", data->dma_nents);
	page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages,
					   &page_vec->offset,
					   &page_vec->data_size);
	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);

	page_vec->length = page_vec_len;

	if (page_vec_len * SIZE_4K < page_vec->data_size) {
		iser_err("page_vec too short to hold this SG\n");
		iser_data_buf_dump(data, ibdev);
		iser_dump_page_vec(page_vec);
		BUG();
	}
}

int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
			   struct iser_data_buf *data,
			   enum iser_data_dir iser_dir,
			   enum dma_data_direction dma_dir)
{
	struct ib_device *dev;

	iser_task->dir[iser_dir] = 1;
	dev = iser_task->iser_conn->ib_conn->device->ib_device;

	data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);
	if (data->dma_nents == 0) {
		iser_err("dma_map_sg failed!!!\n");
		return -EINVAL;
	}
	return 0;
}

void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task)
{
	struct ib_device *dev;
	struct iser_data_buf *data;

	dev = iser_task->iser_conn->ib_conn->device->ib_device;

	if (iser_task->dir[ISER_DIR_IN]) {
		data = &iser_task->data[ISER_DIR_IN];
		ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE);
	}

	if (iser_task->dir[ISER_DIR_OUT]) {
		data = &iser_task->data[ISER_DIR_OUT];
		ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE);
	}
}

static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
			      struct ib_device *ibdev,
			      enum iser_data_dir cmd_dir,
			      int aligned_len)
{
	struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn;
	struct iser_data_buf *mem = &iser_task->data[cmd_dir];

	iscsi_conn->fmr_unalign_cnt++;
	iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
		  aligned_len, mem->size);

	if (iser_debug_level > 0)
		iser_data_buf_dump(mem, ibdev);

	/* unmap the command data before accessing it */
	iser_dma_unmap_task_data(iser_task);

	/* allocate the copy buffer; if we are writing, copy the unaligned
	 * scatterlist into it, then DMA-map the copy */
	if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0)
		return -ENOMEM;

	return 0;
}
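
/*
 * Both registration paths below (FMR and fast registration work requests)
 * follow the same pattern: fall back to the bounce buffer if the SG list is
 * not RDMA-aligned, use the device-wide MR (device->mr) when the mapping
 * collapsed to a single DMA entry, and otherwise register the compacted page
 * list to obtain the rkey/va pair used for the RDMA operation.
 */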

/**
 * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
 * using FMR (if possible) obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
			  enum iser_data_dir cmd_dir)
{
	struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;
	struct ib_device *ibdev = device->ib_device;
	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
	struct iser_regd_buf *regd_buf;
	int aligned_len;
	int err;
	int i;
	struct scatterlist *sg;

	regd_buf = &iser_task->rdma_regd[cmd_dir];

	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
	if (aligned_len != mem->dma_nents) {
		err = fall_to_bounce_buf(iser_task, ibdev,
					 cmd_dir, aligned_len);
		if (err) {
			iser_err("failed to allocate bounce buffer\n");
			return err;
		}
		mem = &iser_task->data_copy[cmd_dir];
	}

	/* if there is a single dma entry, FMR is not needed */
	if (mem->dma_nents == 1) {
		sg = (struct scatterlist *)mem->buf;

		regd_buf->reg.lkey = device->mr->lkey;
		regd_buf->reg.rkey = device->mr->rkey;
		regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]);
		regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]);
		regd_buf->reg.is_mr = 0;

		iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X "
			 "va: 0x%08lX sz: %ld]\n",
			 (unsigned int)regd_buf->reg.lkey,
			 (unsigned int)regd_buf->reg.rkey,
			 (unsigned long)regd_buf->reg.va,
			 (unsigned long)regd_buf->reg.len);
	} else { /* use FMR for multiple dma entries */
		iser_page_vec_build(mem, ib_conn->fastreg.fmr.page_vec, ibdev);
		err = iser_reg_page_vec(ib_conn, ib_conn->fastreg.fmr.page_vec,
					&regd_buf->reg);
		if (err && err != -EAGAIN) {
			iser_data_buf_dump(mem, ibdev);
			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
				 mem->dma_nents,
				 ntoh24(iser_task->desc.iscsi_header.dlength));
			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
				 ib_conn->fastreg.fmr.page_vec->data_size,
				 ib_conn->fastreg.fmr.page_vec->length,
				 ib_conn->fastreg.fmr.page_vec->offset);
			for (i = 0; i < ib_conn->fastreg.fmr.page_vec->length; i++)
				iser_err("page_vec[%d] = 0x%llx\n", i,
					 (unsigned long long)ib_conn->fastreg.fmr.page_vec->pages[i]);
		}
		if (err)
			return err;
	}
	return 0;
}
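
/*
 * iser_fast_reg_mr - post a fast registration work request (preceded by a
 * local invalidate of the previous rkey when the descriptor is not valid)
 * that maps page_list_len entries of desc->data_frpl onto desc->data_mr,
 * and record the resulting lkey/rkey/va/len in regd_buf.
 */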
static int iser_fast_reg_mr(struct fast_reg_descriptor *desc,
			    struct iser_conn *ib_conn,
			    struct iser_regd_buf *regd_buf,
			    u32 offset, unsigned int data_size,
			    unsigned int page_list_len)
{
	struct ib_send_wr fastreg_wr, inv_wr;
	struct ib_send_wr *bad_wr, *wr = NULL;
	u8 key;
	int ret;

	if (!desc->valid) {
		memset(&inv_wr, 0, sizeof(inv_wr));
		inv_wr.opcode = IB_WR_LOCAL_INV;
		inv_wr.send_flags = IB_SEND_SIGNALED;
		inv_wr.ex.invalidate_rkey = desc->data_mr->rkey;
		wr = &inv_wr;
		/* Bump the key */
		key = (u8)(desc->data_mr->rkey & 0x000000FF);
		ib_update_fast_reg_key(desc->data_mr, ++key);
	}

	/* Prepare FASTREG WR */
	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
	fastreg_wr.send_flags = IB_SEND_SIGNALED;
	fastreg_wr.wr.fast_reg.iova_start = desc->data_frpl->page_list[0] + offset;
	fastreg_wr.wr.fast_reg.page_list = desc->data_frpl;
	fastreg_wr.wr.fast_reg.page_list_len = page_list_len;
	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
	fastreg_wr.wr.fast_reg.length = data_size;
	fastreg_wr.wr.fast_reg.rkey = desc->data_mr->rkey;
	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
					       IB_ACCESS_REMOTE_WRITE |
					       IB_ACCESS_REMOTE_READ);

	if (!wr) {
		wr = &fastreg_wr;
		atomic_inc(&ib_conn->post_send_buf_count);
	} else {
		wr->next = &fastreg_wr;
		atomic_add(2, &ib_conn->post_send_buf_count);
	}

	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
	if (ret) {
		if (bad_wr->next)
			atomic_sub(2, &ib_conn->post_send_buf_count);
		else
			atomic_dec(&ib_conn->post_send_buf_count);
		iser_err("fast registration failed, ret:%d\n", ret);
		return ret;
	}
	desc->valid = false;

	regd_buf->reg.mem_h = desc;
	regd_buf->reg.lkey = desc->data_mr->lkey;
	regd_buf->reg.rkey = desc->data_mr->rkey;
	regd_buf->reg.va = desc->data_frpl->page_list[0] + offset;
	regd_buf->reg.len = data_size;
	regd_buf->reg.is_mr = 1;

	return ret;
}

/**
 * iser_reg_rdma_mem_frwr - Registers memory intended for RDMA,
 * using Fast Registration WR (if possible) obtaining rkey and va
 *
 * returns 0 on success, errno code on failure
 */
int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *iser_task,
			   enum iser_data_dir cmd_dir)
{
	struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;
	struct ib_device *ibdev = device->ib_device;
	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
	struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir];
	struct fast_reg_descriptor *desc;
	unsigned int data_size, page_list_len;
	int err, aligned_len;
	unsigned long flags;
	u32 offset;

	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
	if (aligned_len != mem->dma_nents) {
		err = fall_to_bounce_buf(iser_task, ibdev,
					 cmd_dir, aligned_len);
		if (err) {
			iser_err("failed to allocate bounce buffer\n");
			return err;
		}
		mem = &iser_task->data_copy[cmd_dir];
	}

	/* if there is a single dma entry, the dma mr suffices */
	if (mem->dma_nents == 1) {
		struct scatterlist *sg = (struct scatterlist *)mem->buf;

		regd_buf->reg.lkey = device->mr->lkey;
		regd_buf->reg.rkey = device->mr->rkey;
		regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]);
		regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]);
		regd_buf->reg.is_mr = 0;
	} else {
		spin_lock_irqsave(&ib_conn->lock, flags);
		desc = list_first_entry(&ib_conn->fastreg.frwr.pool,
					struct fast_reg_descriptor, list);
		list_del(&desc->list);
		spin_unlock_irqrestore(&ib_conn->lock, flags);
		page_list_len = iser_sg_to_page_vec(mem, device->ib_device,
						    desc->data_frpl->page_list,
						    &offset, &data_size);

		if (page_list_len * SIZE_4K < data_size) {
			iser_err("fast reg page_list too short to hold this SG\n");
			err = -EINVAL;
			goto err_reg;
		}

		err = iser_fast_reg_mr(desc, ib_conn, regd_buf,
				       offset, data_size, page_list_len);
		if (err)
			goto err_reg;
	}

	return 0;
err_reg:
	spin_lock_irqsave(&ib_conn->lock, flags);
	list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
	spin_unlock_irqrestore(&ib_conn->lock, flags);
	return err;
}