1 /* 2 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. 3 * Copyright (c) 2013 Mellanox Technologies. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 #include <linux/module.h> 34 #include <linux/kernel.h> 35 #include <linux/slab.h> 36 #include <linux/mm.h> 37 #include <linux/highmem.h> 38 #include <linux/scatterlist.h> 39 40 #include "iscsi_iser.h" 41 42 #define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ 43 44 /** 45 * iser_start_rdma_unaligned_sg 46 */ 47 static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, 48 enum iser_data_dir cmd_dir) 49 { 50 int dma_nents; 51 struct ib_device *dev; 52 char *mem = NULL; 53 struct iser_data_buf *data = &iser_task->data[cmd_dir]; 54 unsigned long cmd_data_len = data->data_len; 55 56 if (cmd_data_len > ISER_KMALLOC_THRESHOLD) 57 mem = (void *)__get_free_pages(GFP_ATOMIC, 58 ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); 59 else 60 mem = kmalloc(cmd_data_len, GFP_ATOMIC); 61 62 if (mem == NULL) { 63 iser_err("Failed to allocate mem size %d %d for copying sglist\n", 64 data->size,(int)cmd_data_len); 65 return -ENOMEM; 66 } 67 68 if (cmd_dir == ISER_DIR_OUT) { 69 /* copy the unaligned sg the buffer which is used for RDMA */ 70 struct scatterlist *sgl = (struct scatterlist *)data->buf; 71 struct scatterlist *sg; 72 int i; 73 char *p, *from; 74 75 p = mem; 76 for_each_sg(sgl, sg, data->size, i) { 77 from = kmap_atomic(sg_page(sg)); 78 memcpy(p, 79 from + sg->offset, 80 sg->length); 81 kunmap_atomic(from); 82 p += sg->length; 83 } 84 } 85 86 sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len); 87 iser_task->data_copy[cmd_dir].buf = 88 &iser_task->data_copy[cmd_dir].sg_single; 89 iser_task->data_copy[cmd_dir].size = 1; 90 91 iser_task->data_copy[cmd_dir].copy_buf = mem; 92 93 dev = iser_task->iser_conn->ib_conn->device->ib_device; 94 dma_nents = ib_dma_map_sg(dev, 95 &iser_task->data_copy[cmd_dir].sg_single, 96 1, 97 (cmd_dir == ISER_DIR_OUT) ? 98 DMA_TO_DEVICE : DMA_FROM_DEVICE); 99 BUG_ON(dma_nents == 0); 100 101 iser_task->data_copy[cmd_dir].dma_nents = dma_nents; 102 return 0; 103 } 104 105 /** 106 * iser_finalize_rdma_unaligned_sg 107 */ 108 void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, 109 enum iser_data_dir cmd_dir) 110 { 111 struct ib_device *dev; 112 struct iser_data_buf *mem_copy; 113 unsigned long cmd_data_len; 114 115 dev = iser_task->iser_conn->ib_conn->device->ib_device; 116 mem_copy = &iser_task->data_copy[cmd_dir]; 117 118 ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1, 119 (cmd_dir == ISER_DIR_OUT) ? 120 DMA_TO_DEVICE : DMA_FROM_DEVICE); 121 122 if (cmd_dir == ISER_DIR_IN) { 123 char *mem; 124 struct scatterlist *sgl, *sg; 125 unsigned char *p, *to; 126 unsigned int sg_size; 127 int i; 128 129 /* copy back read RDMA to unaligned sg */ 130 mem = mem_copy->copy_buf; 131 132 sgl = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf; 133 sg_size = iser_task->data[ISER_DIR_IN].size; 134 135 p = mem; 136 for_each_sg(sgl, sg, sg_size, i) { 137 to = kmap_atomic(sg_page(sg)); 138 memcpy(to + sg->offset, 139 p, 140 sg->length); 141 kunmap_atomic(to); 142 p += sg->length; 143 } 144 } 145 146 cmd_data_len = iser_task->data[cmd_dir].data_len; 147 148 if (cmd_data_len > ISER_KMALLOC_THRESHOLD) 149 free_pages((unsigned long)mem_copy->copy_buf, 150 ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); 151 else 152 kfree(mem_copy->copy_buf); 153 154 mem_copy->copy_buf = NULL; 155 } 156 157 #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) 158 159 /** 160 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses 161 * and returns the length of resulting physical address array (may be less than 162 * the original due to possible compaction). 163 * 164 * we build a "page vec" under the assumption that the SG meets the RDMA 165 * alignment requirements. Other then the first and last SG elements, all 166 * the "internal" elements can be compacted into a list whose elements are 167 * dma addresses of physical pages. The code supports also the weird case 168 * where --few fragments of the same page-- are present in the SG as 169 * consecutive elements. Also, it handles one entry SG. 170 */ 171 172 static int iser_sg_to_page_vec(struct iser_data_buf *data, 173 struct iser_page_vec *page_vec, 174 struct ib_device *ibdev) 175 { 176 struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; 177 u64 start_addr, end_addr, page, chunk_start = 0; 178 unsigned long total_sz = 0; 179 unsigned int dma_len; 180 int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; 181 182 /* compute the offset of first element */ 183 page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; 184 185 new_chunk = 1; 186 cur_page = 0; 187 for_each_sg(sgl, sg, data->dma_nents, i) { 188 start_addr = ib_sg_dma_address(ibdev, sg); 189 if (new_chunk) 190 chunk_start = start_addr; 191 dma_len = ib_sg_dma_len(ibdev, sg); 192 end_addr = start_addr + dma_len; 193 total_sz += dma_len; 194 195 /* collect page fragments until aligned or end of SG list */ 196 if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { 197 new_chunk = 0; 198 continue; 199 } 200 new_chunk = 1; 201 202 /* address of the first page in the contiguous chunk; 203 masking relevant for the very first SG entry, 204 which might be unaligned */ 205 page = chunk_start & MASK_4K; 206 do { 207 page_vec->pages[cur_page++] = page; 208 page += SIZE_4K; 209 } while (page < end_addr); 210 } 211 212 page_vec->data_size = total_sz; 213 iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); 214 return cur_page; 215 } 216 217 218 /** 219 * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned 220 * for RDMA sub-list of a scatter-gather list of memory buffers, and returns 221 * the number of entries which are aligned correctly. Supports the case where 222 * consecutive SG elements are actually fragments of the same physcial page. 223 */ 224 static int iser_data_buf_aligned_len(struct iser_data_buf *data, 225 struct ib_device *ibdev) 226 { 227 struct scatterlist *sgl, *sg, *next_sg = NULL; 228 u64 start_addr, end_addr; 229 int i, ret_len, start_check = 0; 230 231 if (data->dma_nents == 1) 232 return 1; 233 234 sgl = (struct scatterlist *)data->buf; 235 start_addr = ib_sg_dma_address(ibdev, sgl); 236 237 for_each_sg(sgl, sg, data->dma_nents, i) { 238 if (start_check && !IS_4K_ALIGNED(start_addr)) 239 break; 240 241 next_sg = sg_next(sg); 242 if (!next_sg) 243 break; 244 245 end_addr = start_addr + ib_sg_dma_len(ibdev, sg); 246 start_addr = ib_sg_dma_address(ibdev, next_sg); 247 248 if (end_addr == start_addr) { 249 start_check = 0; 250 continue; 251 } else 252 start_check = 1; 253 254 if (!IS_4K_ALIGNED(end_addr)) 255 break; 256 } 257 ret_len = (next_sg) ? i : i+1; 258 iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", 259 ret_len, data->dma_nents, data); 260 return ret_len; 261 } 262 263 static void iser_data_buf_dump(struct iser_data_buf *data, 264 struct ib_device *ibdev) 265 { 266 struct scatterlist *sgl = (struct scatterlist *)data->buf; 267 struct scatterlist *sg; 268 int i; 269 270 if (iser_debug_level == 0) 271 return; 272 273 for_each_sg(sgl, sg, data->dma_nents, i) 274 iser_warn("sg[%d] dma_addr:0x%lX page:0x%p " 275 "off:0x%x sz:0x%x dma_len:0x%x\n", 276 i, (unsigned long)ib_sg_dma_address(ibdev, sg), 277 sg_page(sg), sg->offset, 278 sg->length, ib_sg_dma_len(ibdev, sg)); 279 } 280 281 static void iser_dump_page_vec(struct iser_page_vec *page_vec) 282 { 283 int i; 284 285 iser_err("page vec length %d data size %d\n", 286 page_vec->length, page_vec->data_size); 287 for (i = 0; i < page_vec->length; i++) 288 iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); 289 } 290 291 static void iser_page_vec_build(struct iser_data_buf *data, 292 struct iser_page_vec *page_vec, 293 struct ib_device *ibdev) 294 { 295 int page_vec_len = 0; 296 297 page_vec->length = 0; 298 page_vec->offset = 0; 299 300 iser_dbg("Translating sg sz: %d\n", data->dma_nents); 301 page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev); 302 iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len); 303 304 page_vec->length = page_vec_len; 305 306 if (page_vec_len * SIZE_4K < page_vec->data_size) { 307 iser_err("page_vec too short to hold this SG\n"); 308 iser_data_buf_dump(data, ibdev); 309 iser_dump_page_vec(page_vec); 310 BUG(); 311 } 312 } 313 314 int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, 315 struct iser_data_buf *data, 316 enum iser_data_dir iser_dir, 317 enum dma_data_direction dma_dir) 318 { 319 struct ib_device *dev; 320 321 iser_task->dir[iser_dir] = 1; 322 dev = iser_task->iser_conn->ib_conn->device->ib_device; 323 324 data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); 325 if (data->dma_nents == 0) { 326 iser_err("dma_map_sg failed!!!\n"); 327 return -EINVAL; 328 } 329 return 0; 330 } 331 332 void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task) 333 { 334 struct ib_device *dev; 335 struct iser_data_buf *data; 336 337 dev = iser_task->iser_conn->ib_conn->device->ib_device; 338 339 if (iser_task->dir[ISER_DIR_IN]) { 340 data = &iser_task->data[ISER_DIR_IN]; 341 ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); 342 } 343 344 if (iser_task->dir[ISER_DIR_OUT]) { 345 data = &iser_task->data[ISER_DIR_OUT]; 346 ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE); 347 } 348 } 349 350 /** 351 * iser_reg_rdma_mem - Registers memory intended for RDMA, 352 * obtaining rkey and va 353 * 354 * returns 0 on success, errno code on failure 355 */ 356 int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, 357 enum iser_data_dir cmd_dir) 358 { 359 struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; 360 struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; 361 struct iser_device *device = ib_conn->device; 362 struct ib_device *ibdev = device->ib_device; 363 struct iser_data_buf *mem = &iser_task->data[cmd_dir]; 364 struct iser_regd_buf *regd_buf; 365 int aligned_len; 366 int err; 367 int i; 368 struct scatterlist *sg; 369 370 regd_buf = &iser_task->rdma_regd[cmd_dir]; 371 372 aligned_len = iser_data_buf_aligned_len(mem, ibdev); 373 if (aligned_len != mem->dma_nents || 374 (!ib_conn->fmr_pool && mem->dma_nents > 1)) { 375 iscsi_conn->fmr_unalign_cnt++; 376 iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", 377 aligned_len, mem->size); 378 iser_data_buf_dump(mem, ibdev); 379 380 /* unmap the command data before accessing it */ 381 iser_dma_unmap_task_data(iser_task); 382 383 /* allocate copy buf, if we are writing, copy the */ 384 /* unaligned scatterlist, dma map the copy */ 385 if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0) 386 return -ENOMEM; 387 mem = &iser_task->data_copy[cmd_dir]; 388 } 389 390 /* if there a single dma entry, FMR is not needed */ 391 if (mem->dma_nents == 1) { 392 sg = (struct scatterlist *)mem->buf; 393 394 regd_buf->reg.lkey = device->mr->lkey; 395 regd_buf->reg.rkey = device->mr->rkey; 396 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); 397 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); 398 regd_buf->reg.is_fmr = 0; 399 400 iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " 401 "va: 0x%08lX sz: %ld]\n", 402 (unsigned int)regd_buf->reg.lkey, 403 (unsigned int)regd_buf->reg.rkey, 404 (unsigned long)regd_buf->reg.va, 405 (unsigned long)regd_buf->reg.len); 406 } else { /* use FMR for multiple dma entries */ 407 iser_page_vec_build(mem, ib_conn->page_vec, ibdev); 408 err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, ®d_buf->reg); 409 if (err && err != -EAGAIN) { 410 iser_data_buf_dump(mem, ibdev); 411 iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", 412 mem->dma_nents, 413 ntoh24(iser_task->desc.iscsi_header.dlength)); 414 iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", 415 ib_conn->page_vec->data_size, ib_conn->page_vec->length, 416 ib_conn->page_vec->offset); 417 for (i=0 ; i<ib_conn->page_vec->length ; i++) 418 iser_err("page_vec[%d] = 0x%llx\n", i, 419 (unsigned long long) ib_conn->page_vec->pages[i]); 420 } 421 if (err) 422 return err; 423 } 424 return 0; 425 } 426