1 /* 2 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 #include <linux/module.h> 33 #include <linux/kernel.h> 34 #include <linux/slab.h> 35 #include <linux/mm.h> 36 #include <linux/highmem.h> 37 #include <linux/scatterlist.h> 38 39 #include "iscsi_iser.h" 40 41 #define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ 42 43 /** 44 * iser_start_rdma_unaligned_sg 45 */ 46 static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, 47 enum iser_data_dir cmd_dir) 48 { 49 int dma_nents; 50 struct ib_device *dev; 51 char *mem = NULL; 52 struct iser_data_buf *data = &iser_task->data[cmd_dir]; 53 unsigned long cmd_data_len = data->data_len; 54 55 if (cmd_data_len > ISER_KMALLOC_THRESHOLD) 56 mem = (void *)__get_free_pages(GFP_ATOMIC, 57 ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); 58 else 59 mem = kmalloc(cmd_data_len, GFP_ATOMIC); 60 61 if (mem == NULL) { 62 iser_err("Failed to allocate mem size %d %d for copying sglist\n", 63 data->size,(int)cmd_data_len); 64 return -ENOMEM; 65 } 66 67 if (cmd_dir == ISER_DIR_OUT) { 68 /* copy the unaligned sg the buffer which is used for RDMA */ 69 struct scatterlist *sgl = (struct scatterlist *)data->buf; 70 struct scatterlist *sg; 71 int i; 72 char *p, *from; 73 74 p = mem; 75 for_each_sg(sgl, sg, data->size, i) { 76 from = kmap_atomic(sg_page(sg)); 77 memcpy(p, 78 from + sg->offset, 79 sg->length); 80 kunmap_atomic(from); 81 p += sg->length; 82 } 83 } 84 85 sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len); 86 iser_task->data_copy[cmd_dir].buf = 87 &iser_task->data_copy[cmd_dir].sg_single; 88 iser_task->data_copy[cmd_dir].size = 1; 89 90 iser_task->data_copy[cmd_dir].copy_buf = mem; 91 92 dev = iser_task->iser_conn->ib_conn->device->ib_device; 93 dma_nents = ib_dma_map_sg(dev, 94 &iser_task->data_copy[cmd_dir].sg_single, 95 1, 96 (cmd_dir == ISER_DIR_OUT) ? 97 DMA_TO_DEVICE : DMA_FROM_DEVICE); 98 BUG_ON(dma_nents == 0); 99 100 iser_task->data_copy[cmd_dir].dma_nents = dma_nents; 101 return 0; 102 } 103 104 /** 105 * iser_finalize_rdma_unaligned_sg 106 */ 107 void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, 108 enum iser_data_dir cmd_dir) 109 { 110 struct ib_device *dev; 111 struct iser_data_buf *mem_copy; 112 unsigned long cmd_data_len; 113 114 dev = iser_task->iser_conn->ib_conn->device->ib_device; 115 mem_copy = &iser_task->data_copy[cmd_dir]; 116 117 ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1, 118 (cmd_dir == ISER_DIR_OUT) ? 119 DMA_TO_DEVICE : DMA_FROM_DEVICE); 120 121 if (cmd_dir == ISER_DIR_IN) { 122 char *mem; 123 struct scatterlist *sgl, *sg; 124 unsigned char *p, *to; 125 unsigned int sg_size; 126 int i; 127 128 /* copy back read RDMA to unaligned sg */ 129 mem = mem_copy->copy_buf; 130 131 sgl = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf; 132 sg_size = iser_task->data[ISER_DIR_IN].size; 133 134 p = mem; 135 for_each_sg(sgl, sg, sg_size, i) { 136 to = kmap_atomic(sg_page(sg)); 137 memcpy(to + sg->offset, 138 p, 139 sg->length); 140 kunmap_atomic(to); 141 p += sg->length; 142 } 143 } 144 145 cmd_data_len = iser_task->data[cmd_dir].data_len; 146 147 if (cmd_data_len > ISER_KMALLOC_THRESHOLD) 148 free_pages((unsigned long)mem_copy->copy_buf, 149 ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); 150 else 151 kfree(mem_copy->copy_buf); 152 153 mem_copy->copy_buf = NULL; 154 } 155 156 #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) 157 158 /** 159 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses 160 * and returns the length of resulting physical address array (may be less than 161 * the original due to possible compaction). 162 * 163 * we build a "page vec" under the assumption that the SG meets the RDMA 164 * alignment requirements. Other then the first and last SG elements, all 165 * the "internal" elements can be compacted into a list whose elements are 166 * dma addresses of physical pages. The code supports also the weird case 167 * where --few fragments of the same page-- are present in the SG as 168 * consecutive elements. Also, it handles one entry SG. 169 */ 170 171 static int iser_sg_to_page_vec(struct iser_data_buf *data, 172 struct iser_page_vec *page_vec, 173 struct ib_device *ibdev) 174 { 175 struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; 176 u64 start_addr, end_addr, page, chunk_start = 0; 177 unsigned long total_sz = 0; 178 unsigned int dma_len; 179 int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; 180 181 /* compute the offset of first element */ 182 page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; 183 184 new_chunk = 1; 185 cur_page = 0; 186 for_each_sg(sgl, sg, data->dma_nents, i) { 187 start_addr = ib_sg_dma_address(ibdev, sg); 188 if (new_chunk) 189 chunk_start = start_addr; 190 dma_len = ib_sg_dma_len(ibdev, sg); 191 end_addr = start_addr + dma_len; 192 total_sz += dma_len; 193 194 /* collect page fragments until aligned or end of SG list */ 195 if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { 196 new_chunk = 0; 197 continue; 198 } 199 new_chunk = 1; 200 201 /* address of the first page in the contiguous chunk; 202 masking relevant for the very first SG entry, 203 which might be unaligned */ 204 page = chunk_start & MASK_4K; 205 do { 206 page_vec->pages[cur_page++] = page; 207 page += SIZE_4K; 208 } while (page < end_addr); 209 } 210 211 page_vec->data_size = total_sz; 212 iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); 213 return cur_page; 214 } 215 216 217 /** 218 * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned 219 * for RDMA sub-list of a scatter-gather list of memory buffers, and returns 220 * the number of entries which are aligned correctly. Supports the case where 221 * consecutive SG elements are actually fragments of the same physcial page. 222 */ 223 static int iser_data_buf_aligned_len(struct iser_data_buf *data, 224 struct ib_device *ibdev) 225 { 226 struct scatterlist *sgl, *sg, *next_sg = NULL; 227 u64 start_addr, end_addr; 228 int i, ret_len, start_check = 0; 229 230 if (data->dma_nents == 1) 231 return 1; 232 233 sgl = (struct scatterlist *)data->buf; 234 start_addr = ib_sg_dma_address(ibdev, sgl); 235 236 for_each_sg(sgl, sg, data->dma_nents, i) { 237 if (start_check && !IS_4K_ALIGNED(start_addr)) 238 break; 239 240 next_sg = sg_next(sg); 241 if (!next_sg) 242 break; 243 244 end_addr = start_addr + ib_sg_dma_len(ibdev, sg); 245 start_addr = ib_sg_dma_address(ibdev, next_sg); 246 247 if (end_addr == start_addr) { 248 start_check = 0; 249 continue; 250 } else 251 start_check = 1; 252 253 if (!IS_4K_ALIGNED(end_addr)) 254 break; 255 } 256 ret_len = (next_sg) ? i : i+1; 257 iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", 258 ret_len, data->dma_nents, data); 259 return ret_len; 260 } 261 262 static void iser_data_buf_dump(struct iser_data_buf *data, 263 struct ib_device *ibdev) 264 { 265 struct scatterlist *sgl = (struct scatterlist *)data->buf; 266 struct scatterlist *sg; 267 int i; 268 269 if (iser_debug_level == 0) 270 return; 271 272 for_each_sg(sgl, sg, data->dma_nents, i) 273 iser_warn("sg[%d] dma_addr:0x%lX page:0x%p " 274 "off:0x%x sz:0x%x dma_len:0x%x\n", 275 i, (unsigned long)ib_sg_dma_address(ibdev, sg), 276 sg_page(sg), sg->offset, 277 sg->length, ib_sg_dma_len(ibdev, sg)); 278 } 279 280 static void iser_dump_page_vec(struct iser_page_vec *page_vec) 281 { 282 int i; 283 284 iser_err("page vec length %d data size %d\n", 285 page_vec->length, page_vec->data_size); 286 for (i = 0; i < page_vec->length; i++) 287 iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); 288 } 289 290 static void iser_page_vec_build(struct iser_data_buf *data, 291 struct iser_page_vec *page_vec, 292 struct ib_device *ibdev) 293 { 294 int page_vec_len = 0; 295 296 page_vec->length = 0; 297 page_vec->offset = 0; 298 299 iser_dbg("Translating sg sz: %d\n", data->dma_nents); 300 page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev); 301 iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len); 302 303 page_vec->length = page_vec_len; 304 305 if (page_vec_len * SIZE_4K < page_vec->data_size) { 306 iser_err("page_vec too short to hold this SG\n"); 307 iser_data_buf_dump(data, ibdev); 308 iser_dump_page_vec(page_vec); 309 BUG(); 310 } 311 } 312 313 int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, 314 struct iser_data_buf *data, 315 enum iser_data_dir iser_dir, 316 enum dma_data_direction dma_dir) 317 { 318 struct ib_device *dev; 319 320 iser_task->dir[iser_dir] = 1; 321 dev = iser_task->iser_conn->ib_conn->device->ib_device; 322 323 data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); 324 if (data->dma_nents == 0) { 325 iser_err("dma_map_sg failed!!!\n"); 326 return -EINVAL; 327 } 328 return 0; 329 } 330 331 void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task) 332 { 333 struct ib_device *dev; 334 struct iser_data_buf *data; 335 336 dev = iser_task->iser_conn->ib_conn->device->ib_device; 337 338 if (iser_task->dir[ISER_DIR_IN]) { 339 data = &iser_task->data[ISER_DIR_IN]; 340 ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); 341 } 342 343 if (iser_task->dir[ISER_DIR_OUT]) { 344 data = &iser_task->data[ISER_DIR_OUT]; 345 ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE); 346 } 347 } 348 349 /** 350 * iser_reg_rdma_mem - Registers memory intended for RDMA, 351 * obtaining rkey and va 352 * 353 * returns 0 on success, errno code on failure 354 */ 355 int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, 356 enum iser_data_dir cmd_dir) 357 { 358 struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; 359 struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; 360 struct iser_device *device = ib_conn->device; 361 struct ib_device *ibdev = device->ib_device; 362 struct iser_data_buf *mem = &iser_task->data[cmd_dir]; 363 struct iser_regd_buf *regd_buf; 364 int aligned_len; 365 int err; 366 int i; 367 struct scatterlist *sg; 368 369 regd_buf = &iser_task->rdma_regd[cmd_dir]; 370 371 aligned_len = iser_data_buf_aligned_len(mem, ibdev); 372 if (aligned_len != mem->dma_nents) { 373 iscsi_conn->fmr_unalign_cnt++; 374 iser_warn("rdma alignment violation %d/%d aligned\n", 375 aligned_len, mem->size); 376 iser_data_buf_dump(mem, ibdev); 377 378 /* unmap the command data before accessing it */ 379 iser_dma_unmap_task_data(iser_task); 380 381 /* allocate copy buf, if we are writing, copy the */ 382 /* unaligned scatterlist, dma map the copy */ 383 if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0) 384 return -ENOMEM; 385 mem = &iser_task->data_copy[cmd_dir]; 386 } 387 388 /* if there a single dma entry, FMR is not needed */ 389 if (mem->dma_nents == 1) { 390 sg = (struct scatterlist *)mem->buf; 391 392 regd_buf->reg.lkey = device->mr->lkey; 393 regd_buf->reg.rkey = device->mr->rkey; 394 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); 395 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); 396 regd_buf->reg.is_fmr = 0; 397 398 iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " 399 "va: 0x%08lX sz: %ld]\n", 400 (unsigned int)regd_buf->reg.lkey, 401 (unsigned int)regd_buf->reg.rkey, 402 (unsigned long)regd_buf->reg.va, 403 (unsigned long)regd_buf->reg.len); 404 } else { /* use FMR for multiple dma entries */ 405 iser_page_vec_build(mem, ib_conn->page_vec, ibdev); 406 err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, ®d_buf->reg); 407 if (err) { 408 iser_data_buf_dump(mem, ibdev); 409 iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", 410 mem->dma_nents, 411 ntoh24(iser_task->desc.iscsi_header.dlength)); 412 iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", 413 ib_conn->page_vec->data_size, ib_conn->page_vec->length, 414 ib_conn->page_vec->offset); 415 for (i=0 ; i<ib_conn->page_vec->length ; i++) 416 iser_err("page_vec[%d] = 0x%llx\n", i, 417 (unsigned long long) ib_conn->page_vec->pages[i]); 418 return err; 419 } 420 } 421 return 0; 422 } 423