// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include <linux/libnvdimm.h>

#include "rxe.h"
#include "rxe_loc.h"

/* Return a random 8 bit key value that is
 * different than the last_key. Set last_key to -1
 * if this is the first key for an MR or MW
 */
u8 rxe_get_next_key(u32 last_key)
{
        u8 key;

        do {
                get_random_bytes(&key, 1);
        } while (key == last_key);

        return key;
}

int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
{
        switch (mr->ibmr.type) {
        case IB_MR_TYPE_DMA:
                return 0;

        case IB_MR_TYPE_USER:
        case IB_MR_TYPE_MEM_REG:
                if (iova < mr->ibmr.iova || length > mr->ibmr.length ||
                    iova > mr->ibmr.iova + mr->ibmr.length - length)
                        return -EFAULT;
                return 0;

        default:
                rxe_dbg_mr(mr, "type (%d) not supported\n", mr->ibmr.type);
                return -EFAULT;
        }
}

#define IB_ACCESS_REMOTE        (IB_ACCESS_REMOTE_READ          \
                                 | IB_ACCESS_REMOTE_WRITE       \
                                 | IB_ACCESS_REMOTE_ATOMIC)

static void rxe_mr_init(int access, struct rxe_mr *mr)
{
        u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1);
        u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;

        /* set ibmr->l/rkey and also copy into private l/rkey
         * for user MRs these will always be the same
         * for cases where caller 'owns' the key portion
         * they may be different until REG_MR WQE is executed.
         */
        mr->lkey = mr->ibmr.lkey = lkey;
        mr->rkey = mr->ibmr.rkey = rkey;

        mr->state = RXE_MR_STATE_INVALID;
}

/* Allocate the two-level map table: num_map map chunks, each holding
 * RXE_BUF_PER_MAP buffer descriptors, enough to cover num_buf pages.
 */
static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
{
        int i;
        int num_map;
        struct rxe_map **map = mr->map;

        num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;

        mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
        if (!mr->map)
                goto err1;

        for (i = 0; i < num_map; i++) {
                mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
                if (!mr->map[i])
                        goto err2;
        }

        BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP));

        mr->map_shift = ilog2(RXE_BUF_PER_MAP);
        mr->map_mask = RXE_BUF_PER_MAP - 1;

        mr->num_buf = num_buf;
        mr->num_map = num_map;
        mr->max_buf = num_map * RXE_BUF_PER_MAP;

        return 0;

err2:
        for (i--; i >= 0; i--)
                kfree(mr->map[i]);

        kfree(mr->map);
        mr->map = NULL;
err1:
        return -ENOMEM;
}

void rxe_mr_init_dma(int access, struct rxe_mr *mr)
{
        rxe_mr_init(access, mr);

        mr->access = access;
        mr->state = RXE_MR_STATE_VALID;
        mr->ibmr.type = IB_MR_TYPE_DMA;
}

static bool is_pmem_page(struct page *pg)
{
        unsigned long paddr = page_to_phys(pg);

        return REGION_INTERSECTS ==
               region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM,
                                 IORES_DESC_PERSISTENT_MEMORY);
}

int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
                     int access, struct rxe_mr *mr)
{
        struct rxe_map **map;
        struct rxe_phys_buf *buf = NULL;
        struct ib_umem *umem;
        struct sg_page_iter sg_iter;
        int num_buf;
        void *vaddr;
        int err;

        umem = ib_umem_get(&rxe->ib_dev, start, length, access);
        if (IS_ERR(umem)) {
                rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n",
                           (int)PTR_ERR(umem));
                err = PTR_ERR(umem);
                goto err_out;
        }

        num_buf = ib_umem_num_pages(umem);

        rxe_mr_init(access, mr);

        err = rxe_mr_alloc(mr, num_buf);
        if (err) {
                rxe_dbg_mr(mr, "Unable to allocate memory for map\n");
                goto err_release_umem;
        }

        mr->page_shift = PAGE_SHIFT;
        mr->page_mask = PAGE_SIZE - 1;

        num_buf = 0;
        map = mr->map;
        if (length > 0) {
                bool persistent_access = access & IB_ACCESS_FLUSH_PERSISTENT;

                buf = map[0]->buf;
                for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
                        struct page *pg = sg_page_iter_page(&sg_iter);

                        if (persistent_access && !is_pmem_page(pg)) {
                                rxe_dbg_mr(mr, "Unable to register persistent access to non-pmem device\n");
                                err = -EINVAL;
                                goto err_release_umem;
                        }

                        if (num_buf >= RXE_BUF_PER_MAP) {
                                map++;
                                buf = map[0]->buf;
                                num_buf = 0;
                        }

                        vaddr = page_address(pg);
                        if (!vaddr) {
                                rxe_dbg_mr(mr, "Unable to get virtual address\n");
                                err = -ENOMEM;
                                goto err_release_umem;
                        }
                        buf->addr = (uintptr_t)vaddr;
                        buf->size = PAGE_SIZE;
                        num_buf++;
                        buf++;
                }
        }

        mr->umem = umem;
        mr->access = access;
        mr->offset = ib_umem_offset(umem);
        mr->state = RXE_MR_STATE_VALID;
        mr->ibmr.type = IB_MR_TYPE_USER;
        mr->ibmr.page_size = PAGE_SIZE;

        return 0;

err_release_umem:
        ib_umem_release(umem);
err_out:
        return err;
}

int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
{
        int err;

        /* always allow remote access for FMRs */
        rxe_mr_init(IB_ACCESS_REMOTE, mr);

        err = rxe_mr_alloc(mr, max_pages);
        if (err)
                goto err1;

        mr->max_buf = max_pages;
        mr->state = RXE_MR_STATE_FREE;
        mr->ibmr.type = IB_MR_TYPE_MEM_REG;

        return 0;

err1:
        return err;
}

/* Translate an iova within the mr into a map index, buffer index and
 * byte offset into that buffer. Uses shifts and masks when all buffers
 * are page sized, otherwise walks the buffer list.
 */
static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
                        size_t *offset_out)
{
        size_t offset = iova - mr->ibmr.iova + mr->offset;
        int map_index;
        int buf_index;
        u64 length;

        if (likely(mr->page_shift)) {
                *offset_out = offset & mr->page_mask;
                offset >>= mr->page_shift;
                *n_out = offset & mr->map_mask;
                *m_out = offset >> mr->map_shift;
        } else {
                map_index = 0;
                buf_index = 0;

                length = mr->map[map_index]->buf[buf_index].size;

                while (offset >= length) {
                        offset -= length;
                        buf_index++;

                        if (buf_index == RXE_BUF_PER_MAP) {
                                map_index++;
                                buf_index = 0;
                        }
                        length = mr->map[map_index]->buf[buf_index].size;
                }

                *m_out = map_index;
                *n_out = buf_index;
                *offset_out = offset;
        }
}

/* Return the kernel virtual address of iova if the mr is valid and
 * [iova, iova + length) fits inside a single buffer, otherwise NULL.
 */
void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
{
        size_t offset;
        int m, n;
        void *addr;

        if (mr->state != RXE_MR_STATE_VALID) {
                rxe_dbg_mr(mr, "Not in valid state\n");
                addr = NULL;
                goto out;
        }

        if (!mr->map) {
                addr = (void *)(uintptr_t)iova;
                goto out;
        }

        if (mr_check_range(mr, iova, length)) {
                rxe_dbg_mr(mr, "Range violation\n");
                addr = NULL;
                goto out;
        }

        lookup_iova(mr, iova, &m, &n, &offset);

        if (offset + length > mr->map[m]->buf[n].size) {
                rxe_dbg_mr(mr, "Crosses page boundary\n");
                addr = NULL;
                goto out;
        }

        addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset;

out:
        return addr;
}

int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length)
{
        size_t offset;

        if (length == 0)
                return 0;

        if (mr->ibmr.type == IB_MR_TYPE_DMA)
                return -EFAULT;

        offset = (iova - mr->ibmr.iova + mr->offset) & mr->page_mask;
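        /* write back cache lines to persistent memory one page (or the
         * remainder of the first page) at a time
         */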
        while (length > 0) {
                u8 *va;
                int bytes;

                bytes = mr->ibmr.page_size - offset;
                if (bytes > length)
                        bytes = length;

                va = iova_to_vaddr(mr, iova, length);
                if (!va)
                        return -EFAULT;

                arch_wb_cache_pmem(va, bytes);

                length -= bytes;
                iova += bytes;
                offset = 0;
        }

        return 0;
}

/* copy data from a range (vaddr, vaddr+length-1) to or from
 * a mr object starting at iova.
 */
int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
                enum rxe_mr_copy_dir dir)
{
        int err;
        int bytes;
        u8 *va;
        struct rxe_map **map;
        struct rxe_phys_buf *buf;
        int m;
        int i;
        size_t offset;

        if (length == 0)
                return 0;

        if (mr->ibmr.type == IB_MR_TYPE_DMA) {
                u8 *src, *dest;

                src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova);

                dest = (dir == RXE_TO_MR_OBJ) ? ((void *)(uintptr_t)iova) : addr;

                memcpy(dest, src, length);

                return 0;
        }

        WARN_ON_ONCE(!mr->map);

        err = mr_check_range(mr, iova, length);
        if (err) {
                err = -EFAULT;
                goto err1;
        }

        lookup_iova(mr, iova, &m, &i, &offset);

        map = mr->map + m;
        buf = map[0]->buf + i;

        while (length > 0) {
                u8 *src, *dest;

                va = (u8 *)(uintptr_t)buf->addr + offset;
                src = (dir == RXE_TO_MR_OBJ) ? addr : va;
                dest = (dir == RXE_TO_MR_OBJ) ? va : addr;

                bytes = buf->size - offset;

                if (bytes > length)
                        bytes = length;

                memcpy(dest, src, bytes);

                length -= bytes;
                addr += bytes;

                offset = 0;
                buf++;
                i++;

                if (i == RXE_BUF_PER_MAP) {
                        i = 0;
                        map++;
                        buf = map[0]->buf;
                }
        }

        return 0;

err1:
        return err;
}

/* copy data in or out of a wqe, i.e. sg list
 * under the control of a dma descriptor
 */
int copy_data(
        struct rxe_pd *pd,
        int access,
        struct rxe_dma_info *dma,
        void *addr,
        int length,
        enum rxe_mr_copy_dir dir)
{
        int bytes;
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;
        struct rxe_mr *mr = NULL;
        u64 iova;
        int err;

        if (length == 0)
                return 0;

        if (length > resid) {
                err = -EINVAL;
                goto err2;
        }

        if (sge->length && (offset < sge->length)) {
                mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
                if (!mr) {
                        err = -EINVAL;
                        goto err1;
                }
        }

        while (length > 0) {
                bytes = length;

                if (offset >= sge->length) {
                        if (mr) {
                                rxe_put(mr);
                                mr = NULL;
                        }
                        sge++;
                        dma->cur_sge++;
                        offset = 0;

                        if (dma->cur_sge >= dma->num_sge) {
                                err = -ENOSPC;
                                goto err2;
                        }

                        if (sge->length) {
                                mr = lookup_mr(pd, access, sge->lkey,
                                               RXE_LOOKUP_LOCAL);
                                if (!mr) {
                                        err = -EINVAL;
                                        goto err1;
                                }
                        } else {
                                continue;
                        }
                }

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                if (bytes > 0) {
                        iova = sge->addr + offset;

                        err = rxe_mr_copy(mr, iova, addr, bytes, dir);
                        if (err)
                                goto err2;

                        offset += bytes;
                        resid -= bytes;
                        length -= bytes;
                        addr += bytes;
                }
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        if (mr)
                rxe_put(mr);

        return 0;

err2:
        if (mr)
                rxe_put(mr);
err1:
        return err;
}

/* Skip length bytes of the dma descriptor's sg list without copying,
 * updating cur_sge, sge_offset and resid.
 */
int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;

        while (length) {
                unsigned int bytes;

                if (offset >= sge->length) {
                        sge++;
                        dma->cur_sge++;
                        offset = 0;
                        if (dma->cur_sge >= dma->num_sge)
                                return -ENOSPC;
                }

                bytes = length;

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                offset += bytes;
                resid -= bytes;
                length -= bytes;
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        return 0;
}

/* (1) find the mr corresponding to lkey/rkey
 *     depending on lookup_type
 * (2) verify that the (qp) pd matches the mr pd
 * (3) verify that the mr can support the requested access
 * (4) verify that mr state is valid
 */
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
                         enum rxe_mr_lookup_type type)
{
        struct rxe_mr *mr;
        struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
        int index = key >> 8;

        mr = rxe_pool_get_index(&rxe->mr_pool, index);
        if (!mr)
                return NULL;

        if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
                     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
                     mr_pd(mr) != pd || ((access & mr->access) != access) ||
                     mr->state != RXE_MR_STATE_VALID)) {
                rxe_put(mr);
                mr = NULL;
        }

        return mr;
}

int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
{
        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
        struct rxe_mr *mr;
        int ret;

        mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
        if (!mr) {
                rxe_dbg_qp(qp, "No MR for key %#x\n", key);
                ret = -EINVAL;
                goto err;
        }

        if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) {
                rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n",
                           key, (mr->rkey ? mr->rkey : mr->lkey));
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (atomic_read(&mr->num_mw) > 0) {
                rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n");
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) {
                rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        mr->state = RXE_MR_STATE_FREE;
        ret = 0;

err_drop_ref:
        rxe_put(mr);
err:
        return ret;
}

/* user can (re)register fast MR by executing a REG_MR WQE.
 * user is expected to hold a reference on the ib mr until the
 * WQE completes.
 * Once a fast MR is created this is the only way to change the
 * private keys. It is the responsibility of the user to maintain
 * the ib mr keys in sync with rxe mr keys.
 */
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
{
        struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
        u32 key = wqe->wr.wr.reg.key;
        u32 access = wqe->wr.wr.reg.access;

        /* user can only register MR in free state */
        if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
                rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey);
                return -EINVAL;
        }

        /* user can only register mr with qp in same protection domain */
        if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
                rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n");
                return -EINVAL;
        }

        /* user is only allowed to change key portion of l/rkey */
        if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) {
                rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n",
                           key, mr->lkey);
                return -EINVAL;
        }

        mr->access = access;
        mr->lkey = key;
        mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0;
        mr->ibmr.iova = wqe->wr.wr.reg.mr->iova;
        mr->state = RXE_MR_STATE_VALID;

        return 0;
}

int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
        struct rxe_mr *mr = to_rmr(ibmr);

        /* See IBA 10.6.7.2.6 */
        if (atomic_read(&mr->num_mw) > 0)
                return -EINVAL;

        rxe_cleanup(mr);

        return 0;
}

void rxe_mr_cleanup(struct rxe_pool_elem *elem)
{
        struct rxe_mr *mr = container_of(elem, typeof(*mr), elem);
        int i;

        rxe_put(mr_pd(mr));
        ib_umem_release(mr->umem);

        if (mr->map) {
                for (i = 0; i < mr->num_map; i++)
                        kfree(mr->map[i]);

                kfree(mr->map);
        }
}