// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include <linux/libnvdimm.h>

#include "rxe.h"
#include "rxe_loc.h"

/* Return a random 8 bit key value that is
 * different than the last_key. Set last_key to -1
 * if this is the first key for an MR or MW
 */
u8 rxe_get_next_key(u32 last_key)
{
        u8 key;

        do {
                get_random_bytes(&key, 1);
        } while (key == last_key);

        return key;
}

int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
{
        switch (mr->ibmr.type) {
        case IB_MR_TYPE_DMA:
                return 0;

        case IB_MR_TYPE_USER:
        case IB_MR_TYPE_MEM_REG:
                if (iova < mr->ibmr.iova ||
                    iova + length > mr->ibmr.iova + mr->ibmr.length) {
                        rxe_dbg_mr(mr, "iova/length out of range");
                        return -EINVAL;
                }
                return 0;

        default:
                rxe_dbg_mr(mr, "mr type not supported\n");
                return -EINVAL;
        }
}

#define IB_ACCESS_REMOTE        (IB_ACCESS_REMOTE_READ          \
                                 | IB_ACCESS_REMOTE_WRITE       \
                                 | IB_ACCESS_REMOTE_ATOMIC)

static void rxe_mr_init(int access, struct rxe_mr *mr)
{
        u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1);
        u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;

        /* set ibmr->l/rkey and also copy into private l/rkey
         * for user MRs these will always be the same
         * for cases where caller 'owns' the key portion
         * they may be different until REG_MR WQE is executed.
         */
        mr->lkey = mr->ibmr.lkey = lkey;
        mr->rkey = mr->ibmr.rkey = rkey;

        mr->access = access;
        mr->ibmr.page_size = PAGE_SIZE;
        mr->page_mask = PAGE_MASK;
        mr->page_shift = PAGE_SHIFT;
        mr->state = RXE_MR_STATE_INVALID;
}

void rxe_mr_init_dma(int access, struct rxe_mr *mr)
{
        rxe_mr_init(access, mr);

        mr->state = RXE_MR_STATE_VALID;
        mr->ibmr.type = IB_MR_TYPE_DMA;
}

static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova)
{
        return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift);
}

static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova)
{
        return iova & (mr_page_size(mr) - 1);
}

static bool is_pmem_page(struct page *pg)
{
        unsigned long paddr = page_to_phys(pg);

        return REGION_INTERSECTS ==
               region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM,
                                 IORES_DESC_PERSISTENT_MEMORY);
}

static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt)
{
        XA_STATE(xas, &mr->page_list, 0);
        struct sg_page_iter sg_iter;
        struct page *page;
        bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);

        __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0);
        if (!__sg_page_iter_next(&sg_iter))
                return 0;

        do {
                xas_lock(&xas);
                while (true) {
                        page = sg_page_iter_page(&sg_iter);

                        if (persistent && !is_pmem_page(page)) {
                                rxe_dbg_mr(mr, "Page can't be persistent\n");
                                xas_set_err(&xas, -EINVAL);
                                break;
                        }

                        xas_store(&xas, page);
                        if (xas_error(&xas))
                                break;
                        xas_next(&xas);
                        if (!__sg_page_iter_next(&sg_iter))
                                break;
                }
                xas_unlock(&xas);
        } while (xas_nomem(&xas, GFP_KERNEL));

        return xas_error(&xas);
}
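
/*
 * Illustrative example (values are made up): with the default 4 KiB page
 * size and ibmr.iova = 0x10000, an iova of 0x12345 resolves through
 * rxe_mr_iova_to_index()/rxe_mr_iova_to_page_offset() above to
 * index = (0x12345 >> 12) - (0x10000 >> 12) = 2 and
 * page_offset = 0x12345 & 0xfff = 0x345, i.e. the third entry in page_list.
 */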

int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
                     int access, struct rxe_mr *mr)
{
        struct ib_umem *umem;
        int err;

        rxe_mr_init(access, mr);

        xa_init(&mr->page_list);

        umem = ib_umem_get(&rxe->ib_dev, start, length, access);
        if (IS_ERR(umem)) {
                rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n",
                           (int)PTR_ERR(umem));
                return PTR_ERR(umem);
        }

        err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt);
        if (err) {
                ib_umem_release(umem);
                return err;
        }

        mr->umem = umem;
        mr->ibmr.type = IB_MR_TYPE_USER;
        mr->state = RXE_MR_STATE_VALID;

        return 0;
}

static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
{
        XA_STATE(xas, &mr->page_list, 0);
        int i = 0;
        int err;

        xa_init(&mr->page_list);

        do {
                xas_lock(&xas);
                while (i != num_buf) {
                        xas_store(&xas, XA_ZERO_ENTRY);
                        if (xas_error(&xas))
                                break;
                        xas_next(&xas);
                        i++;
                }
                xas_unlock(&xas);
        } while (xas_nomem(&xas, GFP_KERNEL));

        err = xas_error(&xas);
        if (err)
                return err;

        mr->num_buf = num_buf;

        return 0;
}

int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
{
        int err;

        /* always allow remote access for FMRs */
        rxe_mr_init(IB_ACCESS_REMOTE, mr);

        err = rxe_mr_alloc(mr, max_pages);
        if (err)
                goto err1;

        mr->state = RXE_MR_STATE_FREE;
        mr->ibmr.type = IB_MR_TYPE_MEM_REG;

        return 0;

err1:
        return err;
}

static int rxe_set_page(struct ib_mr *ibmr, u64 iova)
{
        struct rxe_mr *mr = to_rmr(ibmr);
        struct page *page = virt_to_page(iova & mr->page_mask);
        bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
        int err;

        if (persistent && !is_pmem_page(page)) {
                rxe_dbg_mr(mr, "Page cannot be persistent\n");
                return -EINVAL;
        }

        if (unlikely(mr->nbuf == mr->num_buf))
                return -ENOMEM;

        err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL));
        if (err)
                return err;

        mr->nbuf++;
        return 0;
}

int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl,
                  int sg_nents, unsigned int *sg_offset)
{
        struct rxe_mr *mr = to_rmr(ibmr);
        unsigned int page_size = mr_page_size(mr);

        mr->nbuf = 0;
        mr->page_shift = ilog2(page_size);
        mr->page_mask = ~((u64)page_size - 1);
        mr->page_offset = mr->ibmr.iova & (page_size - 1);

        return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
}

static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr,
                              unsigned int length, enum rxe_mr_copy_dir dir)
{
        unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova);
        unsigned long index = rxe_mr_iova_to_index(mr, iova);
        unsigned int bytes;
        struct page *page;
        void *va;

        while (length) {
                page = xa_load(&mr->page_list, index);
                if (!page)
                        return -EFAULT;

                bytes = min_t(unsigned int, length,
                              mr_page_size(mr) - page_offset);
                va = kmap_local_page(page);
                if (dir == RXE_FROM_MR_OBJ)
                        memcpy(addr, va + page_offset, bytes);
                else
                        memcpy(va + page_offset, addr, bytes);
                kunmap_local(va);

                page_offset = 0;
                addr += bytes;
                length -= bytes;
                index++;
        }

        return 0;
}
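
/*
 * For a DMA type MR the iova is treated as a kernel virtual address, so the
 * backing page is found with virt_to_page() rather than through page_list.
 * The copy still proceeds page by page so each page can be temporarily
 * mapped with kmap_local_page().
 */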

static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 iova, void *addr,
                            unsigned int length, enum rxe_mr_copy_dir dir)
{
        unsigned int page_offset = iova & (PAGE_SIZE - 1);
        unsigned int bytes;
        struct page *page;
        u8 *va;

        while (length) {
                page = virt_to_page(iova & mr->page_mask);
                bytes = min_t(unsigned int, length,
                              PAGE_SIZE - page_offset);
                va = kmap_local_page(page);

                if (dir == RXE_TO_MR_OBJ)
                        memcpy(va + page_offset, addr, bytes);
                else
                        memcpy(addr, va + page_offset, bytes);

                kunmap_local(va);
                page_offset = 0;
                iova += bytes;
                addr += bytes;
                length -= bytes;
        }
}

int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
                unsigned int length, enum rxe_mr_copy_dir dir)
{
        int err;

        if (length == 0)
                return 0;

        if (WARN_ON(!mr))
                return -EINVAL;

        if (mr->ibmr.type == IB_MR_TYPE_DMA) {
                rxe_mr_copy_dma(mr, iova, addr, length, dir);
                return 0;
        }

        err = mr_check_range(mr, iova, length);
        if (unlikely(err)) {
                rxe_dbg_mr(mr, "iova out of range");
                return err;
        }

        return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
}

/* copy data in or out of a wqe, i.e. sg list
 * under the control of a dma descriptor
 */
int copy_data(
        struct rxe_pd *pd,
        int access,
        struct rxe_dma_info *dma,
        void *addr,
        int length,
        enum rxe_mr_copy_dir dir)
{
        int bytes;
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;
        struct rxe_mr *mr = NULL;
        u64 iova;
        int err;

        if (length == 0)
                return 0;

        if (length > resid) {
                err = -EINVAL;
                goto err2;
        }

        if (sge->length && (offset < sge->length)) {
                mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
                if (!mr) {
                        err = -EINVAL;
                        goto err1;
                }
        }

        while (length > 0) {
                bytes = length;

                if (offset >= sge->length) {
                        if (mr) {
                                rxe_put(mr);
                                mr = NULL;
                        }
                        sge++;
                        dma->cur_sge++;
                        offset = 0;

                        if (dma->cur_sge >= dma->num_sge) {
                                err = -ENOSPC;
                                goto err2;
                        }

                        if (sge->length) {
                                mr = lookup_mr(pd, access, sge->lkey,
                                               RXE_LOOKUP_LOCAL);
                                if (!mr) {
                                        err = -EINVAL;
                                        goto err1;
                                }
                        } else {
                                continue;
                        }
                }

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                if (bytes > 0) {
                        iova = sge->addr + offset;
                        err = rxe_mr_copy(mr, iova, addr, bytes, dir);
                        if (err)
                                goto err2;

                        offset += bytes;
                        resid -= bytes;
                        length -= bytes;
                        addr += bytes;
                }
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        if (mr)
                rxe_put(mr);

        return 0;

err2:
        if (mr)
                rxe_put(mr);
err1:
        return err;
}

int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
{
        unsigned int page_offset;
        unsigned long index;
        struct page *page;
        unsigned int bytes;
        int err;
        u8 *va;

        /* mr must be valid even if length is zero */
        if (WARN_ON(!mr))
                return -EINVAL;

        if (length == 0)
                return 0;

        if (mr->ibmr.type == IB_MR_TYPE_DMA)
                return -EFAULT;

        err = mr_check_range(mr, iova, length);
        if (err)
                return err;

        while (length > 0) {
                index = rxe_mr_iova_to_index(mr, iova);
                page = xa_load(&mr->page_list, index);
                page_offset = rxe_mr_iova_to_page_offset(mr, iova);
                if (!page)
                        return -EFAULT;
                bytes = min_t(unsigned int, length,
                              mr_page_size(mr) - page_offset);

                va = kmap_local_page(page);
                arch_wb_cache_pmem(va + page_offset, bytes);
                kunmap_local(va);

                length -= bytes;
                iova += bytes;
                page_offset = 0;
        }

        return 0;
}
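
/*
 * The emulated atomic handlers below require the target iova to be 8 byte
 * aligned (checked with page_offset & 0x7) and treat the mapped page as an
 * array of u64, addressing the target slot as va[page_offset >> 3].
 */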

/* Guarantee atomicity of atomic operations at the machine level. */
static DEFINE_SPINLOCK(atomic_ops_lock);

int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
                        u64 compare, u64 swap_add, u64 *orig_val)
{
        unsigned int page_offset;
        struct page *page;
        u64 value;
        u64 *va;

        if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
                rxe_dbg_mr(mr, "mr not in valid state");
                return RESPST_ERR_RKEY_VIOLATION;
        }

        if (mr->ibmr.type == IB_MR_TYPE_DMA) {
                page_offset = iova & (PAGE_SIZE - 1);
                page = virt_to_page(iova & PAGE_MASK);
        } else {
                unsigned long index;
                int err;

                err = mr_check_range(mr, iova, sizeof(value));
                if (err) {
                        rxe_dbg_mr(mr, "iova out of range");
                        return RESPST_ERR_RKEY_VIOLATION;
                }
                page_offset = rxe_mr_iova_to_page_offset(mr, iova);
                index = rxe_mr_iova_to_index(mr, iova);
                page = xa_load(&mr->page_list, index);
                if (!page)
                        return RESPST_ERR_RKEY_VIOLATION;
        }

        if (unlikely(page_offset & 0x7)) {
                rxe_dbg_mr(mr, "iova not aligned");
                return RESPST_ERR_MISALIGNED_ATOMIC;
        }

        va = kmap_local_page(page);

        spin_lock_bh(&atomic_ops_lock);
        value = *orig_val = va[page_offset >> 3];

        if (opcode == IB_OPCODE_RC_COMPARE_SWAP) {
                if (value == compare)
                        va[page_offset >> 3] = swap_add;
        } else {
                value += swap_add;
                va[page_offset >> 3] = value;
        }
        spin_unlock_bh(&atomic_ops_lock);

        kunmap_local(va);

        return 0;
}

#if defined CONFIG_64BIT
/* only implemented or called for 64 bit architectures */
int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
        unsigned int page_offset;
        struct page *page;
        u64 *va;

        /* See IBA oA19-28 */
        if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
                rxe_dbg_mr(mr, "mr not in valid state");
                return RESPST_ERR_RKEY_VIOLATION;
        }

        if (mr->ibmr.type == IB_MR_TYPE_DMA) {
                page_offset = iova & (PAGE_SIZE - 1);
                page = virt_to_page(iova & PAGE_MASK);
        } else {
                unsigned long index;
                int err;

                /* See IBA oA19-28 */
                err = mr_check_range(mr, iova, sizeof(value));
                if (unlikely(err)) {
                        rxe_dbg_mr(mr, "iova out of range");
                        return RESPST_ERR_RKEY_VIOLATION;
                }
                page_offset = rxe_mr_iova_to_page_offset(mr, iova);
                index = rxe_mr_iova_to_index(mr, iova);
                page = xa_load(&mr->page_list, index);
                if (!page)
                        return RESPST_ERR_RKEY_VIOLATION;
        }

        /* See IBA A19.4.2 */
        if (unlikely(page_offset & 0x7)) {
                rxe_dbg_mr(mr, "misaligned address");
                return RESPST_ERR_MISALIGNED_ATOMIC;
        }

        va = kmap_local_page(page);

        /* Do atomic write after all prior operations have completed */
        smp_store_release(&va[page_offset >> 3], value);

        kunmap_local(va);

        return 0;
}
#else
int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
        return RESPST_ERR_UNSUPPORTED_OPCODE;
}
#endif

int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;

        while (length) {
                unsigned int bytes;

                if (offset >= sge->length) {
                        sge++;
                        dma->cur_sge++;
                        offset = 0;
                        if (dma->cur_sge >= dma->num_sge)
                                return -ENOSPC;
                }

                bytes = length;

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                offset += bytes;
                resid -= bytes;
                length -= bytes;
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        return 0;
}
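
/*
 * An lkey/rkey packs the MR pool index in the upper 24 bits and the 8 bit
 * variable key in the low byte (see rxe_mr_init()), so lookup_mr() recovers
 * the pool index with key >> 8 and then verifies the full key, PD, access
 * rights and MR state. Illustrative use, mirroring copy_data() above:
 *
 *      mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
 *      if (!mr)
 *              return -EINVAL;
 *      ...use mr...
 *      rxe_put(mr);    // drop the reference taken by the lookup
 */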

struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
                         enum rxe_mr_lookup_type type)
{
        struct rxe_mr *mr;
        struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
        int index = key >> 8;

        mr = rxe_pool_get_index(&rxe->mr_pool, index);
        if (!mr)
                return NULL;

        if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
                     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
                     mr_pd(mr) != pd || ((access & mr->access) != access) ||
                     mr->state != RXE_MR_STATE_VALID)) {
                rxe_put(mr);
                mr = NULL;
        }

        return mr;
}

int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
{
        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
        struct rxe_mr *mr;
        int ret;

        mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
        if (!mr) {
                rxe_dbg_qp(qp, "No MR for key %#x\n", key);
                ret = -EINVAL;
                goto err;
        }

        if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) {
                rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n",
                           key, (mr->rkey ? mr->rkey : mr->lkey));
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (atomic_read(&mr->num_mw) > 0) {
                rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n");
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) {
                rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        mr->state = RXE_MR_STATE_FREE;
        ret = 0;

err_drop_ref:
        rxe_put(mr);
err:
        return ret;
}

/* user can (re)register fast MR by executing a REG_MR WQE.
 * user is expected to hold a reference on the ib mr until the
 * WQE completes.
 * Once a fast MR is created this is the only way to change the
 * private keys. It is the responsibility of the user to maintain
 * the ib mr keys in sync with rxe mr keys.
 */
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
{
        struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
        u32 key = wqe->wr.wr.reg.key;
        u32 access = wqe->wr.wr.reg.access;

        /* user can only register MR in free state */
        if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
                rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey);
                return -EINVAL;
        }

        /* user can only register mr with qp in same protection domain */
        if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
                rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n");
                return -EINVAL;
        }

        /* user is only allowed to change key portion of l/rkey */
        if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) {
                rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n",
                           key, mr->lkey);
                return -EINVAL;
        }

        mr->access = access;
        mr->lkey = key;
        mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0;
        mr->ibmr.iova = wqe->wr.wr.reg.mr->iova;
        mr->state = RXE_MR_STATE_VALID;

        return 0;
}
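
/*
 * Deregistration is refused while memory windows are still bound to the MR
 * (num_mw > 0, see IBA 10.6.7.2.6). The remaining teardown (dropping the PD
 * reference, releasing the umem and destroying page_list) is handled by
 * rxe_mr_cleanup() below.
 */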

int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
        struct rxe_mr *mr = to_rmr(ibmr);

        /* See IBA 10.6.7.2.6 */
        if (atomic_read(&mr->num_mw) > 0)
                return -EINVAL;

        rxe_cleanup(mr);
        kfree_rcu(mr);
        return 0;
}

void rxe_mr_cleanup(struct rxe_pool_elem *elem)
{
        struct rxe_mr *mr = container_of(elem, typeof(*mr), elem);

        rxe_put(mr_pd(mr));
        ib_umem_release(mr->umem);

        if (mr->ibmr.type != IB_MR_TYPE_DMA)
                xa_destroy(&mr->page_list);
}