// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include "rxe.h"
#include "rxe_loc.h"

/* Return a random 8 bit key value that is
 * different from last_key. Set last_key to -1
 * if this is the first key for an MR or MW.
 */
u8 rxe_get_next_key(u32 last_key)
{
        u8 key;

        do {
                get_random_bytes(&key, 1);
        } while (key == last_key);

        return key;
}

int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
{
        struct rxe_map_set *set = mr->cur_map_set;

        switch (mr->type) {
        case IB_MR_TYPE_DMA:
                return 0;

        case IB_MR_TYPE_USER:
        case IB_MR_TYPE_MEM_REG:
                if (iova < set->iova || length > set->length ||
                    iova > set->iova + set->length - length)
                        return -EFAULT;
                return 0;

        default:
                pr_warn("%s: mr type (%d) not supported\n",
                        __func__, mr->type);
                return -EFAULT;
        }
}

#define IB_ACCESS_REMOTE        (IB_ACCESS_REMOTE_READ          \
                                 | IB_ACCESS_REMOTE_WRITE       \
                                 | IB_ACCESS_REMOTE_ATOMIC)

static void rxe_mr_init(int access, struct rxe_mr *mr)
{
        u32 lkey = mr->pelem.index << 8 | rxe_get_next_key(-1);
        u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;

        /* Set ibmr->l/rkey and also copy into the private l/rkey.
         * For user MRs these will always be the same; for cases where
         * the caller 'owns' the key portion they may differ until the
         * REG_MR WQE is executed.
         */
        mr->lkey = mr->ibmr.lkey = lkey;
        mr->rkey = mr->ibmr.rkey = rkey;

        mr->state = RXE_MR_STATE_INVALID;
        mr->map_shift = ilog2(RXE_BUF_PER_MAP);
}

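/*
 * Worked example (illustrative only, not driver code): rxe_mr_init()
 * above builds the 32 bit keys from the pool index and a random low
 * byte.  Assuming a hypothetical pool index of 0x000123 and a random
 * key byte of 0xab:
 *
 *      lkey = 0x000123 << 8 | 0xab = 0x000123ab
 *      rkey = 0x000123ab if any IB_ACCESS_REMOTE_* flag was requested,
 *             otherwise 0
 */
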
static void rxe_mr_free_map_set(int num_map, struct rxe_map_set *set)
{
        int i;

        for (i = 0; i < num_map; i++)
                kfree(set->map[i]);

        kfree(set->map);
        kfree(set);
}

static int rxe_mr_alloc_map_set(int num_map, struct rxe_map_set **setp)
{
        int i;
        struct rxe_map_set *set;

        set = kmalloc(sizeof(*set), GFP_KERNEL);
        if (!set)
                goto err_out;

        set->map = kmalloc_array(num_map, sizeof(struct rxe_map *), GFP_KERNEL);
        if (!set->map)
                goto err_free_set;

        for (i = 0; i < num_map; i++) {
                set->map[i] = kmalloc(sizeof(struct rxe_map), GFP_KERNEL);
                if (!set->map[i])
                        goto err_free_map;
        }

        *setp = set;

        return 0;

err_free_map:
        for (i--; i >= 0; i--)
                kfree(set->map[i]);

        kfree(set->map);
err_free_set:
        kfree(set);
err_out:
        return -ENOMEM;
}

/**
 * rxe_mr_alloc() - Allocate memory map array(s) for MR
 * @mr: Memory region
 * @num_buf: Number of buffer descriptors to support
 * @both: If non-zero, allocate both mr->cur_map_set and mr->next_map_set;
 *        otherwise allocate only mr->cur_map_set. Used for fast MRs
 *
 * Return: 0 on success, else an error
 */
static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf, int both)
{
        int ret;
        int num_map;

        BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP));
        num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;

        mr->map_shift = ilog2(RXE_BUF_PER_MAP);
        mr->map_mask = RXE_BUF_PER_MAP - 1;
        mr->num_buf = num_buf;
        mr->max_buf = num_map * RXE_BUF_PER_MAP;
        mr->num_map = num_map;

        ret = rxe_mr_alloc_map_set(num_map, &mr->cur_map_set);
        if (ret)
                goto err_out;

        if (both) {
                ret = rxe_mr_alloc_map_set(num_map, &mr->next_map_set);
                if (ret) {
                        rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);
                        goto err_out;
                }
        }

        return 0;

err_out:
        return -ENOMEM;
}

void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
{
        rxe_mr_init(access, mr);

        mr->ibmr.pd = &pd->ibpd;
        mr->access = access;
        mr->state = RXE_MR_STATE_VALID;
        mr->type = IB_MR_TYPE_DMA;
}

int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
                     int access, struct rxe_mr *mr)
{
        struct rxe_map_set *set;
        struct rxe_map **map;
        struct rxe_phys_buf *buf = NULL;
        struct ib_umem *umem;
        struct sg_page_iter sg_iter;
        int num_buf;
        void *vaddr;
        int err;

        umem = ib_umem_get(pd->ibpd.device, start, length, access);
        if (IS_ERR(umem)) {
                pr_warn("%s: Unable to pin memory region err = %d\n",
                        __func__, (int)PTR_ERR(umem));
                err = PTR_ERR(umem);
                goto err_out;
        }

        num_buf = ib_umem_num_pages(umem);

        rxe_mr_init(access, mr);

        err = rxe_mr_alloc(mr, num_buf, 0);
        if (err) {
                pr_warn("%s: Unable to allocate memory for map\n",
                        __func__);
                goto err_release_umem;
        }

        set = mr->cur_map_set;
        set->page_shift = PAGE_SHIFT;
        set->page_mask = PAGE_SIZE - 1;

        num_buf = 0;
        map = set->map;

        if (length > 0) {
                buf = map[0]->buf;

                for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
                        if (num_buf >= RXE_BUF_PER_MAP) {
                                map++;
                                buf = map[0]->buf;
                                num_buf = 0;
                        }

                        vaddr = page_address(sg_page_iter_page(&sg_iter));
                        if (!vaddr) {
                                pr_warn("%s: Unable to get virtual address\n",
                                        __func__);
                                err = -ENOMEM;
                                goto err_cleanup_map;
                        }

                        buf->addr = (uintptr_t)vaddr;
                        buf->size = PAGE_SIZE;
                        num_buf++;
                        buf++;
                }
        }

        mr->ibmr.pd = &pd->ibpd;
        mr->umem = umem;
        mr->access = access;
        mr->state = RXE_MR_STATE_VALID;
        mr->type = IB_MR_TYPE_USER;

        set->length = length;
        set->iova = iova;
        set->va = start;
        set->offset = ib_umem_offset(umem);

        return 0;

err_cleanup_map:
        rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);
err_release_umem:
        ib_umem_release(umem);
err_out:
        return err;
}

int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
{
        int err;

        /* always allow remote access for FMRs */
        rxe_mr_init(IB_ACCESS_REMOTE, mr);

        err = rxe_mr_alloc(mr, max_pages, 1);
        if (err)
                goto err1;

        mr->ibmr.pd = &pd->ibpd;
        mr->max_buf = max_pages;
        mr->state = RXE_MR_STATE_FREE;
        mr->type = IB_MR_TYPE_MEM_REG;

        return 0;

err1:
        return err;
}

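/*
 * Worked example (illustrative only): for a user MR built from
 * PAGE_SIZE buffers, lookup_iova() below reduces an iova to a
 * (map, buf, offset) triple using the shifts and masks set up by
 * rxe_mr_alloc() and rxe_mr_init_user():
 *
 *      offset      = iova - set->iova + set->offset;
 *      *offset_out = offset & (PAGE_SIZE - 1);          byte within the page
 *      page        = offset >> PAGE_SHIFT;              page number in the MR
 *      *n_out      = page & (RXE_BUF_PER_MAP - 1);      buf within the map
 *      *m_out      = page >> ilog2(RXE_BUF_PER_MAP);    which map block
 */
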
static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
                        size_t *offset_out)
{
        struct rxe_map_set *set = mr->cur_map_set;
        size_t offset = iova - set->iova + set->offset;
        int map_index;
        int buf_index;
        u64 length;
        struct rxe_map *map;

        if (likely(set->page_shift)) {
                *offset_out = offset & set->page_mask;
                offset >>= set->page_shift;
                *n_out = offset & mr->map_mask;
                *m_out = offset >> mr->map_shift;
        } else {
                map_index = 0;
                buf_index = 0;

                map = set->map[map_index];
                length = map->buf[buf_index].size;

                while (offset >= length) {
                        offset -= length;
                        buf_index++;

                        if (buf_index == RXE_BUF_PER_MAP) {
                                map_index++;
                                buf_index = 0;
                        }
                        map = set->map[map_index];
                        length = map->buf[buf_index].size;
                }

                *m_out = map_index;
                *n_out = buf_index;
                *offset_out = offset;
        }
}

void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
{
        size_t offset;
        int m, n;
        void *addr;

        if (mr->state != RXE_MR_STATE_VALID) {
                pr_warn("mr not in valid state\n");
                addr = NULL;
                goto out;
        }

        if (!mr->cur_map_set) {
                addr = (void *)(uintptr_t)iova;
                goto out;
        }

        if (mr_check_range(mr, iova, length)) {
                pr_warn("range violation\n");
                addr = NULL;
                goto out;
        }

        lookup_iova(mr, iova, &m, &n, &offset);

        if (offset + length > mr->cur_map_set->map[m]->buf[n].size) {
                pr_warn("crosses page boundary\n");
                addr = NULL;
                goto out;
        }

        addr = (void *)(uintptr_t)mr->cur_map_set->map[m]->buf[n].addr + offset;

out:
        return addr;
}

/* copy data from a range (vaddr, vaddr+length-1) to or from
 * a mr object starting at iova.
 */
int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
                enum rxe_mr_copy_dir dir)
{
        int err;
        int bytes;
        u8 *va;
        struct rxe_map **map;
        struct rxe_phys_buf *buf;
        int m;
        int i;
        size_t offset;

        if (length == 0)
                return 0;

        if (mr->type == IB_MR_TYPE_DMA) {
                u8 *src, *dest;

                src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova);

                dest = (dir == RXE_TO_MR_OBJ) ? ((void *)(uintptr_t)iova) : addr;

                memcpy(dest, src, length);

                return 0;
        }

        WARN_ON_ONCE(!mr->cur_map_set);

        err = mr_check_range(mr, iova, length);
        if (err) {
                err = -EFAULT;
                goto err1;
        }

        lookup_iova(mr, iova, &m, &i, &offset);

        map = mr->cur_map_set->map + m;
        buf = map[0]->buf + i;

        while (length > 0) {
                u8 *src, *dest;

                va = (u8 *)(uintptr_t)buf->addr + offset;
                src = (dir == RXE_TO_MR_OBJ) ? addr : va;
                dest = (dir == RXE_TO_MR_OBJ) ? va : addr;

                bytes = buf->size - offset;

                if (bytes > length)
                        bytes = length;

                memcpy(dest, src, bytes);

                length -= bytes;
                addr += bytes;

                offset = 0;
                buf++;
                i++;

                if (i == RXE_BUF_PER_MAP) {
                        i = 0;
                        map++;
                        buf = map[0]->buf;
                }
        }

        return 0;

err1:
        return err;
}

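/*
 * Usage sketch (illustrative only, not driver code): a caller that has
 * already looked up and referenced an MR could pull 'len' bytes at
 * 'iova' out of the region into a kernel buffer roughly as follows,
 * assuming the complementary RXE_FROM_MR_OBJ value that sits alongside
 * RXE_TO_MR_OBJ in enum rxe_mr_copy_dir:
 *
 *      err = rxe_mr_copy(mr, iova, buf, len, RXE_FROM_MR_OBJ);
 *      if (err)
 *              return err;     (iova/len outside the registered range)
 *
 * RXE_TO_MR_OBJ copies in the opposite direction, from the buffer into
 * the memory region.
 */
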
/* copy data in or out of a wqe, i.e. sg list
 * under the control of a dma descriptor
 */
int copy_data(
        struct rxe_pd *pd,
        int access,
        struct rxe_dma_info *dma,
        void *addr,
        int length,
        enum rxe_mr_copy_dir dir)
{
        int bytes;
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;
        struct rxe_mr *mr = NULL;
        u64 iova;
        int err;

        if (length == 0)
                return 0;

        if (length > resid) {
                err = -EINVAL;
                goto err2;
        }

        if (sge->length && (offset < sge->length)) {
                mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
                if (!mr) {
                        err = -EINVAL;
                        goto err1;
                }
        }

        while (length > 0) {
                bytes = length;

                if (offset >= sge->length) {
                        if (mr) {
                                rxe_drop_ref(mr);
                                mr = NULL;
                        }
                        sge++;
                        dma->cur_sge++;
                        offset = 0;

                        if (dma->cur_sge >= dma->num_sge) {
                                err = -ENOSPC;
                                goto err2;
                        }

                        if (sge->length) {
                                mr = lookup_mr(pd, access, sge->lkey,
                                               RXE_LOOKUP_LOCAL);
                                if (!mr) {
                                        err = -EINVAL;
                                        goto err1;
                                }
                        } else {
                                continue;
                        }
                }

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                if (bytes > 0) {
                        iova = sge->addr + offset;

                        err = rxe_mr_copy(mr, iova, addr, bytes, dir);
                        if (err)
                                goto err2;

                        offset += bytes;
                        resid -= bytes;
                        length -= bytes;
                        addr += bytes;
                }
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        if (mr)
                rxe_drop_ref(mr);

        return 0;

err2:
        if (mr)
                rxe_drop_ref(mr);
err1:
        return err;
}

int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
        struct rxe_sge *sge = &dma->sge[dma->cur_sge];
        int offset = dma->sge_offset;
        int resid = dma->resid;

        while (length) {
                unsigned int bytes;

                if (offset >= sge->length) {
                        sge++;
                        dma->cur_sge++;
                        offset = 0;
                        if (dma->cur_sge >= dma->num_sge)
                                return -ENOSPC;
                }

                bytes = length;

                if (bytes > sge->length - offset)
                        bytes = sge->length - offset;

                offset += bytes;
                resid -= bytes;
                length -= bytes;
        }

        dma->sge_offset = offset;
        dma->resid = resid;

        return 0;
}

/* (1) find the mr corresponding to lkey/rkey
 *     depending on lookup_type
 * (2) verify that the (qp) pd matches the mr pd
 * (3) verify that the mr can support the requested access
 * (4) verify that mr state is valid
 */
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
                         enum rxe_mr_lookup_type type)
{
        struct rxe_mr *mr;
        struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
        int index = key >> 8;

        mr = rxe_pool_get_index(&rxe->mr_pool, index);
        if (!mr)
                return NULL;

        if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
                     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
                     mr_pd(mr) != pd || (access && !(access & mr->access)) ||
                     mr->state != RXE_MR_STATE_VALID)) {
                rxe_drop_ref(mr);
                mr = NULL;
        }

        return mr;
}

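/*
 * Usage sketch (illustrative only): a successful lookup_mr() returns
 * with a reference held on the MR, so callers pair it with
 * rxe_drop_ref(), as copy_data() above does for each SGE it walks:
 *
 *      mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE, sge->lkey,
 *                     RXE_LOOKUP_LOCAL);
 *      if (!mr)
 *              return -EINVAL;
 *      ...use mr, e.g. rxe_mr_copy()...
 *      rxe_drop_ref(mr);
 */
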
int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey)
{
        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
        struct rxe_mr *mr;
        int ret;

        mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
        if (!mr) {
                pr_err("%s: No MR for rkey %#x\n", __func__, rkey);
                ret = -EINVAL;
                goto err;
        }

        if (rkey != mr->rkey) {
                pr_err("%s: rkey (%#x) doesn't match mr->rkey (%#x)\n",
                        __func__, rkey, mr->rkey);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (atomic_read(&mr->num_mw) > 0) {
                pr_warn("%s: Attempt to invalidate an MR while bound to MWs\n",
                        __func__);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        if (unlikely(mr->type != IB_MR_TYPE_MEM_REG)) {
                pr_warn("%s: mr->type (%d) is wrong type\n", __func__, mr->type);
                ret = -EINVAL;
                goto err_drop_ref;
        }

        mr->state = RXE_MR_STATE_FREE;
        ret = 0;

err_drop_ref:
        rxe_drop_ref(mr);
err:
        return ret;
}

/* A user can (re)register a fast MR by executing a REG_MR WQE.
 * The user is expected to hold a reference on the ib mr until the
 * WQE completes.
 * Once a fast MR is created this is the only way to change the
 * private keys. It is the user's responsibility to keep the
 * ib mr keys in sync with the rxe mr keys.
 */
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
{
        struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
        u32 key = wqe->wr.wr.reg.key & 0xff;
        u32 access = wqe->wr.wr.reg.access;
        struct rxe_map_set *set;

        /* user can only register MR in free state */
        if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
                pr_warn("%s: mr->lkey = 0x%x not free\n",
                        __func__, mr->lkey);
                return -EINVAL;
        }

        /* user can only register mr with qp in same protection domain */
        if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
                pr_warn("%s: qp->pd and mr->pd don't match\n",
                        __func__);
                return -EINVAL;
        }

        mr->access = access;
        mr->lkey = (mr->lkey & ~0xff) | key;
        mr->rkey = (access & IB_ACCESS_REMOTE) ? mr->lkey : 0;
        mr->state = RXE_MR_STATE_VALID;

        set = mr->cur_map_set;
        mr->cur_map_set = mr->next_map_set;
        mr->cur_map_set->iova = wqe->wr.wr.reg.mr->iova;
        mr->next_map_set = set;

        return 0;
}

int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr)
{
        struct rxe_mr *mr = to_rmr(ibmr);
        struct rxe_map_set *set = mr->next_map_set;
        struct rxe_map *map;
        struct rxe_phys_buf *buf;

        if (unlikely(set->nbuf == mr->num_buf))
                return -ENOMEM;

        map = set->map[set->nbuf / RXE_BUF_PER_MAP];
        buf = &map->buf[set->nbuf % RXE_BUF_PER_MAP];

        buf->addr = addr;
        buf->size = ibmr->page_size;
        set->nbuf++;

        return 0;
}

int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
        struct rxe_mr *mr = to_rmr(ibmr);

        if (atomic_read(&mr->num_mw) > 0) {
                pr_warn("%s: Attempt to deregister an MR while bound to MWs\n",
                        __func__);
                return -EINVAL;
        }

        mr->state = RXE_MR_STATE_INVALID;
        rxe_drop_ref(mr_pd(mr));
        rxe_drop_index(mr);
        rxe_drop_ref(mr);

        return 0;
}

void rxe_mr_cleanup(struct rxe_pool_entry *arg)
{
        struct rxe_mr *mr = container_of(arg, typeof(*mr), pelem);

        ib_umem_release(mr->umem);

        if (mr->cur_map_set)
                rxe_mr_free_map_set(mr->num_map, mr->cur_map_set);

        if (mr->next_map_set)
                rxe_mr_free_map_set(mr->num_map, mr->next_map_set);
}