/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>

#include "mlx5_ib.h"
#include "cmd.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS	(30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT	(MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES	BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE	BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK	(~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT	MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;

static int check_parent(struct ib_umem_odp *odp,
			struct mlx5_ib_mr *parent)
{
	struct mlx5_ib_mr *mr = odp->private;

	return mr && mr->parent == parent && !odp->dying;
}

static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
{
	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
	struct ib_ucontext *ctx = odp->umem->context;
	struct rb_node *rb;

	down_read(&ctx->umem_rwsem);
	while (1) {
		rb = rb_next(&odp->interval_tree.rb);
		if (!rb)
			goto not_found;
		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
		if (check_parent(odp, parent))
			goto end;
	}
not_found:
	odp = NULL;
end:
	up_read(&ctx->umem_rwsem);
	return odp;
}

static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
				      u64 start, u64 length,
				      struct mlx5_ib_mr *parent)
{
	struct ib_umem_odp *odp;
	struct rb_node *rb;

	down_read(&ctx->umem_rwsem);
	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
	if (!odp)
		goto end;

	while (1) {
		if (check_parent(odp, parent))
			goto end;
		rb = rb_next(&odp->interval_tree.rb);
		if (!rb)
			goto not_found;
		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
		if (ib_umem_start(odp->umem) > start + length)
			goto not_found;
	}
not_found:
	odp = NULL;
end:
	up_read(&ctx->umem_rwsem);
	return odp;
}

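/*
 * Implicit (whole-address-space) ODP MRs are built as a two-level
 * structure: one top-level KSM mkey whose entries each cover
 * MLX5_IMR_MTT_SIZE bytes, plus per-leaf MTT mkeys created lazily on
 * first access.  With the common PAGE_SHIFT of 12 this works out to:
 *
 *	MLX5_IMR_MTT_BITS    = 30 - 12 = 18
 *	MLX5_IMR_MTT_ENTRIES = 1 << 18 = 262144 pages per leaf
 *	MLX5_IMR_MTT_SIZE    = 1 << 30 = 1 GiB per leaf
 *
 * (Illustrative numbers only; they scale with the architecture's
 * PAGE_SHIFT.)
 */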
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
{
	struct ib_pd *pd = mr->ibmr.pd;
	struct ib_ucontext *ctx = pd->uobject->context;
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	unsigned long va;
	int i;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (i = 0; i < nentries; i++, pklm++) {
			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm->key = cpu_to_be32(dev->null_mkey);
			pklm->va = 0;
		}
		return;
	}

	odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
			 nentries * MLX5_IMR_MTT_SIZE, mr);

	for (i = 0; i < nentries; i++, pklm++) {
		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		va = (offset + i) * MLX5_IMR_MTT_SIZE;
		if (odp && odp->umem->address == va) {
			struct mlx5_ib_mr *mtt = odp->private;

			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			odp = odp_next(odp);
		} else {
			pklm->key = cpu_to_be32(dev->null_mkey);
		}
		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
			    i, va, be32_to_cpu(pklm->key));
	}
}

static void mr_leaf_free_action(struct work_struct *work)
{
	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
	int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;

	mr->parent = NULL;
	synchronize_srcu(&mr->dev->mr_srcu);

	ib_umem_release(odp->umem);
	if (imr->live)
		mlx5_ib_update_xlt(imr, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mlx5_mr_cache_free(mr->dev, mr);

	if (atomic_dec_and_test(&imr->num_leaf_free))
		wake_up(&imr->q_leaf_free);
}

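/*
 * MMU notifier callback for ODP umems.  The flow is: (1) zap the HW
 * MTTs covering the invalidated range so the device stops translating
 * through them, (2) unmap and unpin the DMA pages, and (3) if this
 * emptied a leaf of an implicit MR, mark it dying and schedule it for
 * destruction.  (Summary added for readability; the code below is
 * authoritative.)
 */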
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	int in_block = 0;
	u64 addr;

	if (!umem || !umem->odp_data) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}

	mr = umem->odp_data->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem), start);
	end = min_t(u64, ib_umem_end(umem), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */

	for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem->odp_data->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);
	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem, start, end);

	if (unlikely(!umem->npages && mr->parent &&
		     !umem->odp_data->dying)) {
		WRITE_ONCE(umem->odp_data->dying, 1);
		atomic_inc(&mr->parent->num_leaf_free);
		schedule_work(&umem->odp_data->work);
	}
}

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;

	return;
}

static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	int ret = mlx5_core_page_fault_resume(dev->mdev,
					      pfault->token,
					      wq_num,
					      pfault->type,
					      error);
	if (ret)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
			    wq_num);
}

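/*
 * Allocate an mkey for an implicit MR from the MR cache: either the
 * top-level, KSM-based indirect mkey (ksm == true) or a leaf MTT mkey
 * covering one MLX5_IMR_MTT_SIZE region.  In both cases the
 * translation table starts out zapped and merely enabled; actual
 * pages are filled in later by the page fault path.  (Descriptive
 * comment; behavior follows from the code below.)
 */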
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
					    struct ib_umem *umem,
					    bool ksm, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	int err;

	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
					    MLX5_IMR_MTT_CACHE_ENTRY);

	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;

	mr->dev = dev;
	mr->access_flags = access_flags;
	mr->mmkey.iova = 0;
	mr->umem = umem;

	if (ksm) {
		err = mlx5_ib_update_xlt(mr, 0,
					 mlx5_imr_ksm_entries,
					 MLX5_KSM_PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE);

	} else {
		err = mlx5_ib_update_xlt(mr, 0,
					 MLX5_IMR_MTT_ENTRIES,
					 PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE |
					 MLX5_IB_UPD_XLT_ATOMIC);
	}

	if (err)
		goto fail;

	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	mr->live = 1;

	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
		    mr->mmkey.key, dev->mdev, mr);

	return mr;

fail:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, mr);

	return ERR_PTR(err);
}

static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
						u64 io_virt, size_t bcnt)
{
	struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
	struct ib_umem_odp *odp, *result = NULL;
	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
	int nentries = 0, start_idx = 0, ret;
	struct mlx5_ib_mr *mtt;
	struct ib_umem *umem;

	mutex_lock(&mr->umem->odp_data->umem_mutex);
	odp = odp_lookup(ctx, addr, 1, mr);

	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
		    io_virt, bcnt, addr, odp);

next_mr:
	if (likely(odp)) {
		if (nentries)
			nentries++;
	} else {
		umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
		if (IS_ERR(umem)) {
			mutex_unlock(&mr->umem->odp_data->umem_mutex);
			return ERR_CAST(umem);
		}

		mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
		if (IS_ERR(mtt)) {
			mutex_unlock(&mr->umem->odp_data->umem_mutex);
			ib_umem_release(umem);
			return ERR_CAST(mtt);
		}

		odp = umem->odp_data;
		odp->private = mtt;
		mtt->umem = umem;
		mtt->mmkey.iova = addr;
		mtt->parent = mr;
		INIT_WORK(&odp->work, mr_leaf_free_action);

		if (!nentries)
			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
		nentries++;
	}

	/* Return first odp if region not covered by single one */
	if (likely(!result))
		result = odp;

	addr += MLX5_IMR_MTT_SIZE;
	if (unlikely(addr < io_virt + bcnt)) {
		odp = odp_next(odp);
		if (odp && odp->umem->address != addr)
			odp = NULL;
		goto next_mr;
	}

	if (unlikely(nentries)) {
		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ATOMIC);
		if (ret) {
			mlx5_ib_err(dev, "Failed to update PAS\n");
			result = ERR_PTR(ret);
		}
	}

	mutex_unlock(&mr->umem->odp_data->umem_mutex);
	return result;
}

struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     int access_flags)
{
	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
	struct mlx5_ib_mr *imr;
	struct ib_umem *umem;

	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
	if (IS_ERR(umem))
		return ERR_CAST(umem);

	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
	if (IS_ERR(imr)) {
		ib_umem_release(umem);
		return ERR_CAST(imr);
	}

	imr->umem = umem;
	init_waitqueue_head(&imr->q_leaf_free);
	atomic_set(&imr->num_leaf_free, 0);

	return imr;
}

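/*
 * Teardown path for an implicit MR: every leaf umem found in the
 * context's interval tree is unmapped, marked dying and queued for
 * mr_leaf_free_action(); mlx5_ib_free_implicit_mr() then waits until
 * num_leaf_free drops back to zero.  (Summary of the two functions
 * below.)
 */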
static int mr_leaf_free(struct ib_umem *umem, u64 start,
			u64 end, void *cookie)
{
	struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;

	if (mr->parent != imr)
		return 0;

	ib_umem_odp_unmap_dma_pages(umem,
				    ib_umem_start(umem),
				    ib_umem_end(umem));

	if (umem->odp_data->dying)
		return 0;

	WRITE_ONCE(umem->odp_data->dying, 1);
	atomic_inc(&imr->num_leaf_free);
	schedule_work(&umem->odp_data->work);

	return 0;
}

void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
	struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;

	down_read(&ctx->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
				      mr_leaf_free, imr);
	up_read(&ctx->umem_rwsem);

	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
}

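/*
 * Fault in pages for a single MR (or implicit-MR leaf): snapshot the
 * mmu notifier sequence number, pin and DMA-map the pages via
 * ib_umem_odp_map_dma_pages(), then, under umem_mutex, push the
 * translation to the HW only if no invalidation ran in between
 * (ib_umem_mmu_notifier_retry).  -EAGAIN tells the caller to wait for
 * the notifier and retry.  (Descriptive summary of the function
 * below.)
 */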
static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			u64 io_virt, size_t bcnt, u32 *bytes_mapped)
{
	u64 access_mask = ODP_READ_ALLOWED_BIT;
	int npages = 0, page_shift, np;
	u64 start_idx, page_mask;
	struct ib_umem_odp *odp;
	int current_seq;
	size_t size;
	int ret;

	if (!mr->umem->odp_data->page_list) {
		odp = implicit_mr_get_data(mr, io_virt, bcnt);

		if (IS_ERR(odp))
			return PTR_ERR(odp);
		mr = odp->private;

	} else {
		odp = mr->umem->odp_data;
	}

next_mr:
	size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);

	page_shift = mr->umem->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;

	if (mr->umem->writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	current_seq = READ_ONCE(odp->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
					access_mask, current_seq);

	if (ret < 0)
		goto out;

	np = ret;

	mutex_lock(&odp->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
		/*
		 * No need to check whether the MTTs really belong to
		 * this MR, since ib_umem_odp_map_dma_pages already
		 * checks this.
		 */
		ret = mlx5_ib_update_xlt(mr, start_idx, np,
					 page_shift, MLX5_IB_UPD_XLT_ATOMIC);
	} else {
		ret = -EAGAIN;
	}
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(dev, "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(io_virt - round_down(io_virt, 1 << page_shift));
		*bytes_mapped += min_t(u32, new_mappings, size);
	}

	npages += np << (page_shift - PAGE_SHIFT);
	bcnt -= size;

	if (unlikely(bcnt)) {
		struct ib_umem_odp *next;

		io_virt += size;
		next = odp_next(odp);
		if (unlikely(!next || next->umem->address != io_virt)) {
			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
				    io_virt, next);
			return -EAGAIN;
		}
		odp = next;
		mr = odp->private;
		goto next_mr;
	}

	return npages;

out:
	if (ret == -EAGAIN) {
		if (mr->parent || !odp->dying) {
			unsigned long timeout =
				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

			if (!wait_for_completion_timeout(
					&odp->notifier_completion,
					timeout)) {
				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
					     current_seq, odp->notifiers_seq);
			}
		} else {
			/* The MR is being killed, kill the QP as well. */
			ret = -EFAULT;
		}
	}

	return ret;
}

struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 u32 key, u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mw *mw;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;

	srcu_key = srcu_read_lock(&dev->mr_srcu);

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

next_mr:
	mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
	if (!mmkey || mmkey->key != key) {
		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		if (!mr->live || !mr->ibmr.pd) {
			mlx5_ib_dbg(dev, "got dead MR\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
		if (ret < 0)
			goto srcu_unlock;

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);

		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (mw->ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
		if (ret)
			goto srcu_unlock;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		goto next_mr;
	}

srcu_unlock:
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	srcu_read_unlock(&dev->mr_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}

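/*
 * WQE parsing stage: pagefault_data_segments() below walks the
 * scatter/gather list of a faulting WQE.  Each mlx5_wqe_data_seg
 * carries a byte_count (possibly tagged with MLX5_INLINE_SEG for
 * inline data), an lkey and a 64-bit address; a zero-length
 * non-inline segment is treated as 2 GiB.  (Descriptive note; field
 * usage as in the code below.)
 */
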
/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *		 map. This allows the caller to decide intelligently whether
 *		 enough memory was mapped to resolve the page fault
 *		 successfully (e.g. enough for the next MTU, or the entire
 *		 WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *		    the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   struct mlx5_ib_qp *qp, void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, int receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	/* Skip SRQ next-WQE segment. */
	if (receive_queue && qp->ibqp.srq)
		wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

static const u32 mlx5_ib_odp_opcode_cap[] = {
	[MLX5_OPCODE_SEND]		= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_IMM]		= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_INVAL]	= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_RDMA_WRITE]	= IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_WRITE_IMM]	= IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_READ]		= IB_ODP_SUPPORT_READ,
	[MLX5_OPCODE_ATOMIC_CS]		= IB_ODP_SUPPORT_ATOMIC,
	[MLX5_OPCODE_ATOMIC_FA]		= IB_ODP_SUPPORT_ATOMIC,
};

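/*
 * The table above maps a WQE opcode to the ODP capability bit it
 * requires; e.g. an RDMA write posted on an RC QP is only handled if
 * rc_odp_caps advertises IB_ODP_SUPPORT_WRITE.  The initiator parser
 * below extracts the opcode from ctrl->opmod_idx_opcode and performs
 * exactly this check.  (Descriptive comment only.)
 */
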
/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	u32 transport_caps;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qpn,
			    ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		MLX5_WQE_CTRL_QPN_SHIFT;
	if (qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qpn,
			    ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
		break;
	case IB_QPT_UD:
		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
		break;
	default:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
		     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
			    opcode);
		return -EFAULT;
	}

	if (qp->ibqp.qp_type != IB_QPT_RC) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

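/*
 * Note on the sizing in the handler above: the DS field of the control
 * segment counts 16-byte units, so wqe_end is computed as
 * wqe + ds * MLX5_WQE_DS_UNITS; a WQE reporting ds == 4, for example,
 * spans 64 bytes.  (Worked example, assuming MLX5_WQE_DS_UNITS == 16
 * as defined in the mlx5_ib headers this file builds against.)
 */
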
/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->ibqp.srq) {
		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
		return -EFAULT;
	}

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;

	return 0;
}

static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
					      u32 wq_num)
{
	struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);

	if (!mqp) {
		mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
		return NULL;
	}

	return to_mibqp(mqp);
}

static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	int ret;
	void *wqe, *wqe_end;
	u32 bytes_mapped, total_wqe_bytes;
	char *buffer = NULL;
	int resume_with_error = 1;
	u16 wqe_index = pfault->wqe.wqe_index;
	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
	struct mlx5_ib_qp *qp;

	buffer = (char *)__get_free_page(GFP_KERNEL);
	if (!buffer) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
	if (!qp)
		goto resolve_page_fault;

	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
				    PAGE_SIZE, &qp->trans_qp.base);
	if (ret < 0) {
		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
			    ret, wqe_index, pfault->token);
		goto resolve_page_fault;
	}

	wqe = buffer;
	if (requestor)
		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	else
		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	if (ret < 0)
		goto resolve_page_fault;

	if (wqe >= wqe_end) {
		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
		goto resolve_page_fault;
	}

	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
				      &bytes_mapped, &total_wqe_bytes,
				      !requestor);
	if (ret == -EAGAIN) {
		resume_with_error = 0;
		goto resolve_page_fault;
	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
		goto resolve_page_fault;
	}

	resume_with_error = 0;
resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	free_page((unsigned long)buffer);
}

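/*
 * pages_in_range() below counts the OS pages touched by [address,
 * address + length): e.g. with 4 KiB pages, address 0x1ff8 and length
 * 0x10 cross a page boundary and yield 2.  (Illustrative numbers
 * assuming PAGE_SIZE == 4096.)
 */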
static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length  = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}

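/*
 * Entry point from the mlx5 core driver: a page fault event carries an
 * event_subtype that selects either the WQE handler (fault while
 * executing a local work request) or the RDMA handler (fault on a
 * remotely initiated access); anything else is resumed with an error.
 * (Summary of the dispatcher below.)
 */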
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
		    struct mlx5_pagefault *pfault)
{
	struct mlx5_ib_dev *dev = context;
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}

void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
		return;

	switch (ent->order - 2) {
	case MLX5_IMR_MTT_CACHE_ENTRY:
		ent->page = PAGE_SHIFT;
		ent->xlt = MLX5_IMR_MTT_ENTRIES *
			   sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		ent->limit = 0;
		break;

	case MLX5_IMR_KSM_CACHE_ENTRY:
		ent->page = MLX5_KSM_PAGE_SHIFT;
		ent->xlt = mlx5_imr_ksm_entries *
			   sizeof(struct mlx5_klm) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
		ent->limit = 0;
		break;
	}
}

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	int ret;

	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
			return ret;
		}
	}

	return 0;
}

int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}