/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "mlx5_ib.h"
#include "cmd.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS	(30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT	(MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES	BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE	BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK	(~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT	MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;

static int check_parent(struct ib_umem_odp *odp,
			struct mlx5_ib_mr *parent)
{
	struct mlx5_ib_mr *mr = odp->private;

	return mr && mr->parent == parent && !odp->dying;
}

static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
{
	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
	struct ib_ucontext *ctx = odp->umem->context;
	struct rb_node *rb;

	down_read(&ctx->umem_rwsem);
	while (1) {
		rb = rb_next(&odp->interval_tree.rb);
		if (!rb)
			goto not_found;
		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
		if (check_parent(odp, parent))
			goto end;
	}
not_found:
	odp = NULL;
end:
	up_read(&ctx->umem_rwsem);
	return odp;
}

static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
				      u64 start, u64 length,
				      struct mlx5_ib_mr *parent)
{
	struct ib_umem_odp *odp;
	struct rb_node *rb;

	down_read(&ctx->umem_rwsem);
	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
	if (!odp)
		goto end;

	while (1) {
		if (check_parent(odp, parent))
			goto end;
		rb = rb_next(&odp->interval_tree.rb);
		if (!rb)
			goto not_found;
		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
		if (ib_umem_start(odp->umem) > start + length)
			goto not_found;
	}
not_found:
	odp = NULL;
end:
	up_read(&ctx->umem_rwsem);
	return odp;
}
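
/*
 * Fill a range of KLM entries of an implicit MR. Ranges that already have a
 * child MTT MR get that child's key; ranges with no child point at the
 * device's null mkey, so that accessing them raises a page fault.
 */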
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
{
	struct ib_pd *pd = mr->ibmr.pd;
	struct ib_ucontext *ctx = pd->uobject->context;
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	unsigned long va;
	int i;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (i = 0; i < nentries; i++, pklm++) {
			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm->key = cpu_to_be32(dev->null_mkey);
			pklm->va = 0;
		}
		return;
	}

	odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
			 nentries * MLX5_IMR_MTT_SIZE, mr);

	for (i = 0; i < nentries; i++, pklm++) {
		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		va = (offset + i) * MLX5_IMR_MTT_SIZE;
		if (odp && odp->umem->address == va) {
			struct mlx5_ib_mr *mtt = odp->private;

			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			odp = odp_next(odp);
		} else {
			pklm->key = cpu_to_be32(dev->null_mkey);
		}
		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
			    i, va, be32_to_cpu(pklm->key));
	}
}

static void mr_leaf_free_action(struct work_struct *work)
{
	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
	int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;

	mr->parent = NULL;
	synchronize_srcu(&mr->dev->mr_srcu);

	ib_umem_release(odp->umem);
	if (imr->live)
		mlx5_ib_update_xlt(imr, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mlx5_mr_cache_free(mr->dev, mr);

	if (atomic_dec_and_test(&imr->num_leaf_free))
		wake_up(&imr->q_leaf_free);
}

void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	int in_block = 0;
	u64 addr;

	if (!umem || !umem->odp_data) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}

	mr = umem->odp_data->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem), start);
	end = min_t(u64, ib_umem_end(umem), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */

	for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem->odp_data->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);
	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem, start, end);

	if (unlikely(!umem->npages && mr->parent &&
		     !umem->odp_data->dying)) {
		WRITE_ONCE(umem->odp_data->dying, 1);
		atomic_inc(&mr->parent->num_leaf_free);
		schedule_work(&umem->odp_data->work);
	}
}

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;

	return;
}

static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	int ret = mlx5_core_page_fault_resume(dev->mdev,
					      pfault->token,
					      wq_num,
					      pfault->type,
					      error);
	if (ret)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
			    wq_num);
}
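
/*
 * Allocate a child MTT MR (ksm == false) or the top-level indirect KSM MR
 * (ksm == true) of an implicit registration from the MR cache, and program
 * its translation table in a zapped state.
 */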
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
					    struct ib_umem *umem,
					    bool ksm, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	int err;

	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
					    MLX5_IMR_MTT_CACHE_ENTRY);

	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;

	mr->dev = dev;
	mr->access_flags = access_flags;
	mr->mmkey.iova = 0;
	mr->umem = umem;

	if (ksm) {
		err = mlx5_ib_update_xlt(mr, 0,
					 mlx5_imr_ksm_entries,
					 MLX5_KSM_PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE);

	} else {
		err = mlx5_ib_update_xlt(mr, 0,
					 MLX5_IMR_MTT_ENTRIES,
					 PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE |
					 MLX5_IB_UPD_XLT_ATOMIC);
	}

	if (err)
		goto fail;

	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	mr->live = 1;

	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
		    mr->mmkey.key, dev->mdev, mr);

	return mr;

fail:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, mr);

	return ERR_PTR(err);
}

static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
						u64 io_virt, size_t bcnt)
{
	struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
	struct ib_umem_odp *odp, *result = NULL;
	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
	int nentries = 0, start_idx = 0, ret;
	struct mlx5_ib_mr *mtt;
	struct ib_umem *umem;

	mutex_lock(&mr->umem->odp_data->umem_mutex);
	odp = odp_lookup(ctx, addr, 1, mr);

	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
		    io_virt, bcnt, addr, odp);

next_mr:
	if (likely(odp)) {
		if (nentries)
			nentries++;
	} else {
		umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
		if (IS_ERR(umem)) {
			mutex_unlock(&mr->umem->odp_data->umem_mutex);
			return ERR_CAST(umem);
		}

		mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
		if (IS_ERR(mtt)) {
			mutex_unlock(&mr->umem->odp_data->umem_mutex);
			ib_umem_release(umem);
			return ERR_CAST(mtt);
		}

		odp = umem->odp_data;
		odp->private = mtt;
		mtt->umem = umem;
		mtt->mmkey.iova = addr;
		mtt->parent = mr;
		INIT_WORK(&odp->work, mr_leaf_free_action);

		if (!nentries)
			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
		nentries++;
	}

	/* Return first odp if region not covered by single one */
	if (likely(!result))
		result = odp;

	addr += MLX5_IMR_MTT_SIZE;
	if (unlikely(addr < io_virt + bcnt)) {
		odp = odp_next(odp);
		if (odp && odp->umem->address != addr)
			odp = NULL;
		goto next_mr;
	}

	if (unlikely(nentries)) {
		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ATOMIC);
		if (ret) {
			mlx5_ib_err(dev, "Failed to update PAS\n");
			result = ERR_PTR(ret);
		}
	}

	mutex_unlock(&mr->umem->odp_data->umem_mutex);
	return result;
}

struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     int access_flags)
{
	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
	struct mlx5_ib_mr *imr;
	struct ib_umem *umem;

	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
	if (IS_ERR(umem))
		return ERR_CAST(umem);

	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
	if (IS_ERR(imr)) {
		ib_umem_release(umem);
		return ERR_CAST(imr);
	}

	imr->umem = umem;
	init_waitqueue_head(&imr->q_leaf_free);
	atomic_set(&imr->num_leaf_free, 0);

	return imr;
}
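
/*
 * Callback for rbt_ib_umem_for_each_in_range(): unmap a child umem of the
 * implicit MR passed as @cookie and queue it for release by
 * mr_leaf_free_action().
 */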
static int mr_leaf_free(struct ib_umem *umem, u64 start,
			u64 end, void *cookie)
{
	struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;

	if (mr->parent != imr)
		return 0;

	ib_umem_odp_unmap_dma_pages(umem,
				    ib_umem_start(umem),
				    ib_umem_end(umem));

	if (umem->odp_data->dying)
		return 0;

	WRITE_ONCE(umem->odp_data->dying, 1);
	atomic_inc(&imr->num_leaf_free);
	schedule_work(&umem->odp_data->work);

	return 0;
}

void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
	struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;

	down_read(&ctx->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
				      mr_leaf_free, imr);
	up_read(&ctx->umem_rwsem);

	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
}

static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			u64 io_virt, size_t bcnt, u32 *bytes_mapped)
{
	u64 access_mask = ODP_READ_ALLOWED_BIT;
	int npages = 0, page_shift, np;
	u64 start_idx, page_mask;
	struct ib_umem_odp *odp;
	int current_seq;
	size_t size;
	int ret;

	if (!mr->umem->odp_data->page_list) {
		odp = implicit_mr_get_data(mr, io_virt, bcnt);

		if (IS_ERR(odp))
			return PTR_ERR(odp);
		mr = odp->private;

	} else {
		odp = mr->umem->odp_data;
	}

next_mr:
	size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);

	page_shift = mr->umem->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;

	if (mr->umem->writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	current_seq = READ_ONCE(odp->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
					access_mask, current_seq);

	if (ret < 0)
		goto out;

	np = ret;

	mutex_lock(&odp->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
		/*
		 * No need to check whether the MTTs really belong to
		 * this MR, since ib_umem_odp_map_dma_pages already
		 * checks this.
		 */
		ret = mlx5_ib_update_xlt(mr, start_idx, np,
					 page_shift, MLX5_IB_UPD_XLT_ATOMIC);
	} else {
		ret = -EAGAIN;
	}
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(dev, "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(io_virt - round_down(io_virt, 1 << page_shift));
		*bytes_mapped += min_t(u32, new_mappings, size);
	}

	npages += np << (page_shift - PAGE_SHIFT);
	bcnt -= size;

	if (unlikely(bcnt)) {
		struct ib_umem_odp *next;

		io_virt += size;
		next = odp_next(odp);
		if (unlikely(!next || next->umem->address != io_virt)) {
			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
				    io_virt, next);
			return -EAGAIN;
		}
		odp = next;
		mr = odp->private;
		goto next_mr;
	}

	return npages;

out:
	if (ret == -EAGAIN) {
		if (mr->parent || !odp->dying) {
			unsigned long timeout =
				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

			if (!wait_for_completion_timeout(
					&odp->notifier_completion,
					timeout)) {
				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
					     current_seq, odp->notifiers_seq);
			}
		} else {
			/* The MR is being killed, kill the QP as well. */
			ret = -EFAULT;
		}
	}

	return ret;
}
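
/*
 * Saved state for one data segment that sits behind an indirect memory key
 * (memory window). Frames are kept on a simple linked-list stack so the
 * indirection tree is walked iteratively rather than recursively.
 */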
struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 u32 key, u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mw *mw;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;

	srcu_key = srcu_read_lock(&dev->mr_srcu);

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

next_mr:
	mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
	if (!mmkey || mmkey->key != key) {
		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		if (!mr->live || !mr->ibmr.pd) {
			mlx5_ib_dbg(dev, "got dead MR\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
		if (ret < 0)
			goto srcu_unlock;

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);

		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (mw->ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
		if (ret)
			goto srcu_unlock;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto srcu_unlock;
	}
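
	/* Pop the next deferred indirect entry, if any, and fault it in. */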
	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		goto next_mr;
	}

srcu_unlock:
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	srcu_read_unlock(&dev->mr_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *		 map. This allows the caller to decide intelligently whether
 *		 enough memory was mapped to resolve the page fault
 *		 successfully (e.g. enough for the next MTU, or the entire
 *		 WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *		    the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   struct mlx5_ib_qp *qp, void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, int receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	/* Skip SRQ next-WQE segment. */
	if (receive_queue && qp->ibqp.srq)
		wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}
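
/* ODP capability that a QP's transport must support for each WQE opcode. */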
static const u32 mlx5_ib_odp_opcode_cap[] = {
	[MLX5_OPCODE_SEND]		= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_IMM]		= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_INVAL]	= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_RDMA_WRITE]	= IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_WRITE_IMM]	= IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_READ]		= IB_ODP_SUPPORT_READ,
	[MLX5_OPCODE_ATOMIC_CS]		= IB_ODP_SUPPORT_ATOMIC,
	[MLX5_OPCODE_ATOMIC_FA]		= IB_ODP_SUPPORT_ATOMIC,
};

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	u32 transport_caps;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			  MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			 MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qpn,
			    ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		   MLX5_WQE_CTRL_QPN_SHIFT;
	if (qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qpn,
			    ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
		break;
	case IB_QPT_UD:
		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
		break;
	default:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) /
		     sizeof(mlx5_ib_odp_opcode_cap[0]) ||
		     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
			    opcode);
		return -EFAULT;
	}

	if (qp->ibqp.qp_type != IB_QPT_RC) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->ibqp.srq) {
		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
		return -EFAULT;
	}

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;

	return 0;
}

static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
					      u32 wq_num)
{
	struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);

	if (!mqp) {
		mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
		return NULL;
	}

	return to_mibqp(mqp);
}
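
/*
 * Resolve a page fault reported against a WQE: read the faulting WQE from the
 * user's buffer, parse its data segments, and fault in the pages they
 * reference before resuming the QP (with or without an error).
 */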
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	int ret;
	void *wqe, *wqe_end;
	u32 bytes_mapped, total_wqe_bytes;
	char *buffer = NULL;
	int resume_with_error = 1;
	u16 wqe_index = pfault->wqe.wqe_index;
	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
	struct mlx5_ib_qp *qp;

	buffer = (char *)__get_free_page(GFP_KERNEL);
	if (!buffer) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
	if (!qp)
		goto resolve_page_fault;

	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
				    PAGE_SIZE, &qp->trans_qp.base);
	if (ret < 0) {
		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
			    ret, wqe_index, pfault->token);
		goto resolve_page_fault;
	}

	wqe = buffer;
	if (requestor)
		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	else
		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	if (ret < 0)
		goto resolve_page_fault;

	if (wqe >= wqe_end) {
		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
		goto resolve_page_fault;
	}

	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
				      &bytes_mapped, &total_wqe_bytes,
				      !requestor);
	if (ret == -EAGAIN) {
		resume_with_error = 0;
		goto resolve_page_fault;
	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
		goto resolve_page_fault;
	}

	resume_with_error = 0;
resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	free_page((unsigned long)buffer);
}

static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length  = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}

void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
		    struct mlx5_pagefault *pfault)
{
	struct mlx5_ib_dev *dev = context;
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}

void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
		return;

	switch (ent->order - 2) {
	case MLX5_IMR_MTT_CACHE_ENTRY:
		ent->page = PAGE_SHIFT;
		ent->xlt = MLX5_IMR_MTT_ENTRIES *
			   sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		ent->limit = 0;
		break;

	case MLX5_IMR_KSM_CACHE_ENTRY:
		ent->page = MLX5_KSM_PAGE_SHIFT;
		ent->xlt = mlx5_imr_ksm_entries *
			   sizeof(struct mlx5_klm) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
		ent->limit = 0;
		break;
	}
}

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	int ret;

	ret = init_srcu_struct(&dev->mr_srcu);
	if (ret)
		return ret;

	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
			return ret;
		}
	}

	return 0;
}

void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
{
	cleanup_srcu_struct(&dev->mr_srcu);
}

int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}