/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "mlx5_ib.h"
#include "cmd.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;

static int check_parent(struct ib_umem_odp *odp,
                        struct mlx5_ib_mr *parent)
{
        struct mlx5_ib_mr *mr = odp->private;

        return mr && mr->parent == parent;
}

static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
{
        struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
        struct ib_ucontext *ctx = odp->umem->context;
        struct rb_node *rb;

        down_read(&ctx->umem_rwsem);
        while (1) {
                rb = rb_next(&odp->interval_tree.rb);
                if (!rb)
                        goto not_found;
                odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
                if (check_parent(odp, parent))
                        goto end;
        }
not_found:
        odp = NULL;
end:
        up_read(&ctx->umem_rwsem);
        return odp;
}

static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
                                      u64 start, u64 length,
                                      struct mlx5_ib_mr *parent)
{
        struct ib_umem_odp *odp;
        struct rb_node *rb;

        down_read(&ctx->umem_rwsem);
        odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
        if (!odp)
                goto end;

        while (1) {
                if (check_parent(odp, parent))
                        goto end;
                rb = rb_next(&odp->interval_tree.rb);
                if (!rb)
                        goto not_found;
                odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
                if (ib_umem_start(odp->umem) > start + length)
                        goto not_found;
        }
not_found:
        odp = NULL;
end:
        up_read(&ctx->umem_rwsem);
        return odp;
}

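/*
 * Fill @nentries KLM entries of an implicit MR's indirect mkey, starting at
 * entry @offset (each entry covers MLX5_IMR_MTT_SIZE bytes of virtual address
 * space). Entries backed by an existing child MTT MR get that MR's lkey;
 * unpopulated entries (and all entries when zapping) point at the device's
 * null mkey, so that touching them triggers a new page fault.
 */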
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
                           size_t nentries, struct mlx5_ib_mr *mr, int flags)
{
        struct ib_pd *pd = mr->ibmr.pd;
        struct ib_ucontext *ctx = pd->uobject->context;
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct ib_umem_odp *odp;
        unsigned long va;
        int i;

        if (flags & MLX5_IB_UPD_XLT_ZAP) {
                for (i = 0; i < nentries; i++, pklm++) {
                        pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
                        pklm->key = cpu_to_be32(dev->null_mkey);
                        pklm->va = 0;
                }
                return;
        }

        odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
                         nentries * MLX5_IMR_MTT_SIZE, mr);

        for (i = 0; i < nentries; i++, pklm++) {
                pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
                va = (offset + i) * MLX5_IMR_MTT_SIZE;
                if (odp && odp->umem->address == va) {
                        struct mlx5_ib_mr *mtt = odp->private;

                        pklm->key = cpu_to_be32(mtt->ibmr.lkey);
                        odp = odp_next(odp);
                } else {
                        pklm->key = cpu_to_be32(dev->null_mkey);
                }
                mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
                            i, va, be32_to_cpu(pklm->key));
        }
}

static void mr_leaf_free_action(struct work_struct *work)
{
        struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
        int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
        struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;

        mr->parent = NULL;
        synchronize_srcu(&mr->dev->mr_srcu);

        if (!READ_ONCE(odp->dying)) {
                mr->parent = imr;
                if (atomic_dec_and_test(&imr->num_leaf_free))
                        wake_up(&imr->q_leaf_free);
                return;
        }

        ib_umem_release(odp->umem);
        if (imr->live)
                mlx5_ib_update_xlt(imr, idx, 1, 0,
                                   MLX5_IB_UPD_XLT_INDIRECT |
                                   MLX5_IB_UPD_XLT_ATOMIC);
        mlx5_mr_cache_free(mr->dev, mr);

        if (atomic_dec_and_test(&imr->num_leaf_free))
                wake_up(&imr->q_leaf_free);
}

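/*
 * MMU notifier callback for ODP umems: zap the hardware MTTs covering
 * [start, end) and unmap the DMA pages. If this leaves an implicit MR's leaf
 * umem empty, mark the leaf as dying and schedule mr_leaf_free_action() to
 * release it.
 */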
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
                              unsigned long end)
{
        struct mlx5_ib_mr *mr;
        const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
                                    sizeof(struct mlx5_mtt)) - 1;
        u64 idx = 0, blk_start_idx = 0;
        int in_block = 0;
        u64 addr;

        if (!umem || !umem->odp_data) {
                pr_err("invalidation called on NULL umem or non-ODP umem\n");
                return;
        }

        mr = umem->odp_data->private;

        if (!mr || !mr->ibmr.pd)
                return;

        start = max_t(u64, ib_umem_start(umem), start);
        end = min_t(u64, ib_umem_end(umem), end);

        /*
         * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
         * while we are doing the invalidation, no page fault will attempt to
         * overwrite the same MTTs. Concurrent invalidations might race us,
         * but they will write 0s as well, so no difference in the end result.
         */

        for (addr = start; addr < end; addr += (u64)umem->page_size) {
                idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
                /*
                 * Strive to write the MTTs in chunks, but avoid overwriting
                 * non-existing MTTs. The heuristic here can be improved to
                 * estimate the cost of another UMR vs. the cost of a bigger
                 * UMR.
                 */
                if (umem->odp_data->dma_list[idx] &
                    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
                        if (!in_block) {
                                blk_start_idx = idx;
                                in_block = 1;
                        }
                } else {
                        u64 umr_offset = idx & umr_block_mask;

                        if (in_block && umr_offset == 0) {
                                mlx5_ib_update_xlt(mr, blk_start_idx,
                                                   idx - blk_start_idx,
                                                   PAGE_SHIFT,
                                                   MLX5_IB_UPD_XLT_ZAP |
                                                   MLX5_IB_UPD_XLT_ATOMIC);
                                in_block = 0;
                        }
                }
        }
        if (in_block)
                mlx5_ib_update_xlt(mr, blk_start_idx,
                                   idx - blk_start_idx + 1,
                                   PAGE_SHIFT,
                                   MLX5_IB_UPD_XLT_ZAP |
                                   MLX5_IB_UPD_XLT_ATOMIC);
        /*
         * We are now sure that the device will not access the
         * memory. We can safely unmap it, and mark it as dirty if
         * needed.
         */

        ib_umem_odp_unmap_dma_pages(umem, start, end);

        if (unlikely(!umem->npages && mr->parent &&
                     !umem->odp_data->dying)) {
                WRITE_ONCE(umem->odp_data->dying, 1);
                atomic_inc(&mr->parent->num_leaf_free);
                schedule_work(&umem->odp_data->work);
        }
}

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
        struct ib_odp_caps *caps = &dev->odp_caps;

        memset(caps, 0, sizeof(*caps));

        if (!MLX5_CAP_GEN(dev->mdev, pg))
                return;

        caps->general_caps = IB_ODP_SUPPORT;

        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
                dev->odp_max_size = U64_MAX;
        else
                dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

        if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
                caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

        if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
            MLX5_CAP_GEN(dev->mdev, null_mkey) &&
            MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
                caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;

        return;
}

static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
                                                   u32 key)
{
        u32 base_key = mlx5_base_mkey(key);
        struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key);
        struct mlx5_ib_mr *mr;

        if (!mmkey || mmkey->key != key || mmkey->type != MLX5_MKEY_MR)
                return NULL;

        mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

        if (!mr->live)
                return NULL;

        return container_of(mmkey, struct mlx5_ib_mr, mmkey);
}

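/*
 * Tell the device to resume the faulting QP/WQ, with @error indicating that
 * the page fault could not be resolved.
 */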
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
                                      struct mlx5_pagefault *pfault,
                                      int error)
{
        int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
                     pfault->wqe.wq_num : pfault->token;
        int ret = mlx5_core_page_fault_resume(dev->mdev,
                                              pfault->token,
                                              wq_num,
                                              pfault->type,
                                              error);
        if (ret)
                mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
                            wq_num);
}

static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
                                            struct ib_umem *umem,
                                            bool ksm, int access_flags)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_mr *mr;
        int err;

        mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
                                            MLX5_IMR_MTT_CACHE_ENTRY);

        if (IS_ERR(mr))
                return mr;

        mr->ibmr.pd = pd;

        mr->dev = dev;
        mr->access_flags = access_flags;
        mr->mmkey.iova = 0;
        mr->umem = umem;

        if (ksm) {
                err = mlx5_ib_update_xlt(mr, 0,
                                         mlx5_imr_ksm_entries,
                                         MLX5_KSM_PAGE_SHIFT,
                                         MLX5_IB_UPD_XLT_INDIRECT |
                                         MLX5_IB_UPD_XLT_ZAP |
                                         MLX5_IB_UPD_XLT_ENABLE);

        } else {
                err = mlx5_ib_update_xlt(mr, 0,
                                         MLX5_IMR_MTT_ENTRIES,
                                         PAGE_SHIFT,
                                         MLX5_IB_UPD_XLT_ZAP |
                                         MLX5_IB_UPD_XLT_ENABLE |
                                         MLX5_IB_UPD_XLT_ATOMIC);
        }

        if (err)
                goto fail;

        mr->ibmr.lkey = mr->mmkey.key;
        mr->ibmr.rkey = mr->mmkey.key;

        mr->live = 1;

        mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
                    mr->mmkey.key, dev->mdev, mr);

        return mr;

fail:
        mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
        mlx5_mr_cache_free(dev, mr);

        return ERR_PTR(err);
}

static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
                                                u64 io_virt, size_t bcnt)
{
        struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
        struct ib_umem_odp *odp, *result = NULL;
        u64 addr = io_virt & MLX5_IMR_MTT_MASK;
        int nentries = 0, start_idx = 0, ret;
        struct mlx5_ib_mr *mtt;
        struct ib_umem *umem;

        mutex_lock(&mr->umem->odp_data->umem_mutex);
        odp = odp_lookup(ctx, addr, 1, mr);

        mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
                    io_virt, bcnt, addr, odp);

next_mr:
        if (likely(odp)) {
                if (nentries)
                        nentries++;
        } else {
                umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
                if (IS_ERR(umem)) {
                        mutex_unlock(&mr->umem->odp_data->umem_mutex);
                        return ERR_CAST(umem);
                }

                mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
                if (IS_ERR(mtt)) {
                        mutex_unlock(&mr->umem->odp_data->umem_mutex);
                        ib_umem_release(umem);
                        return ERR_CAST(mtt);
                }

                odp = umem->odp_data;
                odp->private = mtt;
                mtt->umem = umem;
                mtt->mmkey.iova = addr;
                mtt->parent = mr;
                INIT_WORK(&odp->work, mr_leaf_free_action);

                if (!nentries)
                        start_idx = addr >> MLX5_IMR_MTT_SHIFT;
                nentries++;
        }

        odp->dying = 0;

        /* Return first odp if region not covered by single one */
        if (likely(!result))
                result = odp;

        addr += MLX5_IMR_MTT_SIZE;
        if (unlikely(addr < io_virt + bcnt)) {
                odp = odp_next(odp);
                if (odp && odp->umem->address != addr)
                        odp = NULL;
                goto next_mr;
        }

        if (unlikely(nentries)) {
                ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
                                         MLX5_IB_UPD_XLT_INDIRECT |
                                         MLX5_IB_UPD_XLT_ATOMIC);
                if (ret) {
                        mlx5_ib_err(dev, "Failed to update PAS\n");
                        result = ERR_PTR(ret);
                }
        }

        mutex_unlock(&mr->umem->odp_data->umem_mutex);
        return result;
}

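/*
 * Create the parent MR for implicit ODP: an indirect KSM mkey that spans the
 * supported address range and is populated lazily with MLX5_IMR_MTT_SIZE
 * leaf MRs as page faults arrive.
 */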
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
                                             int access_flags)
{
        struct ib_ucontext *ctx = pd->ibpd.uobject->context;
        struct mlx5_ib_mr *imr;
        struct ib_umem *umem;

        umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
        if (IS_ERR(umem))
                return ERR_CAST(umem);

        imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
        if (IS_ERR(imr)) {
                ib_umem_release(umem);
                return ERR_CAST(imr);
        }

        imr->umem = umem;
        init_waitqueue_head(&imr->q_leaf_free);
        atomic_set(&imr->num_leaf_free, 0);

        return imr;
}

static int mr_leaf_free(struct ib_umem *umem, u64 start,
                        u64 end, void *cookie)
{
        struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;

        if (mr->parent != imr)
                return 0;

        ib_umem_odp_unmap_dma_pages(umem,
                                    ib_umem_start(umem),
                                    ib_umem_end(umem));

        if (umem->odp_data->dying)
                return 0;

        WRITE_ONCE(umem->odp_data->dying, 1);
        atomic_inc(&imr->num_leaf_free);
        schedule_work(&umem->odp_data->work);

        return 0;
}

void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
        struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;

        down_read(&ctx->umem_rwsem);
        rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
                                      mr_leaf_free, imr);
        up_read(&ctx->umem_rwsem);

        wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
}

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
                                         u32 key, u64 io_virt, size_t bcnt,
                                         u32 *bytes_committed,
                                         u32 *bytes_mapped)
{
        int srcu_key;
        unsigned int current_seq = 0;
        u64 start_idx;
        int npages = 0, ret = 0;
        struct mlx5_ib_mr *mr;
        u64 access_mask = ODP_READ_ALLOWED_BIT;
        struct ib_umem_odp *odp;
        int implicit = 0;
        size_t size;

        srcu_key = srcu_read_lock(&dev->mr_srcu);
        mr = mlx5_ib_odp_find_mr_lkey(dev, key);
        /*
         * If we didn't find the MR, it means the MR was closed while we were
         * handling the ODP event. In this case we return -EFAULT so that the
         * QP will be closed.
         */
        if (!mr || !mr->ibmr.pd) {
                mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
                            key);
                ret = -EFAULT;
                goto srcu_unlock;
        }
        if (!mr->umem->odp_data) {
                mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
                            key);
                if (bytes_mapped)
                        *bytes_mapped +=
                                (bcnt - *bytes_committed);
                goto srcu_unlock;
        }

        /*
         * Avoid branches - this code will perform correctly
         * in all iterations (in iteration 2 and above,
         * bytes_committed == 0).
         */
        io_virt += *bytes_committed;
        bcnt -= *bytes_committed;

        if (!mr->umem->odp_data->page_list) {
                odp = implicit_mr_get_data(mr, io_virt, bcnt);

                if (IS_ERR(odp)) {
                        ret = PTR_ERR(odp);
                        goto srcu_unlock;
                }
                mr = odp->private;
                implicit = 1;

        } else {
                odp = mr->umem->odp_data;
        }

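        /*
         * Map the pages backing the current umem (for implicit MRs, the
         * current leaf); we jump back here when the faulted range spans
         * several leaves.
         */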
next_mr:
        current_seq = READ_ONCE(odp->notifiers_seq);
        /*
         * Ensure the sequence number is valid for some time before we call
         * gup.
         */
        smp_rmb();

        size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
        start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;

        if (mr->umem->writable)
                access_mask |= ODP_WRITE_ALLOWED_BIT;

        ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
                                        access_mask, current_seq);

        if (ret < 0)
                goto srcu_unlock;

        if (ret > 0) {
                int np = ret;

                mutex_lock(&odp->umem_mutex);
                if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
                        /*
                         * No need to check whether the MTTs really belong to
                         * this MR, since ib_umem_odp_map_dma_pages already
                         * checks this.
                         */
                        ret = mlx5_ib_update_xlt(mr, start_idx, np,
                                                 PAGE_SHIFT,
                                                 MLX5_IB_UPD_XLT_ATOMIC);
                } else {
                        ret = -EAGAIN;
                }
                mutex_unlock(&odp->umem_mutex);
                if (ret < 0) {
                        if (ret != -EAGAIN)
                                mlx5_ib_err(dev, "Failed to update mkey page tables\n");
                        goto srcu_unlock;
                }

                if (bytes_mapped) {
                        u32 new_mappings = np * PAGE_SIZE -
                                (io_virt - round_down(io_virt, PAGE_SIZE));
                        *bytes_mapped += min_t(u32, new_mappings, size);
                }

                npages += np;
        }

        bcnt -= size;
        if (unlikely(bcnt)) {
                struct ib_umem_odp *next;

                io_virt += size;
                next = odp_next(odp);
                if (unlikely(!next || next->umem->address != io_virt)) {
                        mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
                                    io_virt, next);
                        ret = -EAGAIN;
                        goto srcu_unlock_no_wait;
                }
                odp = next;
                mr = odp->private;
                goto next_mr;
        }

srcu_unlock:
        if (ret == -EAGAIN) {
                if (implicit || !odp->dying) {
                        unsigned long timeout =
                                msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

                        if (!wait_for_completion_timeout(
                                        &odp->notifier_completion,
                                        timeout)) {
                                mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
                                             current_seq, odp->notifiers_seq);
                        }
                } else {
                        /* The MR is being killed, kill the QP as well. */
                        ret = -EFAULT;
                }
        }

srcu_unlock_no_wait:
        srcu_read_unlock(&dev->mr_srcu, srcu_key);
        *bytes_committed = 0;
        return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
                                   struct mlx5_pagefault *pfault,
                                   struct mlx5_ib_qp *qp, void *wqe,
                                   void *wqe_end, u32 *bytes_mapped,
                                   u32 *total_wqe_bytes, int receive_queue)
{
        int ret = 0, npages = 0;
        u64 io_virt;
        u32 key;
        u32 byte_count;
        size_t bcnt;
        int inline_segment;

        /* Skip SRQ next-WQE segment. */
        if (receive_queue && qp->ibqp.srq)
                wqe += sizeof(struct mlx5_wqe_srq_next_seg);

        if (bytes_mapped)
                *bytes_mapped = 0;
        if (total_wqe_bytes)
                *total_wqe_bytes = 0;

        while (wqe < wqe_end) {
                struct mlx5_wqe_data_seg *dseg = wqe;

                io_virt = be64_to_cpu(dseg->addr);
                key = be32_to_cpu(dseg->lkey);
                byte_count = be32_to_cpu(dseg->byte_count);
                inline_segment = !!(byte_count & MLX5_INLINE_SEG);
                bcnt = byte_count & ~MLX5_INLINE_SEG;

                if (inline_segment) {
                        bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
                        wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
                                     16);
                } else {
                        wqe += sizeof(*dseg);
                }

                /* receive WQE end of sg list. */
                if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
                    io_virt == 0)
                        break;

                if (!inline_segment && total_wqe_bytes) {
                        *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
                                        pfault->bytes_committed);
                }

                /* A zero length data segment designates a length of 2GB. */
                if (bcnt == 0)
                        bcnt = 1U << 31;

                if (inline_segment || bcnt <= pfault->bytes_committed) {
                        pfault->bytes_committed -=
                                min_t(size_t, bcnt,
                                      pfault->bytes_committed);
                        continue;
                }

                ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
                                                    &pfault->bytes_committed,
                                                    bytes_mapped);
                if (ret < 0)
                        break;
                npages += ret;
        }

        return ret < 0 ? ret : npages;
}

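/*
 * ODP capability bit required to handle a page fault on each initiator
 * opcode.
 */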
static const u32 mlx5_ib_odp_opcode_cap[] = {
        [MLX5_OPCODE_SEND]             = IB_ODP_SUPPORT_SEND,
        [MLX5_OPCODE_SEND_IMM]         = IB_ODP_SUPPORT_SEND,
        [MLX5_OPCODE_SEND_INVAL]       = IB_ODP_SUPPORT_SEND,
        [MLX5_OPCODE_RDMA_WRITE]       = IB_ODP_SUPPORT_WRITE,
        [MLX5_OPCODE_RDMA_WRITE_IMM]   = IB_ODP_SUPPORT_WRITE,
        [MLX5_OPCODE_RDMA_READ]        = IB_ODP_SUPPORT_READ,
        [MLX5_OPCODE_ATOMIC_CS]        = IB_ODP_SUPPORT_ATOMIC,
        [MLX5_OPCODE_ATOMIC_FA]        = IB_ODP_SUPPORT_ATOMIC,
};

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
        struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
        struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
        struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
        u16 wqe_index = pfault->wqe.wqe_index;
        u32 transport_caps;
        struct mlx5_base_av *av;
        unsigned ds, opcode;
#if defined(DEBUG)
        u32 ctrl_wqe_index, ctrl_qpn;
#endif
        u32 qpn = qp->trans_qp.base.mqp.qpn;

        ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
        if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
                mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
                            ds, wqe_length);
                return -EFAULT;
        }

        if (ds == 0) {
                mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
                            wqe_index, qpn);
                return -EFAULT;
        }

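        /*
         * In debug builds, cross-check that the WQE we read back matches the
         * wqe_index and QP number reported in the page fault.
         */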
#if defined(DEBUG)
        ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
                          MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
                          MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
        if (wqe_index != ctrl_wqe_index) {
                mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
                            wqe_index, qpn,
                            ctrl_wqe_index);
                return -EFAULT;
        }

        ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
                   MLX5_WQE_CTRL_QPN_SHIFT;
        if (qpn != ctrl_qpn) {
                mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
                            wqe_index, qpn,
                            ctrl_qpn);
                return -EFAULT;
        }
#endif /* DEBUG */

        *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
        *wqe += sizeof(*ctrl);

        opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
                 MLX5_WQE_CTRL_OPCODE_MASK;

        switch (qp->ibqp.qp_type) {
        case IB_QPT_RC:
                transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
                break;
        case IB_QPT_UD:
                transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
                break;
        default:
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
                            qp->ibqp.qp_type);
                return -EFAULT;
        }

        if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) /
                     sizeof(mlx5_ib_odp_opcode_cap[0]) ||
                     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
                            opcode);
                return -EFAULT;
        }

        if (qp->ibqp.qp_type != IB_QPT_RC) {
                av = *wqe;
                if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT))
                        *wqe += sizeof(struct mlx5_av);
                else
                        *wqe += sizeof(struct mlx5_base_av);
        }

        switch (opcode) {
        case MLX5_OPCODE_RDMA_WRITE:
        case MLX5_OPCODE_RDMA_WRITE_IMM:
        case MLX5_OPCODE_RDMA_READ:
                *wqe += sizeof(struct mlx5_wqe_raddr_seg);
                break;
        case MLX5_OPCODE_ATOMIC_CS:
        case MLX5_OPCODE_ATOMIC_FA:
                *wqe += sizeof(struct mlx5_wqe_raddr_seg);
                *wqe += sizeof(struct mlx5_wqe_atomic_seg);
                break;
        }

        return 0;
}

/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
        struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
        struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
        struct mlx5_ib_wq *wq = &qp->rq;
        int wqe_size = 1 << wq->wqe_shift;

        if (qp->ibqp.srq) {
                mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
                return -EFAULT;
        }

        if (qp->wq_sig) {
                mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
                return -EFAULT;
        }

        if (wqe_size > wqe_length) {
                mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
                return -EFAULT;
        }

        switch (qp->ibqp.qp_type) {
        case IB_QPT_RC:
                if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
                      IB_ODP_SUPPORT_RECV))
                        goto invalid_transport_or_opcode;
                break;
        default:
invalid_transport_or_opcode:
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
                            qp->ibqp.qp_type);
                return -EFAULT;
        }

        *wqe_end = *wqe + wqe_size;

        return 0;
}

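/* Translate the work queue number reported in a page fault to its QP. */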
static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
                                              u32 wq_num)
{
        struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);

        if (!mqp) {
                mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
                return NULL;
        }

        return to_mibqp(mqp);
}

static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
                                          struct mlx5_pagefault *pfault)
{
        int ret;
        void *wqe, *wqe_end;
        u32 bytes_mapped, total_wqe_bytes;
        char *buffer = NULL;
        int resume_with_error = 1;
        u16 wqe_index = pfault->wqe.wqe_index;
        int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
        struct mlx5_ib_qp *qp;

        buffer = (char *)__get_free_page(GFP_KERNEL);
        if (!buffer) {
                mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
                goto resolve_page_fault;
        }

        qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
        if (!qp)
                goto resolve_page_fault;

        ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
                                    PAGE_SIZE, &qp->trans_qp.base);
        if (ret < 0) {
                mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
                            ret, wqe_index, pfault->token);
                goto resolve_page_fault;
        }

        wqe = buffer;
        if (requestor)
                ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
                                                          &wqe_end, ret);
        else
                ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
                                                          &wqe_end, ret);
        if (ret < 0)
                goto resolve_page_fault;

        if (wqe >= wqe_end) {
                mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
                goto resolve_page_fault;
        }

        ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
                                      &bytes_mapped, &total_wqe_bytes,
                                      !requestor);
        if (ret == -EAGAIN) {
                resume_with_error = 0;
                goto resolve_page_fault;
        } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
                if (ret != -ENOENT)
                        mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n",
                                    ret, pfault->wqe.wq_num, pfault->type);
                goto resolve_page_fault;
        }

        resume_with_error = 0;
resolve_page_fault:
        mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
                    pfault->wqe.wq_num, resume_with_error,
                    pfault->type);
        free_page((unsigned long)buffer);
}

static int pages_in_range(u64 address, u32 length)
{
        return (ALIGN(address + length, PAGE_SIZE) -
                (address & PAGE_MASK)) >> PAGE_SHIFT;
}

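/*
 * Handle a page fault of the RDMA subtype: map the pages of the reported
 * RDMA range and, when the hardware could not report the full length,
 * prefetch up to MAX_PREFETCH_LEN bytes after resuming the QP.
 */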
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
                                           struct mlx5_pagefault *pfault)
{
        u64 address;
        u32 length;
        u32 prefetch_len = pfault->bytes_committed;
        int prefetch_activated = 0;
        u32 rkey = pfault->rdma.r_key;
        int ret;

        /* The RDMA responder handler handles the page fault in two parts.
         * First it brings the necessary pages for the current packet
         * (and uses the pfault context), and then (after resuming the QP)
         * prefetches more pages. The second operation cannot use the pfault
         * context and therefore uses the dummy_pfault context allocated on
         * the stack */
        pfault->rdma.rdma_va += pfault->bytes_committed;
        pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
                                        pfault->rdma.rdma_op_len);
        pfault->bytes_committed = 0;

        address = pfault->rdma.rdma_va;
        length = pfault->rdma.rdma_op_len;

        /* For some operations, the hardware cannot tell the exact message
         * length, and in those cases it reports zero. Use prefetch
         * logic. */
        if (length == 0) {
                prefetch_activated = 1;
                length = pfault->rdma.packet_size;
                prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
        }

        ret = pagefault_single_data_segment(dev, rkey, address, length,
                                            &pfault->bytes_committed, NULL);
        if (ret == -EAGAIN) {
                /* We're racing with an invalidation, don't prefetch */
                prefetch_activated = 0;
        } else if (ret < 0 || pages_in_range(address, length) > ret) {
                mlx5_ib_page_fault_resume(dev, pfault, 1);
                if (ret != -ENOENT)
                        mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
                                     ret, pfault->token, pfault->type);
                return;
        }

        mlx5_ib_page_fault_resume(dev, pfault, 0);
        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
                    pfault->token, pfault->type,
                    prefetch_activated);

        /* At this point, there might be a new pagefault already arriving in
         * the eq, switch to the dummy pagefault for the rest of the
         * processing. We're still OK with the objects being alive as the
         * work-queue is being fenced. */

        if (prefetch_activated) {
                u32 bytes_committed = 0;

                ret = pagefault_single_data_segment(dev, rkey, address,
                                                    prefetch_len,
                                                    &bytes_committed, NULL);
                if (ret < 0 && ret != -EAGAIN) {
                        mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
                                     ret, pfault->token, address, prefetch_len);
                }
        }
}

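/*
 * Entry point for page fault events delivered by mlx5_core: dispatch to the
 * WQE or RDMA handler according to the event subtype.
 */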
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
                    struct mlx5_pagefault *pfault)
{
        struct mlx5_ib_dev *dev = context;
        u8 event_subtype = pfault->event_subtype;

        switch (event_subtype) {
        case MLX5_PFAULT_SUBTYPE_WQE:
                mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
                break;
        case MLX5_PFAULT_SUBTYPE_RDMA:
                mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
                break;
        default:
                mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
                            event_subtype);
                mlx5_ib_page_fault_resume(dev, pfault, 1);
        }
}

void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
        if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
                return;

        switch (ent->order - 2) {
        case MLX5_IMR_MTT_CACHE_ENTRY:
                ent->page = PAGE_SHIFT;
                ent->xlt = MLX5_IMR_MTT_ENTRIES *
                           sizeof(struct mlx5_mtt) /
                           MLX5_IB_UMR_OCTOWORD;
                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
                ent->limit = 0;
                break;

        case MLX5_IMR_KSM_CACHE_ENTRY:
                ent->page = MLX5_KSM_PAGE_SHIFT;
                ent->xlt = mlx5_imr_ksm_entries *
                           sizeof(struct mlx5_klm) /
                           MLX5_IB_UMR_OCTOWORD;
                ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
                ent->limit = 0;
                break;
        }
}

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
        int ret;

        ret = init_srcu_struct(&dev->mr_srcu);
        if (ret)
                return ret;

        if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
                ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
                if (ret) {
                        mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
                        return ret;
                }
        }

        return 0;
}

void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
{
        cleanup_srcu_struct(&dev->mr_srcu);
}

int mlx5_ib_odp_init(void)
{
        mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
                                       MLX5_IMR_MTT_BITS);

        return 0;
}