odp.c (0cce284537fb42d9c28b9b31038ffc9b464555f5, old version) | odp.c (81713d3788d2e6bc005f15ee1c59d0eb06050a6b, new version) |
---|---|
1/* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: --- 20 unchanged lines hidden (view full) --- 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33#include <rdma/ib_umem.h> 34#include <rdma/ib_umem_odp.h> 35 36#include "mlx5_ib.h" | 1/* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: --- 20 unchanged lines hidden (view full) --- 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33#include <rdma/ib_umem.h> 34#include <rdma/ib_umem_odp.h> 35 36#include "mlx5_ib.h" |
37#include "cmd.h" |
|
37 38#define MAX_PREFETCH_LEN (4*1024*1024U) 39 40/* Timeout in ms to wait for an active mmu notifier to complete when handling 41 * a pagefault. */ 42#define MMU_NOTIFIER_TIMEOUT 1000 43 | 38 39#define MAX_PREFETCH_LEN (4*1024*1024U) 40 41/* Timeout in ms to wait for an active mmu notifier to complete when handling 42 * a pagefault. */ 43#define MMU_NOTIFIER_TIMEOUT 1000 44 |
44struct workqueue_struct *mlx5_ib_page_fault_wq; | 45#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) 46#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) 47#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) 48#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) 49#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) |
45 | 50 |
51#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT 52 53static u64 mlx5_imr_ksm_entries; 54 55static int check_parent(struct ib_umem_odp *odp, 56 struct mlx5_ib_mr *parent) 57{ 58 struct mlx5_ib_mr *mr = odp->private; 59 60 return mr && mr->parent == parent; 61} 62 63static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) 64{ 65 struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; 66 struct ib_ucontext *ctx = odp->umem->context; 67 struct rb_node *rb; 68 69 down_read(&ctx->umem_rwsem); 70 while (1) { 71 rb = rb_next(&odp->interval_tree.rb); 72 if (!rb) 73 goto not_found; 74 odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); 75 if (check_parent(odp, parent)) 76 goto end; 77 } 78not_found: 79 odp = NULL; 80end: 81 up_read(&ctx->umem_rwsem); 82 return odp; 83} 84 85static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, 86 u64 start, u64 length, 87 struct mlx5_ib_mr *parent) 88{ 89 struct ib_umem_odp *odp; 90 struct rb_node *rb; 91 92 down_read(&ctx->umem_rwsem); 93 odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); 94 if (!odp) 95 goto end; 96 97 while (1) { 98 if (check_parent(odp, parent)) 99 goto end; 100 rb = rb_next(&odp->interval_tree.rb); 101 if (!rb) 102 goto not_found; 103 odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); 104 if (ib_umem_start(odp->umem) > start + length) 105 goto not_found; 106 } 107not_found: 108 odp = NULL; 109end: 110 up_read(&ctx->umem_rwsem); 111 return odp; 112} 113 114void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, 115 size_t nentries, struct mlx5_ib_mr *mr, int flags) 116{ 117 struct ib_pd *pd = mr->ibmr.pd; 118 struct ib_ucontext *ctx = pd->uobject->context; 119 struct mlx5_ib_dev *dev = to_mdev(pd->device); 120 struct ib_umem_odp *odp; 121 unsigned long va; 122 int i; 123 124 if (flags & MLX5_IB_UPD_XLT_ZAP) { 125 for (i = 0; i < nentries; i++, pklm++) { 126 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); 127 pklm->key = cpu_to_be32(dev->null_mkey); 128 pklm->va = 0; 129 } 130 return; 131 } 132 133 odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, 134 nentries * MLX5_IMR_MTT_SIZE, mr); 135 136 for (i = 0; i < nentries; i++, pklm++) { 137 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); 138 va = (offset + i) * MLX5_IMR_MTT_SIZE; 139 if (odp && odp->umem->address == va) { 140 struct mlx5_ib_mr *mtt = odp->private; 141 142 pklm->key = cpu_to_be32(mtt->ibmr.lkey); 143 odp = odp_next(odp); 144 } else { 145 pklm->key = cpu_to_be32(dev->null_mkey); 146 } 147 mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", 148 i, va, be32_to_cpu(pklm->key)); 149 } 150} 151 152static void mr_leaf_free_action(struct work_struct *work) 153{ 154 struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); 155 int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; 156 struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; 157 158 mr->parent = NULL; 159 synchronize_srcu(&mr->dev->mr_srcu); 160 161 if (!READ_ONCE(odp->dying)) { 162 mr->parent = imr; 163 if (atomic_dec_and_test(&imr->num_leaf_free)) 164 wake_up(&imr->q_leaf_free); 165 return; 166 } 167 168 ib_umem_release(odp->umem); 169 if (imr->live) 170 mlx5_ib_update_xlt(imr, idx, 1, 0, 171 MLX5_IB_UPD_XLT_INDIRECT | 172 MLX5_IB_UPD_XLT_ATOMIC); 173 mlx5_mr_cache_free(mr->dev, mr); 174 175 if (atomic_dec_and_test(&imr->num_leaf_free)) 176 wake_up(&imr->q_leaf_free); 177} 178 |
|
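
The new definitions above carve an implicit MR into fixed-size leaves: each leaf MTT MR spans one KSM slot of the parent, and `mlx5_odp_populate_klm()` fills one KLM entry per slot. The following minimal, non-kernel sketch works out that geometry; the 4 KiB page size and the 2^47-byte address-space span are assumptions, not values read from the kernel.

```c
/*
 * Standalone sketch (not driver code): the implicit-MR geometry implied by
 * the MLX5_IMR_MTT_* macros and mlx5_imr_ksm_entries above, assuming 4 KiB
 * pages and a 2^47-byte user address space (both assumptions).
 */
#include <stdint.h>
#include <stdio.h>

#define ASSUMED_PAGE_SHIFT	12			/* 4 KiB pages */
#define IMR_MTT_BITS		(30 - ASSUMED_PAGE_SHIFT)
#define IMR_MTT_SHIFT		(IMR_MTT_BITS + ASSUMED_PAGE_SHIFT)
#define IMR_MTT_ENTRIES		(1ULL << IMR_MTT_BITS)	/* pages per leaf MR */
#define IMR_MTT_SIZE		(1ULL << IMR_MTT_SHIFT)	/* bytes per leaf MR */
#define ASSUMED_TASK_SIZE	(1ULL << 47)		/* x86-64-style VA span */

int main(void)
{
	uint64_t va = 0x7f3a12345678ULL;	/* arbitrary faulting address */

	printf("pages per leaf MTT MR : %llu\n",
	       (unsigned long long)IMR_MTT_ENTRIES);	/* 262144 */
	printf("bytes per leaf MTT MR : %llu (1 GiB)\n",
	       (unsigned long long)IMR_MTT_SIZE);
	/* Index of the parent KSM slot covering this VA, the same index that
	 * mlx5_odp_populate_klm() walks when filling pklm entries. */
	printf("KSM slot for the VA   : %llu\n",
	       (unsigned long long)(va >> IMR_MTT_SHIFT));
	/* Size of the parent's KSM table, mirroring mlx5_imr_ksm_entries. */
	printf("parent KSM entries    : %llu\n",
	       (unsigned long long)(ASSUMED_TASK_SIZE / IMR_MTT_SIZE)); /* 131072 */
	return 0;
}
```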
46void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, 47 unsigned long end) 48{ 49 struct mlx5_ib_mr *mr; | 179void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, 180 unsigned long end) 181{ 182 struct mlx5_ib_mr *mr; |
50 const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; | 183 const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / 184 sizeof(struct mlx5_mtt)) - 1; |
51 u64 idx = 0, blk_start_idx = 0; 52 int in_block = 0; 53 u64 addr; 54 55 if (!umem || !umem->odp_data) { 56 pr_err("invalidation called on NULL umem or non-ODP umem\n"); 57 return; 58 } --- 26 unchanged lines hidden (view full) --- 85 if (!in_block) { 86 blk_start_idx = idx; 87 in_block = 1; 88 } 89 } else { 90 u64 umr_offset = idx & umr_block_mask; 91 92 if (in_block && umr_offset == 0) { | 185 u64 idx = 0, blk_start_idx = 0; 186 int in_block = 0; 187 u64 addr; 188 189 if (!umem || !umem->odp_data) { 190 pr_err("invalidation called on NULL umem or non-ODP umem\n"); 191 return; 192 } --- 26 unchanged lines hidden (view full) --- 219 if (!in_block) { 220 blk_start_idx = idx; 221 in_block = 1; 222 } 223 } else { 224 u64 umr_offset = idx & umr_block_mask; 225 226 if (in_block && umr_offset == 0) { |
93 mlx5_ib_update_mtt(mr, blk_start_idx, 94 idx - blk_start_idx, 1); | 227 mlx5_ib_update_xlt(mr, blk_start_idx, 228 idx - blk_start_idx, 229 PAGE_SHIFT, 230 MLX5_IB_UPD_XLT_ZAP | 231 MLX5_IB_UPD_XLT_ATOMIC); |
95 in_block = 0; 96 } 97 } 98 } 99 if (in_block) | 232 in_block = 0; 233 } 234 } 235 } 236 if (in_block) |
100 mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, 101 1); 102 | 237 mlx5_ib_update_xlt(mr, blk_start_idx, 238 idx - blk_start_idx + 1, 239 PAGE_SHIFT, 240 MLX5_IB_UPD_XLT_ZAP | 241 MLX5_IB_UPD_XLT_ATOMIC); |
103 /* 104 * We are now sure that the device will not access the 105 * memory. We can safely unmap it, and mark it as dirty if 106 * needed. 107 */ 108 109 ib_umem_odp_unmap_dma_pages(umem, start, end); | 242 /* 243 * We are now sure that the device will not access the 244 * memory. We can safely unmap it, and mark it as dirty if 245 * needed. 246 */ 247 248 ib_umem_odp_unmap_dma_pages(umem, start, end); |
249 250 if (unlikely(!umem->npages && mr->parent && 251 !umem->odp_data->dying)) { 252 WRITE_ONCE(umem->odp_data->dying, 1); 253 atomic_inc(&mr->parent->num_leaf_free); 254 schedule_work(&umem->odp_data->work); 255 } |
|
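
The `umr_block_mask` used in the invalidation path above depends only on two constants. A minimal sketch of the arithmetic, assuming `MLX5_UMR_MTT_ALIGNMENT` is 0x40 and `struct mlx5_mtt` is a single `__be64` (8 bytes), as in the mlx5 headers of this period:

```c
/* Illustrative arithmetic only; both constants are assumptions. */
#include <stdio.h>

int main(void)
{
	unsigned long long umr_mtt_alignment = 0x40;	/* assumed MLX5_UMR_MTT_ALIGNMENT */
	unsigned long long mtt_entry_size = 8;		/* assumed sizeof(struct mlx5_mtt) */
	unsigned long long umr_block_mask =
		umr_mtt_alignment / mtt_entry_size - 1;	/* 7 */

	/* Invalidations are therefore flushed in aligned groups of 8 page
	 * entries, keeping each UMR translation update 64-byte aligned. */
	printf("umr_block_mask = %llu -> groups of %llu entries\n",
	       umr_block_mask, umr_block_mask + 1);
	return 0;
}
```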
110} 111 112void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) 113{ 114 struct ib_odp_caps *caps = &dev->odp_caps; 115 116 memset(caps, 0, sizeof(*caps)); 117 118 if (!MLX5_CAP_GEN(dev->mdev, pg)) 119 return; 120 121 caps->general_caps = IB_ODP_SUPPORT; 122 | 256} 257 258void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) 259{ 260 struct ib_odp_caps *caps = &dev->odp_caps; 261 262 memset(caps, 0, sizeof(*caps)); 263 264 if (!MLX5_CAP_GEN(dev->mdev, pg)) 265 return; 266 267 caps->general_caps = IB_ODP_SUPPORT; 268 |
269 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 270 dev->odp_max_size = U64_MAX; 271 else 272 dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); 273 |
|
123 if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) 124 caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; 125 126 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) 127 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; 128 129 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) 130 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; 131 132 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) 133 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; 134 135 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) 136 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; 137 | 274 if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) 275 caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; 276 277 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) 278 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; 279 280 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) 281 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; 282 283 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) 284 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; 285 286 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) 287 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; 288 |
289 if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) 290 caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; 291 292 if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && 293 MLX5_CAP_GEN(dev->mdev, null_mkey) && 294 MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 295 caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; 296 |
|
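
The capability bits assembled in mlx5_ib_internal_fill_odp_caps() are what applications eventually see through the extended device query. A hedged libibverbs sketch (rdma-core API; the particular flags checked here are just an example, not the driver's requirement):

```c
#include <infiniband/verbs.h>

/* Returns 1 if the device advertises ODP with RC RDMA read/write support. */
static int check_odp_caps(struct ibv_context *ctx)
{
	struct ibv_device_attr_ex attr;

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return 0;

	if (!(attr.odp_caps.general_caps & IBV_ODP_SUPPORT))
		return 0;

	return (attr.odp_caps.per_transport_caps.rc_odp_caps &
		(IBV_ODP_SUPPORT_WRITE | IBV_ODP_SUPPORT_READ)) ==
	       (IBV_ODP_SUPPORT_WRITE | IBV_ODP_SUPPORT_READ);
}
```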
138 return; 139} 140 141static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, 142 u32 key) 143{ 144 u32 base_key = mlx5_base_mkey(key); 145 struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key); | 297 return; 298} 299 300static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, 301 u32 key) 302{ 303 u32 base_key = mlx5_base_mkey(key); 304 struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key); |
146 struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); | 305 struct mlx5_ib_mr *mr; |
147 | 306 |
148 if (!mmkey || mmkey->key != key || !mr->live) | 307 if (!mmkey || mmkey->key != key || mmkey->type != MLX5_MKEY_MR) |
149 return NULL; 150 | 308 return NULL; 309 |
310 mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); 311 312 if (!mr->live) 313 return NULL; 314 |
|
151 return container_of(mmkey, struct mlx5_ib_mr, mmkey); 152} 153 | 315 return container_of(mmkey, struct mlx5_ib_mr, mmkey); 316} 317 |
154static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, 155 struct mlx5_ib_pfault *pfault, | 318static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, 319 struct mlx5_pagefault *pfault, |
156 int error) 157{ | 320 int error) 321{ |
158 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); 159 u32 qpn = qp->trans_qp.base.mqp.qpn; | 322 int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? 323 pfault->wqe.wq_num : pfault->token; |
160 int ret = mlx5_core_page_fault_resume(dev->mdev, | 324 int ret = mlx5_core_page_fault_resume(dev->mdev, |
161 qpn, 162 pfault->mpfault.flags, | 325 pfault->token, 326 wq_num, 327 pfault->type, |
163 error); 164 if (ret) | 328 error); 329 if (ret) |
165 pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn); | 330 mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", 331 wq_num); |
166} 167 | 332} 333 |
334static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, 335 struct ib_umem *umem, 336 bool ksm, int access_flags) 337{ 338 struct mlx5_ib_dev *dev = to_mdev(pd->device); 339 struct mlx5_ib_mr *mr; 340 int err; 341 342 mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : 343 MLX5_IMR_MTT_CACHE_ENTRY); 344 345 if (IS_ERR(mr)) 346 return mr; 347 348 mr->ibmr.pd = pd; 349 350 mr->dev = dev; 351 mr->access_flags = access_flags; 352 mr->mmkey.iova = 0; 353 mr->umem = umem; 354 355 if (ksm) { 356 err = mlx5_ib_update_xlt(mr, 0, 357 mlx5_imr_ksm_entries, 358 MLX5_KSM_PAGE_SHIFT, 359 MLX5_IB_UPD_XLT_INDIRECT | 360 MLX5_IB_UPD_XLT_ZAP | 361 MLX5_IB_UPD_XLT_ENABLE); 362 363 } else { 364 err = mlx5_ib_update_xlt(mr, 0, 365 MLX5_IMR_MTT_ENTRIES, 366 PAGE_SHIFT, 367 MLX5_IB_UPD_XLT_ZAP | 368 MLX5_IB_UPD_XLT_ENABLE | 369 MLX5_IB_UPD_XLT_ATOMIC); 370 } 371 372 if (err) 373 goto fail; 374 375 mr->ibmr.lkey = mr->mmkey.key; 376 mr->ibmr.rkey = mr->mmkey.key; 377 378 mr->live = 1; 379 380 mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", 381 mr->mmkey.key, dev->mdev, mr); 382 383 return mr; 384 385fail: 386 mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); 387 mlx5_mr_cache_free(dev, mr); 388 389 return ERR_PTR(err); 390} 391 392static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, 393 u64 io_virt, size_t bcnt) 394{ 395 struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; 396 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); 397 struct ib_umem_odp *odp, *result = NULL; 398 u64 addr = io_virt & MLX5_IMR_MTT_MASK; 399 int nentries = 0, start_idx = 0, ret; 400 struct mlx5_ib_mr *mtt; 401 struct ib_umem *umem; 402 403 mutex_lock(&mr->umem->odp_data->umem_mutex); 404 odp = odp_lookup(ctx, addr, 1, mr); 405 406 mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", 407 io_virt, bcnt, addr, odp); 408 409next_mr: 410 if (likely(odp)) { 411 if (nentries) 412 nentries++; 413 } else { 414 umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); 415 if (IS_ERR(umem)) { 416 mutex_unlock(&mr->umem->odp_data->umem_mutex); 417 return ERR_CAST(umem); 418 } 419 420 mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); 421 if (IS_ERR(mtt)) { 422 mutex_unlock(&mr->umem->odp_data->umem_mutex); 423 ib_umem_release(umem); 424 return ERR_CAST(mtt); 425 } 426 427 odp = umem->odp_data; 428 odp->private = mtt; 429 mtt->umem = umem; 430 mtt->mmkey.iova = addr; 431 mtt->parent = mr; 432 INIT_WORK(&odp->work, mr_leaf_free_action); 433 434 if (!nentries) 435 start_idx = addr >> MLX5_IMR_MTT_SHIFT; 436 nentries++; 437 } 438 439 odp->dying = 0; 440 441 /* Return first odp if region not covered by single one */ 442 if (likely(!result)) 443 result = odp; 444 445 addr += MLX5_IMR_MTT_SIZE; 446 if (unlikely(addr < io_virt + bcnt)) { 447 odp = odp_next(odp); 448 if (odp && odp->umem->address != addr) 449 odp = NULL; 450 goto next_mr; 451 } 452 453 if (unlikely(nentries)) { 454 ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, 455 MLX5_IB_UPD_XLT_INDIRECT | 456 MLX5_IB_UPD_XLT_ATOMIC); 457 if (ret) { 458 mlx5_ib_err(dev, "Failed to update PAS\n"); 459 result = ERR_PTR(ret); 460 } 461 } 462 463 mutex_unlock(&mr->umem->odp_data->umem_mutex); 464 return result; 465} 466 467struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, 468 int access_flags) 469{ 470 struct ib_ucontext *ctx = pd->ibpd.uobject->context; 471 struct mlx5_ib_mr *imr; 472 struct ib_umem *umem; 473 474 umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0); 475 if (IS_ERR(umem)) 476 return ERR_CAST(umem); 477 
478 imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); 479 if (IS_ERR(imr)) { 480 ib_umem_release(umem); 481 return ERR_CAST(imr); 482 } 483 484 imr->umem = umem; 485 init_waitqueue_head(&imr->q_leaf_free); 486 atomic_set(&imr->num_leaf_free, 0); 487 488 return imr; 489} 490 491static int mr_leaf_free(struct ib_umem *umem, u64 start, 492 u64 end, void *cookie) 493{ 494 struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; 495 496 if (mr->parent != imr) 497 return 0; 498 499 ib_umem_odp_unmap_dma_pages(umem, 500 ib_umem_start(umem), 501 ib_umem_end(umem)); 502 503 if (umem->odp_data->dying) 504 return 0; 505 506 WRITE_ONCE(umem->odp_data->dying, 1); 507 atomic_inc(&imr->num_leaf_free); 508 schedule_work(&umem->odp_data->work); 509 510 return 0; 511} 512 513void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) 514{ 515 struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; 516 517 down_read(&ctx->umem_rwsem); 518 rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, 519 mr_leaf_free, imr); 520 up_read(&ctx->umem_rwsem); 521 522 wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); 523} 524 |
|
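
An implicit MR like the one built by mlx5_ib_alloc_implicit_mr() above is normally requested from userspace by registering the whole address space on demand. A hedged libibverbs sketch, assuming the conventional rdma-core trigger of addr = NULL and length = SIZE_MAX with IBV_ACCESS_ON_DEMAND; the access-flag combination is illustrative:

```c
/*
 * Userspace sketch (not part of this driver): request an implicit ODP MR.
 * Pages are then faulted in on demand by the handlers in this file.
 */
#include <infiniband/verbs.h>
#include <stdint.h>
#include <stdio.h>

static struct ibv_mr *reg_implicit_odp_mr(struct ibv_pd *pd)
{
	/* addr == NULL and length == SIZE_MAX ask for an implicit MR that
	 * covers the whole process address space. */
	struct ibv_mr *mr = ibv_reg_mr(pd, NULL, SIZE_MAX,
				       IBV_ACCESS_ON_DEMAND |
				       IBV_ACCESS_LOCAL_WRITE);
	if (!mr)
		perror("ibv_reg_mr(implicit ODP)");
	return mr;
}
```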
168/* | 525/* |
169 * Handle a single data segment in a page-fault WQE. | 526 * Handle a single data segment in a page-fault WQE or RDMA region. |
170 * | 527 * |
171 * Returns number of pages retrieved on success. The caller will continue to | 528 * Returns number of pages retrieved on success. The caller may continue to |
172 * the next data segment. 173 * Can return the following error codes: 174 * -EAGAIN to designate a temporary error. The caller will abort handling the 175 * page fault and resolve it. 176 * -EFAULT when there's an error mapping the requested pages. The caller will | 529 * the next data segment. 530 * Can return the following error codes: 531 * -EAGAIN to designate a temporary error. The caller will abort handling the 532 * page fault and resolve it. 533 * -EFAULT when there's an error mapping the requested pages. The caller will |
177 * abort the page fault handling and possibly move the QP to an error state. 178 * On other errors the QP should also be closed with an error. | 534 * abort the page fault handling. |
179 */ | 535 */ |
180static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, 181 struct mlx5_ib_pfault *pfault, | 536static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, |
182 u32 key, u64 io_virt, size_t bcnt, | 537 u32 key, u64 io_virt, size_t bcnt, |
538 u32 *bytes_committed, |
|
183 u32 *bytes_mapped) 184{ | 539 u32 *bytes_mapped) 540{ |
185 struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); | |
186 int srcu_key; | 541 int srcu_key; |
187 unsigned int current_seq; | 542 unsigned int current_seq = 0; |
188 u64 start_idx; 189 int npages = 0, ret = 0; 190 struct mlx5_ib_mr *mr; 191 u64 access_mask = ODP_READ_ALLOWED_BIT; | 543 u64 start_idx; 544 int npages = 0, ret = 0; 545 struct mlx5_ib_mr *mr; 546 u64 access_mask = ODP_READ_ALLOWED_BIT; |
547 struct ib_umem_odp *odp; 548 int implicit = 0; 549 size_t size; |
|
192 | 550 |
193 srcu_key = srcu_read_lock(&mib_dev->mr_srcu); 194 mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); | 551 srcu_key = srcu_read_lock(&dev->mr_srcu); 552 mr = mlx5_ib_odp_find_mr_lkey(dev, key); |
195 /* 196 * If we didn't find the MR, it means the MR was closed while we were 197 * handling the ODP event. In this case we return -EFAULT so that the 198 * QP will be closed. 199 */ 200 if (!mr || !mr->ibmr.pd) { | 553 /* 554 * If we didn't find the MR, it means the MR was closed while we were 555 * handling the ODP event. In this case we return -EFAULT so that the 556 * QP will be closed. 557 */ 558 if (!mr || !mr->ibmr.pd) { |
201 pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", 202 key); | 559 mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", 560 key); |
203 ret = -EFAULT; 204 goto srcu_unlock; 205 } 206 if (!mr->umem->odp_data) { | 561 ret = -EFAULT; 562 goto srcu_unlock; 563 } 564 if (!mr->umem->odp_data) { |
207 pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", 208 key); | 565 mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", 566 key); |
209 if (bytes_mapped) 210 *bytes_mapped += | 567 if (bytes_mapped) 568 *bytes_mapped += |
211 (bcnt - pfault->mpfault.bytes_committed); | 569 (bcnt - *bytes_committed); |
212 goto srcu_unlock; 213 } | 570 goto srcu_unlock; 571 } |
214 if (mr->ibmr.pd != qp->ibqp.pd) { 215 pr_err("Page-fault with different PDs for QP and MR.\n"); 216 ret = -EFAULT; 217 goto srcu_unlock; | 572 573 /* 574 * Avoid branches - this code will perform correctly 575 * in all iterations (in iteration 2 and above, 576 * bytes_committed == 0). 577 */ 578 io_virt += *bytes_committed; 579 bcnt -= *bytes_committed; 580 581 if (!mr->umem->odp_data->page_list) { 582 odp = implicit_mr_get_data(mr, io_virt, bcnt); 583 584 if (IS_ERR(odp)) { 585 ret = PTR_ERR(odp); 586 goto srcu_unlock; 587 } 588 mr = odp->private; 589 implicit = 1; 590 591 } else { 592 odp = mr->umem->odp_data; |
218 } 219 | 593 } 594 |
220 current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); | 595next_mr: 596 current_seq = READ_ONCE(odp->notifiers_seq); |
221 /* 222 * Ensure the sequence number is valid for some time before we call 223 * gup. 224 */ 225 smp_rmb(); 226 | 597 /* 598 * Ensure the sequence number is valid for some time before we call 599 * gup. 600 */ 601 smp_rmb(); 602 |
227 /* 228 * Avoid branches - this code will perform correctly 229 * in all iterations (in iteration 2 and above, 230 * bytes_committed == 0). 231 */ 232 io_virt += pfault->mpfault.bytes_committed; 233 bcnt -= pfault->mpfault.bytes_committed; 234 | 603 size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); |
235 start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; 236 237 if (mr->umem->writable) 238 access_mask |= ODP_WRITE_ALLOWED_BIT; | 604 start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; 605 606 if (mr->umem->writable) 607 access_mask |= ODP_WRITE_ALLOWED_BIT; |
239 npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, 240 access_mask, current_seq); 241 if (npages < 0) { 242 ret = npages; | 608 609 ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, 610 access_mask, current_seq); 611 612 if (ret < 0) |
243 goto srcu_unlock; | 613 goto srcu_unlock; |
244 } | |
245 | 614 |
246 if (npages > 0) { 247 mutex_lock(&mr->umem->odp_data->umem_mutex); | 615 if (ret > 0) { 616 int np = ret; 617 618 mutex_lock(&odp->umem_mutex); |
248 if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { 249 /* 250 * No need to check whether the MTTs really belong to 251 * this MR, since ib_umem_odp_map_dma_pages already 252 * checks this. 253 */ | 619 if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { 620 /* 621 * No need to check whether the MTTs really belong to 622 * this MR, since ib_umem_odp_map_dma_pages already 623 * checks this. 624 */ |
254 ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); | 625 ret = mlx5_ib_update_xlt(mr, start_idx, np, 626 PAGE_SHIFT, 627 MLX5_IB_UPD_XLT_ATOMIC); |
255 } else { 256 ret = -EAGAIN; 257 } | 628 } else { 629 ret = -EAGAIN; 630 } |
258 mutex_unlock(&mr->umem->odp_data->umem_mutex); | 631 mutex_unlock(&odp->umem_mutex); |
259 if (ret < 0) { 260 if (ret != -EAGAIN) | 632 if (ret < 0) { 633 if (ret != -EAGAIN) |
261 pr_err("Failed to update mkey page tables\n"); | 634 mlx5_ib_err(dev, "Failed to update mkey page tables\n"); |
262 goto srcu_unlock; 263 } 264 265 if (bytes_mapped) { | 635 goto srcu_unlock; 636 } 637 638 if (bytes_mapped) { |
266 u32 new_mappings = npages * PAGE_SIZE - | 639 u32 new_mappings = np * PAGE_SIZE - |
267 (io_virt - round_down(io_virt, PAGE_SIZE)); | 640 (io_virt - round_down(io_virt, PAGE_SIZE)); |
268 *bytes_mapped += min_t(u32, new_mappings, bcnt); | 641 *bytes_mapped += min_t(u32, new_mappings, size); |
269 } | 642 } |
643 644 npages += np; |
|
270 } 271 | 645 } 646 |
647 bcnt -= size; 648 if (unlikely(bcnt)) { 649 struct ib_umem_odp *next; 650 651 io_virt += size; 652 next = odp_next(odp); 653 if (unlikely(!next || next->umem->address != io_virt)) { 654 mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", 655 io_virt, next); 656 ret = -EAGAIN; 657 goto srcu_unlock_no_wait; 658 } 659 odp = next; 660 mr = odp->private; 661 goto next_mr; 662 } 663 |
|
272srcu_unlock: 273 if (ret == -EAGAIN) { | 664srcu_unlock: 665 if (ret == -EAGAIN) { |
274 if (!mr->umem->odp_data->dying) { 275 struct ib_umem_odp *odp_data = mr->umem->odp_data; | 666 if (implicit || !odp->dying) { |
276 unsigned long timeout = 277 msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); 278 279 if (!wait_for_completion_timeout( | 667 unsigned long timeout = 668 msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); 669 670 if (!wait_for_completion_timeout( |
280 &odp_data->notifier_completion, | 671 &odp->notifier_completion, |
281 timeout)) { | 672 timeout)) { |
282 pr_warn("timeout waiting for mmu notifier completion\n"); | 673 mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n", 674 current_seq, odp->notifiers_seq); |
283 } 284 } else { 285 /* The MR is being killed, kill the QP as well. */ 286 ret = -EFAULT; 287 } 288 } | 675 } 676 } else { 677 /* The MR is being killed, kill the QP as well. */ 678 ret = -EFAULT; 679 } 680 } |
289 srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); 290 pfault->mpfault.bytes_committed = 0; | 681 682srcu_unlock_no_wait: 683 srcu_read_unlock(&dev->mr_srcu, srcu_key); 684 *bytes_committed = 0; |
291 return ret ? ret : npages; 292} 293 294/** 295 * Parse a series of data segments for page fault handling. 296 * 297 * @qp the QP on which the fault occurred. 298 * @pfault contains page fault information. --- 5 unchanged lines hidden (view full) --- 304 * successfully (e.g. enough for the next MTU, or the entire 305 * WQE). 306 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus 307 * the committed bytes). 308 * 309 * Returns the number of pages loaded if positive, zero for an empty WQE, or a 310 * negative error code. 311 */ | 685 return ret ? ret : npages; 686} 687 688/** 689 * Parse a series of data segments for page fault handling. 690 * 691 * @qp the QP on which the fault occurred. 692 * @pfault contains page fault information. --- 5 unchanged lines hidden (view full) --- 698 * successfully (e.g. enough for the next MTU, or the entire 699 * WQE). 700 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus 701 * the committed bytes). 702 * 703 * Returns the number of pages loaded if positive, zero for an empty WQE, or a 704 * negative error code. 705 */ |
312static int pagefault_data_segments(struct mlx5_ib_qp *qp, 313 struct mlx5_ib_pfault *pfault, void *wqe, | 706static int pagefault_data_segments(struct mlx5_ib_dev *dev, 707 struct mlx5_pagefault *pfault, 708 struct mlx5_ib_qp *qp, void *wqe, |
314 void *wqe_end, u32 *bytes_mapped, 315 u32 *total_wqe_bytes, int receive_queue) 316{ 317 int ret = 0, npages = 0; 318 u64 io_virt; 319 u32 key; 320 u32 byte_count; 321 size_t bcnt; --- 27 unchanged lines hidden (view full) --- 349 350 /* receive WQE end of sg list. */ 351 if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && 352 io_virt == 0) 353 break; 354 355 if (!inline_segment && total_wqe_bytes) { 356 *total_wqe_bytes += bcnt - min_t(size_t, bcnt, | 709 void *wqe_end, u32 *bytes_mapped, 710 u32 *total_wqe_bytes, int receive_queue) 711{ 712 int ret = 0, npages = 0; 713 u64 io_virt; 714 u32 key; 715 u32 byte_count; 716 size_t bcnt; --- 27 unchanged lines hidden (view full) --- 744 745 /* receive WQE end of sg list. */ 746 if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && 747 io_virt == 0) 748 break; 749 750 if (!inline_segment && total_wqe_bytes) { 751 *total_wqe_bytes += bcnt - min_t(size_t, bcnt, |
357 pfault->mpfault.bytes_committed); | 752 pfault->bytes_committed); |
358 } 359 360 /* A zero length data segment designates a length of 2GB. */ 361 if (bcnt == 0) 362 bcnt = 1U << 31; 363 | 753 } 754 755 /* A zero length data segment designates a length of 2GB. */ 756 if (bcnt == 0) 757 bcnt = 1U << 31; 758 |
364 if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { 365 pfault->mpfault.bytes_committed -= | 759 if (inline_segment || bcnt <= pfault->bytes_committed) { 760 pfault->bytes_committed -= |
366 min_t(size_t, bcnt, | 761 min_t(size_t, bcnt, |
367 pfault->mpfault.bytes_committed); | 762 pfault->bytes_committed); |
368 continue; 369 } 370 | 763 continue; 764 } 765 |
371 ret = pagefault_single_data_segment(qp, pfault, key, io_virt, 372 bcnt, bytes_mapped); | 766 ret = pagefault_single_data_segment(dev, key, io_virt, bcnt, 767 &pfault->bytes_committed, 768 bytes_mapped); |
373 if (ret < 0) 374 break; 375 npages += ret; 376 } 377 378 return ret < 0 ? ret : npages; 379} 380 | 769 if (ret < 0) 770 break; 771 npages += ret; 772 } 773 774 return ret < 0 ? ret : npages; 775} 776 |
777static const u32 mlx5_ib_odp_opcode_cap[] = { 778 [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND, 779 [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND, 780 [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND, 781 [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE, 782 [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE, 783 [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ, 784 [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC, 785 [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC, 786}; 787 |
|
381/* 382 * Parse initiator WQE. Advances the wqe pointer to point at the 383 * scatter-gather list, and set wqe_end to the end of the WQE. 384 */ 385static int mlx5_ib_mr_initiator_pfault_handler( | 788/* 789 * Parse initiator WQE. Advances the wqe pointer to point at the 790 * scatter-gather list, and set wqe_end to the end of the WQE. 791 */ 792static int mlx5_ib_mr_initiator_pfault_handler( |
386 struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, 387 void **wqe, void **wqe_end, int wqe_length) | 793 struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, 794 struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) |
388{ | 795{ |
389 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | |
390 struct mlx5_wqe_ctrl_seg *ctrl = *wqe; | 796 struct mlx5_wqe_ctrl_seg *ctrl = *wqe; |
391 u16 wqe_index = pfault->mpfault.wqe.wqe_index; | 797 u16 wqe_index = pfault->wqe.wqe_index; 798 u32 transport_caps; 799 struct mlx5_base_av *av; |
392 unsigned ds, opcode; 393#if defined(DEBUG) 394 u32 ctrl_wqe_index, ctrl_qpn; 395#endif 396 u32 qpn = qp->trans_qp.base.mqp.qpn; 397 398 ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; 399 if (ds * MLX5_WQE_DS_UNITS > wqe_length) { --- 29 unchanged lines hidden (view full) --- 429 } 430#endif /* DEBUG */ 431 432 *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; 433 *wqe += sizeof(*ctrl); 434 435 opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & 436 MLX5_WQE_CTRL_OPCODE_MASK; | 800 unsigned ds, opcode; 801#if defined(DEBUG) 802 u32 ctrl_wqe_index, ctrl_qpn; 803#endif 804 u32 qpn = qp->trans_qp.base.mqp.qpn; 805 806 ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; 807 if (ds * MLX5_WQE_DS_UNITS > wqe_length) { --- 29 unchanged lines hidden (view full) --- 837 } 838#endif /* DEBUG */ 839 840 *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; 841 *wqe += sizeof(*ctrl); 842 843 opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & 844 MLX5_WQE_CTRL_OPCODE_MASK; |
845 |
|
437 switch (qp->ibqp.qp_type) { 438 case IB_QPT_RC: | 846 switch (qp->ibqp.qp_type) { 847 case IB_QPT_RC: |
439 switch (opcode) { 440 case MLX5_OPCODE_SEND: 441 case MLX5_OPCODE_SEND_IMM: 442 case MLX5_OPCODE_SEND_INVAL: 443 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & 444 IB_ODP_SUPPORT_SEND)) 445 goto invalid_transport_or_opcode; 446 break; 447 case MLX5_OPCODE_RDMA_WRITE: 448 case MLX5_OPCODE_RDMA_WRITE_IMM: 449 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & 450 IB_ODP_SUPPORT_WRITE)) 451 goto invalid_transport_or_opcode; 452 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 453 break; 454 case MLX5_OPCODE_RDMA_READ: 455 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & 456 IB_ODP_SUPPORT_READ)) 457 goto invalid_transport_or_opcode; 458 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 459 break; 460 default: 461 goto invalid_transport_or_opcode; 462 } | 848 transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps; |
463 break; 464 case IB_QPT_UD: | 849 break; 850 case IB_QPT_UD: |
465 switch (opcode) { 466 case MLX5_OPCODE_SEND: 467 case MLX5_OPCODE_SEND_IMM: 468 if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & 469 IB_ODP_SUPPORT_SEND)) 470 goto invalid_transport_or_opcode; 471 *wqe += sizeof(struct mlx5_wqe_datagram_seg); 472 break; 473 default: 474 goto invalid_transport_or_opcode; 475 } | 851 transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps; |
476 break; 477 default: | 852 break; 853 default: |
478invalid_transport_or_opcode: 479 mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", 480 qp->ibqp.qp_type, opcode); | 854 mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n", 855 qp->ibqp.qp_type); |
481 return -EFAULT; 482 } 483 | 856 return -EFAULT; 857 } 858 |
859 if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) / 860 sizeof(mlx5_ib_odp_opcode_cap[0]) || 861 !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) { 862 mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n", 863 opcode); 864 return -EFAULT; 865 } 866 867 if (qp->ibqp.qp_type != IB_QPT_RC) { 868 av = *wqe; 869 if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) 870 *wqe += sizeof(struct mlx5_av); 871 else 872 *wqe += sizeof(struct mlx5_base_av); 873 } 874 875 switch (opcode) { 876 case MLX5_OPCODE_RDMA_WRITE: 877 case MLX5_OPCODE_RDMA_WRITE_IMM: 878 case MLX5_OPCODE_RDMA_READ: 879 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 880 break; 881 case MLX5_OPCODE_ATOMIC_CS: 882 case MLX5_OPCODE_ATOMIC_FA: 883 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 884 *wqe += sizeof(struct mlx5_wqe_atomic_seg); 885 break; 886 } 887 |
|
484 return 0; 485} 486 487/* 488 * Parse responder WQE. Advances the wqe pointer to point at the 489 * scatter-gather list, and set wqe_end to the end of the WQE. 490 */ 491static int mlx5_ib_mr_responder_pfault_handler( | 888 return 0; 889} 890 891/* 892 * Parse responder WQE. Advances the wqe pointer to point at the 893 * scatter-gather list, and set wqe_end to the end of the WQE. 894 */ 895static int mlx5_ib_mr_responder_pfault_handler( |
492 struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, 493 void **wqe, void **wqe_end, int wqe_length) | 896 struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, 897 struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) |
494{ | 898{ |
495 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | |
496 struct mlx5_ib_wq *wq = &qp->rq; 497 int wqe_size = 1 << wq->wqe_shift; 498 499 if (qp->ibqp.srq) { 500 mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); 501 return -EFAULT; 502 } 503 --- 20 unchanged lines hidden (view full) --- 524 return -EFAULT; 525 } 526 527 *wqe_end = *wqe + wqe_size; 528 529 return 0; 530} 531 | 899 struct mlx5_ib_wq *wq = &qp->rq; 900 int wqe_size = 1 << wq->wqe_shift; 901 902 if (qp->ibqp.srq) { 903 mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); 904 return -EFAULT; 905 } 906 --- 20 unchanged lines hidden (view full) --- 927 return -EFAULT; 928 } 929 930 *wqe_end = *wqe + wqe_size; 931 932 return 0; 933} 934 |
532static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, 533 struct mlx5_ib_pfault *pfault) | 935static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev, 936 u32 wq_num) |
534{ | 937{ |
535 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | 938 struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num); 939 940 if (!mqp) { 941 mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num); 942 return NULL; 943 } 944 945 return to_mibqp(mqp); 946} 947 948static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, 949 struct mlx5_pagefault *pfault) 950{ |
536 int ret; 537 void *wqe, *wqe_end; 538 u32 bytes_mapped, total_wqe_bytes; 539 char *buffer = NULL; | 951 int ret; 952 void *wqe, *wqe_end; 953 u32 bytes_mapped, total_wqe_bytes; 954 char *buffer = NULL; |
540 int resume_with_error = 0; 541 u16 wqe_index = pfault->mpfault.wqe.wqe_index; 542 int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; 543 u32 qpn = qp->trans_qp.base.mqp.qpn; | 955 int resume_with_error = 1; 956 u16 wqe_index = pfault->wqe.wqe_index; 957 int requestor = pfault->type & MLX5_PFAULT_REQUESTOR; 958 struct mlx5_ib_qp *qp; |
544 545 buffer = (char *)__get_free_page(GFP_KERNEL); 546 if (!buffer) { 547 mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); | 959 960 buffer = (char *)__get_free_page(GFP_KERNEL); 961 if (!buffer) { 962 mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); |
548 resume_with_error = 1; | |
549 goto resolve_page_fault; 550 } 551 | 963 goto resolve_page_fault; 964 } 965 |
966 qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num); 967 if (!qp) 968 goto resolve_page_fault; 969 |
|
552 ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, 553 PAGE_SIZE, &qp->trans_qp.base); 554 if (ret < 0) { | 970 ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, 971 PAGE_SIZE, &qp->trans_qp.base); 972 if (ret < 0) { |
555 mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", 556 -ret, wqe_index, qpn); 557 resume_with_error = 1; | 973 mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n", 974 ret, wqe_index, pfault->token); |
558 goto resolve_page_fault; 559 } 560 561 wqe = buffer; 562 if (requestor) | 975 goto resolve_page_fault; 976 } 977 978 wqe = buffer; 979 if (requestor) |
563 ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, | 980 ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe, |
564 &wqe_end, ret); 565 else | 981 &wqe_end, ret); 982 else |
566 ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, | 983 ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe, |
567 &wqe_end, ret); | 984 &wqe_end, ret); |
568 if (ret < 0) { 569 resume_with_error = 1; | 985 if (ret < 0) |
570 goto resolve_page_fault; | 986 goto resolve_page_fault; |
571 } | |
572 573 if (wqe >= wqe_end) { 574 mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); | 987 988 if (wqe >= wqe_end) { 989 mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); |
575 resume_with_error = 1; | |
576 goto resolve_page_fault; 577 } 578 | 990 goto resolve_page_fault; 991 } 992 |
579 ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, 580 &total_wqe_bytes, !requestor); | 993 ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end, 994 &bytes_mapped, &total_wqe_bytes, 995 !requestor); |
581 if (ret == -EAGAIN) { | 996 if (ret == -EAGAIN) { |
997 resume_with_error = 0; |
|
582 goto resolve_page_fault; 583 } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { | 998 goto resolve_page_fault; 999 } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { |
584 mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", 585 -ret); 586 resume_with_error = 1; | 1000 if (ret != -ENOENT) 1001 mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n", 1002 ret, pfault->wqe.wq_num, pfault->type); |
587 goto resolve_page_fault; 588 } 589 | 1003 goto resolve_page_fault; 1004 } 1005 |
1006 resume_with_error = 0; |
|
590resolve_page_fault: | 1007resolve_page_fault: |
591 mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); 592 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", 593 qpn, resume_with_error, 594 pfault->mpfault.flags); 595 | 1008 mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); 1009 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", 1010 pfault->wqe.wq_num, resume_with_error, 1011 pfault->type); |
596 free_page((unsigned long)buffer); 597} 598 599static int pages_in_range(u64 address, u32 length) 600{ 601 return (ALIGN(address + length, PAGE_SIZE) - 602 (address & PAGE_MASK)) >> PAGE_SHIFT; 603} 604 | 1012 free_page((unsigned long)buffer); 1013} 1014 1015static int pages_in_range(u64 address, u32 length) 1016{ 1017 return (ALIGN(address + length, PAGE_SIZE) - 1018 (address & PAGE_MASK)) >> PAGE_SHIFT; 1019} 1020 |
605static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, 606 struct mlx5_ib_pfault *pfault) | 1021static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, 1022 struct mlx5_pagefault *pfault) |
607{ | 1023{ |
608 struct mlx5_pagefault *mpfault = &pfault->mpfault; | |
609 u64 address; 610 u32 length; | 1024 u64 address; 1025 u32 length; |
611 u32 prefetch_len = mpfault->bytes_committed; | 1026 u32 prefetch_len = pfault->bytes_committed; |
612 int prefetch_activated = 0; | 1027 int prefetch_activated = 0; |
613 u32 rkey = mpfault->rdma.r_key; | 1028 u32 rkey = pfault->rdma.r_key; |
614 int ret; 615 616 /* The RDMA responder handler handles the page fault in two parts. 617 * First it brings the necessary pages for the current packet 618 * (and uses the pfault context), and then (after resuming the QP) 619 * prefetches more pages. The second operation cannot use the pfault 620 * context and therefore uses the dummy_pfault context allocated on 621 * the stack */ | 1029 int ret; 1030 1031 /* The RDMA responder handler handles the page fault in two parts. 1032 * First it brings the necessary pages for the current packet 1033 * (and uses the pfault context), and then (after resuming the QP) 1034 * prefetches more pages. The second operation cannot use the pfault 1035 * context and therefore uses the dummy_pfault context allocated on 1036 * the stack */ |
622 struct mlx5_ib_pfault dummy_pfault = {}; | 1037 pfault->rdma.rdma_va += pfault->bytes_committed; 1038 pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, 1039 pfault->rdma.rdma_op_len); 1040 pfault->bytes_committed = 0; |
623 | 1041 |
624 dummy_pfault.mpfault.bytes_committed = 0; | 1042 address = pfault->rdma.rdma_va; 1043 length = pfault->rdma.rdma_op_len; |
625 | 1044 |
626 mpfault->rdma.rdma_va += mpfault->bytes_committed; 627 mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, 628 mpfault->rdma.rdma_op_len); 629 mpfault->bytes_committed = 0; 630 631 address = mpfault->rdma.rdma_va; 632 length = mpfault->rdma.rdma_op_len; 633 | |
634 /* For some operations, the hardware cannot tell the exact message 635 * length, and in those cases it reports zero. Use prefetch 636 * logic. */ 637 if (length == 0) { 638 prefetch_activated = 1; | 1045 /* For some operations, the hardware cannot tell the exact message 1046 * length, and in those cases it reports zero. Use prefetch 1047 * logic. */ 1048 if (length == 0) { 1049 prefetch_activated = 1; |
639 length = mpfault->rdma.packet_size; | 1050 length = pfault->rdma.packet_size; |
640 prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); 641 } 642 | 1051 prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); 1052 } 1053 |
643 ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, 644 NULL); | 1054 ret = pagefault_single_data_segment(dev, rkey, address, length, 1055 &pfault->bytes_committed, NULL); |
645 if (ret == -EAGAIN) { 646 /* We're racing with an invalidation, don't prefetch */ 647 prefetch_activated = 0; 648 } else if (ret < 0 || pages_in_range(address, length) > ret) { | 1056 if (ret == -EAGAIN) { 1057 /* We're racing with an invalidation, don't prefetch */ 1058 prefetch_activated = 0; 1059 } else if (ret < 0 || pages_in_range(address, length) > ret) { |
649 mlx5_ib_page_fault_resume(qp, pfault, 1); | 1060 mlx5_ib_page_fault_resume(dev, pfault, 1); 1061 if (ret != -ENOENT) 1062 mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", 1063 ret, pfault->token, pfault->type); |
650 return; 651 } 652 | 1064 return; 1065 } 1066 |
653 mlx5_ib_page_fault_resume(qp, pfault, 0); | 1067 mlx5_ib_page_fault_resume(dev, pfault, 0); 1068 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", 1069 pfault->token, pfault->type, 1070 prefetch_activated); |
654 655 /* At this point, there might be a new pagefault already arriving in 656 * the eq, switch to the dummy pagefault for the rest of the 657 * processing. We're still OK with the objects being alive as the 658 * work-queue is being fenced. */ 659 660 if (prefetch_activated) { | 1071 1072 /* At this point, there might be a new pagefault already arriving in 1073 * the eq, switch to the dummy pagefault for the rest of the 1074 * processing. We're still OK with the objects being alive as the 1075 * work-queue is being fenced. */ 1076 1077 if (prefetch_activated) { |
661 ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, 662 address, | 1078 u32 bytes_committed = 0; 1079 1080 ret = pagefault_single_data_segment(dev, rkey, address, |
663 prefetch_len, | 1081 prefetch_len, |
664 NULL); 665 if (ret < 0) { 666 pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", 667 ret, prefetch_activated, 668 qp->ibqp.qp_num, address, prefetch_len); | 1082 &bytes_committed, NULL); 1083 if (ret < 0 && ret != -EAGAIN) { 1084 mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", 1085 ret, pfault->token, address, prefetch_len); |
669 } 670 } 671} 672 | 1086 } 1087 } 1088} 1089 |
673void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, 674 struct mlx5_ib_pfault *pfault) | 1090void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, 1091 struct mlx5_pagefault *pfault) |
675{ | 1092{ |
676 u8 event_subtype = pfault->mpfault.event_subtype; | 1093 struct mlx5_ib_dev *dev = context; 1094 u8 event_subtype = pfault->event_subtype; |
677 678 switch (event_subtype) { 679 case MLX5_PFAULT_SUBTYPE_WQE: | 1095 1096 switch (event_subtype) { 1097 case MLX5_PFAULT_SUBTYPE_WQE: |
680 mlx5_ib_mr_wqe_pfault_handler(qp, pfault); | 1098 mlx5_ib_mr_wqe_pfault_handler(dev, pfault); |
681 break; 682 case MLX5_PFAULT_SUBTYPE_RDMA: | 1099 break; 1100 case MLX5_PFAULT_SUBTYPE_RDMA: |
683 mlx5_ib_mr_rdma_pfault_handler(qp, pfault); | 1101 mlx5_ib_mr_rdma_pfault_handler(dev, pfault); |
684 break; 685 default: | 1102 break; 1103 default: |
686 pr_warn("Invalid page fault event subtype: 0x%x\n", 687 event_subtype); 688 mlx5_ib_page_fault_resume(qp, pfault, 1); 689 break; | 1104 mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", 1105 event_subtype); 1106 mlx5_ib_page_fault_resume(dev, pfault, 1); |
690 } 691} 692 | 1107 } 1108} 1109 |
693static void mlx5_ib_qp_pfault_action(struct work_struct *work) | 1110void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) |
694{ | 1111{ |
695 struct mlx5_ib_pfault *pfault = container_of(work, 696 struct mlx5_ib_pfault, 697 work); 698 enum mlx5_ib_pagefault_context context = 699 mlx5_ib_get_pagefault_context(&pfault->mpfault); 700 struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, 701 pagefaults[context]); 702 mlx5_ib_mr_pfault_handler(qp, pfault); 703} | 1112 if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1113 return; |
704 | 1114 |
705void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) 706{ 707 unsigned long flags; | 1115 switch (ent->order - 2) { 1116 case MLX5_IMR_MTT_CACHE_ENTRY: 1117 ent->page = PAGE_SHIFT; 1118 ent->xlt = MLX5_IMR_MTT_ENTRIES * 1119 sizeof(struct mlx5_mtt) / 1120 MLX5_IB_UMR_OCTOWORD; 1121 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; 1122 ent->limit = 0; 1123 break; |
708 | 1124 |
709 spin_lock_irqsave(&qp->disable_page_faults_lock, flags); 710 qp->disable_page_faults = 1; 711 spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); 712 713 /* 714 * Note that at this point, we are guarenteed that no more 715 * work queue elements will be posted to the work queue with 716 * the QP we are closing. 717 */ 718 flush_workqueue(mlx5_ib_page_fault_wq); | 1125 case MLX5_IMR_KSM_CACHE_ENTRY: 1126 ent->page = MLX5_KSM_PAGE_SHIFT; 1127 ent->xlt = mlx5_imr_ksm_entries * 1128 sizeof(struct mlx5_klm) / 1129 MLX5_IB_UMR_OCTOWORD; 1130 ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; 1131 ent->limit = 0; 1132 break; 1133 } |
719} 720 | 1134} 1135 |
721void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) | 1136int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) |
722{ | 1137{ |
723 unsigned long flags; 724 725 spin_lock_irqsave(&qp->disable_page_faults_lock, flags); 726 qp->disable_page_faults = 0; 727 spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); 728} 729 730static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, 731 struct mlx5_pagefault *pfault) 732{ 733 /* 734 * Note that we will only get one fault event per QP per context 735 * (responder/initiator, read/write), until we resolve the page fault 736 * with the mlx5_ib_page_fault_resume command. Since this function is 737 * called from within the work element, there is no risk of missing 738 * events. 739 */ 740 struct mlx5_ib_qp *mibqp = to_mibqp(qp); 741 enum mlx5_ib_pagefault_context context = 742 mlx5_ib_get_pagefault_context(pfault); 743 struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; 744 745 qp_pfault->mpfault = *pfault; 746 747 /* No need to stop interrupts here since we are in an interrupt */ 748 spin_lock(&mibqp->disable_page_faults_lock); 749 if (!mibqp->disable_page_faults) 750 queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); 751 spin_unlock(&mibqp->disable_page_faults_lock); 752} 753 754void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) 755{ 756 int i; 757 758 qp->disable_page_faults = 1; 759 spin_lock_init(&qp->disable_page_faults_lock); 760 761 qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler; 762 763 for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) 764 INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); 765} 766 767int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) 768{ | |
769 int ret; 770 | 1138 int ret; 1139 |
771 ret = init_srcu_struct(&ibdev->mr_srcu); | 1140 ret = init_srcu_struct(&dev->mr_srcu); |
772 if (ret) 773 return ret; 774 | 1141 if (ret) 1142 return ret; 1143 |
1144 if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { 1145 ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); 1146 if (ret) { 1147 mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); 1148 return ret; 1149 } 1150 } 1151 |
|
775 return 0; 776} 777 | 1152 return 0; 1153} 1154 |
778void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) | 1155void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev) |
779{ | 1156{ |
780 cleanup_srcu_struct(&ibdev->mr_srcu); | 1157 cleanup_srcu_struct(&dev->mr_srcu); |
781} 782 | 1158} 1159 |
783int __init mlx5_ib_odp_init(void) | 1160int mlx5_ib_odp_init(void) |
784{ | 1161{ |
785 mlx5_ib_page_fault_wq = alloc_ordered_workqueue("mlx5_ib_page_faults", 786 WQ_MEM_RECLAIM); 787 if (!mlx5_ib_page_fault_wq) 788 return -ENOMEM; | 1162 mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - 1163 MLX5_IMR_MTT_BITS); |
789 790 return 0; 791} 792 | 1164 1165 return 0; 1166} 1167 |
793void mlx5_ib_odp_cleanup(void) 794{ 795 destroy_workqueue(mlx5_ib_page_fault_wq); 796} | |