/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>

#include "mlx5_ib.h"
#include "cmd.h"
#include "qp.h"

#include <linux/mlx5/eq.h>

/* Contains the details of a pagefault. */
struct mlx5_pagefault {
	u32			bytes_committed;
	u32			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details. */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
	};

	struct mlx5_ib_pf_eq	*eq;
	struct work_struct	work;
};

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;

static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *imr, int flags)
{
	struct mlx5_klm *end = pklm + nentries;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (; pklm != end; pklm++, idx++) {
			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
			pklm->va = 0;
		}
		return;
	}

	/*
	 * The locking here is pretty subtle. Ideally the implicit_children
	 * xarray would be protected by the umem_mutex, however that is not
	 * possible. Instead this uses a weaker update-then-lock pattern:
	 *
	 *    srcu_read_lock()
	 *      xa_store()
	 *    mutex_lock(umem_mutex)
	 *      mlx5_ib_update_xlt()
	 *    mutex_unlock(umem_mutex)
	 *    destroy lkey
	 *
	 * i.e. any change to the xarray must be followed by the locked
	 * update_xlt before destroying.
	 *
	 * The umem_mutex provides the acquire/release semantic needed to make
	 * the xa_store() visible to a racing thread. While SRCU is not
	 * technically required, using it gives consistent use of the SRCU
	 * locking around the xarray.
	 */
	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
	lockdep_assert_held(&mr_to_mdev(imr)->odp_srcu);

	for (; pklm != end; pklm++, idx++) {
		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);

		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		if (mtt) {
			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
		} else {
			pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
			pklm->va = 0;
		}
	}
}

static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;

	if (umem_dma & ODP_READ_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_READ;
	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_WRITE;

	return mtt_entry;
}

static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *mr, int flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	dma_addr_t pa;
	size_t i;

	if (flags & MLX5_IB_UPD_XLT_ZAP)
		return;

	for (i = 0; i < nentries; i++) {
		pa = odp->dma_list[idx + i];
		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
	}
}

void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
			   struct mlx5_ib_mr *mr, int flags)
{
	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
		populate_klm(xlt, idx, nentries, mr, flags);
	} else {
		populate_mtt(xlt, idx, nentries, mr, flags);
	}
}

static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	/* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
	mutex_lock(&odp->umem_mutex);
	if (odp->npages) {
		mlx5_mr_cache_invalidate(mr);
		ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
					    ib_umem_end(odp));
		WARN_ON(odp->npages);
	}
	odp->private = NULL;
	mutex_unlock(&odp->umem_mutex);

	if (!mr->cache_ent) {
		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
		WARN_ON(mr->descs);
	}
}

/*
 * This must be called after the mr has been removed from implicit_children
 * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * added pages to it.
 */
static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
{
	struct mlx5_ib_mr *imr = mr->parent;
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	int srcu_key;

	/* implicit_child_mr's are not allowed to have deferred work */
	WARN_ON(atomic_read(&mr->num_deferred_work));

	if (need_imr_xlt) {
		srcu_key = srcu_read_lock(&mr_to_mdev(mr)->odp_srcu);
		mutex_lock(&odp_imr->umem_mutex);
		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
		mutex_unlock(&odp_imr->umem_mutex);
		srcu_read_unlock(&mr_to_mdev(mr)->odp_srcu, srcu_key);
	}

	dma_fence_odp_mr(mr);

	mr->parent = NULL;
	mlx5_mr_cache_free(mr_to_mdev(mr), mr);
	ib_umem_odp_release(odp);
	if (atomic_dec_and_test(&imr->num_deferred_work))
		wake_up(&imr->q_deferred_work);
}

static void free_implicit_child_mr_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);

	free_implicit_child_mr(mr, true);
}

static void free_implicit_child_mr_rcu(struct rcu_head *head)
{
	struct mlx5_ib_mr *mr =
		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);

	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
	queue_work(system_unbound_wq, &mr->odp_destroy.work);
}

static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *imr = mr->parent;

	xa_lock(&imr->implicit_children);
	/*
	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
	 * reach the xa lock wins the race and destroys the MR.
	 */
	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
	    mr)
		goto out_unlock;

	atomic_inc(&imr->num_deferred_work);
	call_srcu(&mr_to_mdev(mr)->odp_srcu, &mr->odp_destroy.rcu,
		  free_implicit_child_mr_rcu);

out_unlock:
	xa_unlock(&imr->implicit_children);
}

static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
				     const struct mmu_notifier_range *range,
				     unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	u64 invalidations = 0;
	unsigned long start;
	unsigned long end;
	int in_block = 0;
	u64 addr;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/*
	 * If npages is zero then umem_odp->private may not be setup yet. This
	 * does not complete until after the first page is mapped for DMA.
	 */
	if (!umem_odp->npages)
		goto out;
	mr = umem_odp->private;

	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}

			/* Count page invalidations */
			invalidations += idx - blk_start_idx + 1;
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);

	mlx5_update_odp_stats(mr, invalidations, invalidations);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	if (unlikely(!umem_odp->npages && mr->parent))
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
	    !mlx5_ib_can_load_pas_with_umr(dev, 0))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}

static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
	MLX5_SET(page_fault_resume_in, in, error, !!error);

	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}

static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
						unsigned long idx)
{
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	struct mlx5_ib_mr *ret;
	int err;

	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
				      idx * MLX5_IMR_MTT_SIZE,
				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	ret = mr = mlx5_mr_cache_alloc(
		mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags);
	if (IS_ERR(mr))
		goto out_umem;

	mr->ibmr.pd = imr->ibmr.pd;
	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
	mr->umem = &odp->umem;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
	mr->parent = imr;
	odp->private = mr;

	err = mlx5_ib_update_xlt(mr, 0,
				 MLX5_IMR_MTT_ENTRIES,
				 PAGE_SHIFT,
				 MLX5_IB_UPD_XLT_ZAP |
				 MLX5_IB_UPD_XLT_ENABLE);
	if (err) {
		ret = ERR_PTR(err);
		goto out_mr;
	}

	/*
	 * Once the store to either xarray completes any error unwind has to
	 * use synchronize_srcu(). Avoid this with xa_reserve()
	 */
	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
			 GFP_KERNEL);
	if (unlikely(ret)) {
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			goto out_mr;
		}
		/*
		 * Another thread beat us to creating the child mr, use
		 * theirs.
		 */
		goto out_mr;
	}

	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
	return mr;

out_mr:
	mlx5_mr_cache_free(mr_to_mdev(imr), mr);
out_umem:
	ib_umem_odp_release(odp);
	return ret;
}

struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     struct ib_udata *udata,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
	struct ib_umem_odp *umem_odp;
	struct mlx5_ib_mr *imr;
	int err;

	if (!mlx5_ib_can_load_pas_with_umr(dev,
					   MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
		return ERR_PTR(-EOPNOTSUPP);

	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
	if (IS_ERR(umem_odp))
		return ERR_CAST(umem_odp);

	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
	if (IS_ERR(imr)) {
		err = PTR_ERR(imr);
		goto out_umem;
	}

	imr->ibmr.pd = &pd->ibpd;
	imr->mmkey.iova = 0;
	imr->umem = &umem_odp->umem;
	imr->ibmr.lkey = imr->mmkey.key;
	imr->ibmr.rkey = imr->mmkey.key;
	imr->ibmr.device = &dev->ib_dev;
	imr->umem = &umem_odp->umem;
	imr->is_odp_implicit = true;
	atomic_set(&imr->num_deferred_work, 0);
	init_waitqueue_head(&imr->q_deferred_work);
	xa_init(&imr->implicit_children);

	err = mlx5_ib_update_xlt(imr, 0,
				 mlx5_imr_ksm_entries,
				 MLX5_KSM_PAGE_SHIFT,
				 MLX5_IB_UPD_XLT_INDIRECT |
				 MLX5_IB_UPD_XLT_ZAP |
				 MLX5_IB_UPD_XLT_ENABLE);
	if (err)
		goto out_mr;

	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
			      &imr->mmkey, GFP_KERNEL));
	if (err)
		goto out_mr;

	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
	return imr;
out_mr:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, imr);
out_umem:
	ib_umem_odp_release(umem_odp);
	return ERR_PTR(err);
}

void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
	struct list_head destroy_list;
	struct mlx5_ib_mr *mtt;
	struct mlx5_ib_mr *tmp;
	unsigned long idx;

	INIT_LIST_HEAD(&destroy_list);

	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
	/*
	 * This stops the SRCU protected page fault path from touching either
	 * the imr or any children. The page fault path can only reach the
	 * children xarray via the imr.
	 */
	synchronize_srcu(&dev->odp_srcu);

	/*
	 * All work on the prefetch list must be completed, xa_erase() prevented
	 * new work from being created.
	 */
	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));

	/*
	 * At this point it is forbidden for any other thread to enter
	 * pagefault_mr() on this imr. It is already forbidden to call
	 * pagefault_mr() on an implicit child. Due to this, additions to
	 * implicit_children are prevented.
	 */

	/*
	 * Block destroy_unused_implicit_child_mr() from incrementing
	 * num_deferred_work.
	 */
	xa_lock(&imr->implicit_children);
	xa_for_each (&imr->implicit_children, idx, mtt) {
		__xa_erase(&imr->implicit_children, idx);
		list_add(&mtt->odp_destroy.elm, &destroy_list);
	}
	xa_unlock(&imr->implicit_children);

	/*
	 * Wait for any concurrent destroy_unused_implicit_child_mr() to
	 * complete.
	 */
	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));

	/*
	 * Fence the imr before we destroy the children. This allows us to
	 * skip updating the XLT of the imr during destroy of the child mkey
	 * the imr points to.
	 */
	mlx5_mr_cache_invalidate(imr);

	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
		free_implicit_child_mr(mtt, false);

	mlx5_mr_cache_free(dev, imr);
	ib_umem_odp_release(odp_imr);
}

/**
 * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
 * @mr: to fence
 *
 * On return no parallel threads will be touching this MR and no DMA will be
 * active.
 */
void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	/* Prevent new page faults and prefetch requests from succeeding */
	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));

	/* Wait for all running page-fault handlers to finish. */
	synchronize_srcu(&mr_to_mdev(mr)->odp_srcu);

	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));

	dma_fence_odp_mr(mr);
}

#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
#define MLX5_PF_FLAGS_ENABLE BIT(3)
static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
			     u32 flags)
{
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	u64 access_mask;
	u64 start_idx;
	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;

	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	page_shift = odp->page_shift;
	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
	access_mask = ODP_READ_ALLOWED_BIT;

	if (odp->umem.writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
	if (np < 0)
		return np;

	/*
	 * No need to check whether the MTTs really belong to this MR, since
	 * ib_umem_odp_map_dma_and_lock already checks this.
	 */
	ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(mr_to_mdev(mr),
				    "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(user_va - round_down(user_va, 1 << page_shift));

		*bytes_mapped += min_t(u32, new_mappings, bcnt);
	}

	return np << (page_shift - PAGE_SHIFT);

out:
	return ret;
}

static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
				 struct ib_umem_odp *odp_imr, u64 user_va,
				 size_t bcnt, u32 *bytes_mapped, u32 flags)
{
	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
	unsigned long upd_start_idx = end_idx + 1;
	unsigned long upd_len = 0;
	unsigned long npages = 0;
	int err;
	int ret;

	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
		return -EFAULT;

	/* Fault each child mr that intersects with our interval. */
	while (bcnt) {
		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
		struct ib_umem_odp *umem_odp;
		struct mlx5_ib_mr *mtt;
		u64 len;

		mtt = xa_load(&imr->implicit_children, idx);
		if (unlikely(!mtt)) {
			mtt = implicit_get_child_mr(imr, idx);
			if (IS_ERR(mtt)) {
				ret = PTR_ERR(mtt);
				goto out;
			}
			upd_start_idx = min(upd_start_idx, idx);
			upd_len = idx - upd_start_idx + 1;
		}

		umem_odp = to_ib_umem_odp(mtt->umem);
		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
		      user_va;

		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
					bytes_mapped, flags);
		if (ret < 0)
			goto out;
		user_va += len;
		bcnt -= len;
		npages += ret;
	}

	ret = npages;

	/*
	 * Any time the implicit_children are changed we must perform an
	 * update of the xlt before exiting to ensure the HW and the
	 * implicit_children remain synchronized.
	 */
out:
	if (likely(!upd_len))
		return ret;

	/*
	 * Notice this is not strictly ordered right, the KSM is updated after
	 * the implicit_children is updated, so a parallel page fault could
	 * see a MR that is not yet visible in the KSM. This is similar to a
	 * parallel page fault seeing a MR that is being concurrently removed
	 * from the KSM. Both of these improbable situations are resolved
	 * safely by resuming the HW and then taking another page fault. The
	 * next pagefault handler will see the new information.
	 */
	mutex_lock(&odp_imr->umem_mutex);
	err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
				 MLX5_IB_UPD_XLT_INDIRECT |
				 MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	if (err) {
		mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
		return err;
	}
	return ret;
}

/*
 * Returns:
 *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
 *           not accessible, or the MR is no longer valid.
 *  -EAGAIN/-ENOMEM: The operation should be retried
 *
 *  -EINVAL/others: General internal malfunction
 *  >0: Number of pages mapped
 */
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
			u32 *bytes_mapped, u32 flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	lockdep_assert_held(&mr_to_mdev(mr)->odp_srcu);
	if (unlikely(io_virt < mr->mmkey.iova))
		return -EFAULT;

	if (!odp->is_implicit_odp) {
		u64 user_va;

		if (check_add_overflow(io_virt - mr->mmkey.iova,
				       (u64)odp->umem.address, &user_va))
			return -EFAULT;
		if (unlikely(user_va >= ib_umem_end(odp) ||
			     ib_umem_end(odp) - user_va < bcnt))
			return -EFAULT;
		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
					 flags);
	}
	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
				     flags);
}

int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
{
	int ret;

	ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
				mr->umem->length, NULL,
				MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);
	return ret >= 0 ? 0 : ret;
}
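
/*
 * Stack frame used while walking indirect mkeys (KLMs) during page fault
 * handling: a frame is pushed for each referenced mkey and popped until the
 * whole data segment has been resolved.
 */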
struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};

static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
{
	if (!mmkey)
		return false;
	if (mmkey->type == MLX5_MKEY_MW)
		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
	return mmkey->key == key;
}

static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
{
	struct mlx5_ib_mw *mw;
	struct mlx5_ib_devx_mr *devx_mr;

	if (mmkey->type == MLX5_MKEY_MW) {
		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
		return mw->ndescs;
	}

	devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
			       mmkey);
	return devx_mr->ndescs;
}

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 struct ib_pd *pd, u32 key,
					 u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;
	int ndescs;

	srcu_key = srcu_read_lock(&dev->odp_srcu);

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

next_mr:
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
	if (!mmkey) {
		mlx5_ib_dbg(
			dev,
			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
			key);
		if (bytes_mapped)
			*bytes_mapped += bcnt;
		/*
		 * The user could specify an SGL with multiple lkeys and only
		 * some of them are ODP. Treat the non-ODP ones as fully
		 * faulted.
		 */
		ret = 0;
		goto srcu_unlock;
	}
	if (!mkey_is_eq(mmkey, key)) {
		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
		if (ret < 0)
			goto srcu_unlock;

		mlx5_update_odp_stats(mr, faults, ret);

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
	case MLX5_MKEY_INDIRECT_DEVX:
		ndescs = get_indirect_num_descs(mmkey);

		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
		if (ret)
			goto srcu_unlock;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
			offset = 0;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		goto next_mr;
	}

srcu_unlock:
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, bool receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, NULL, key,
						    io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
		*wqe += sizeof(struct mlx5_wqe_xrc_seg);

	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

/*
 * Parse responder WQE and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
						   struct mlx5_ib_srq *srq,
						   void **wqe, void **wqe_end,
						   int wqe_length)
{
	int wqe_size = 1 << srq->msrq.wqe_shift;

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;
	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	return 0;
}

static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
						  struct mlx5_ib_qp *qp,
						  void *wqe, void **wqe_end,
						  int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = wqe + wqe_size;

	return 0;
}

static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
						       u32 wq_num, int pf_type)
{
	struct mlx5_core_rsc_common *common = NULL;
	struct mlx5_core_srq *srq;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		srq = mlx5_cmd_get_srq(dev, wq_num);
		if (srq)
			common = &srq->common;
		break;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
		break;
	default:
		break;
	}

	return common;
}

static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;

	return to_mibqp(mqp);
}

static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_srq *msrq =
		container_of(res, struct mlx5_core_srq, common);

	return to_mibsrq(msrq);
}
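
/*
 * Handle a WQE-based page fault: look up the faulting QP or SRQ, copy the
 * faulting WQE out of user memory, parse its data segments and fault in the
 * referenced pages, then resume the hardware (with error if parsing or
 * mapping failed).
 */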
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
	u16 wqe_index = pfault->wqe.wqe_index;
	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
	u32 bytes_mapped, total_wqe_bytes;
	struct mlx5_core_rsc_common *res;
	int resume_with_error = 1;
	struct mlx5_ib_qp *qp;
	size_t bytes_copied;
	int ret = 0;

	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
	    res->res != MLX5_RES_XSRQ) {
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
			    pfault->type);
		goto resolve_page_fault;
	}

	wqe_start = (void *)__get_free_page(GFP_KERNEL);
	if (!wqe_start) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	wqe = wqe_start;
	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
	if (qp && sq) {
		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_initiator_pfault_handler(
			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
	} else if (qp && !sq) {
		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_rq(
			dev, qp, wqe, &wqe_end, bytes_copied);
	} else if (!qp) {
		struct mlx5_ib_srq *srq = res_to_srq(res);

		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
					   &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_srq(
			dev, srq, &wqe, &wqe_end, bytes_copied);
	}

	if (ret < 0 || wqe >= wqe_end)
		goto resolve_page_fault;

	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !sq);
	if (ret == -EAGAIN)
		goto out;

	if (ret < 0 || total_wqe_bytes > bytes_mapped)
		goto resolve_page_fault;

out:
	ret = 0;
	resume_with_error = 0;

read_user:
	if (ret)
		mlx5_ib_err(
			dev,
			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
			ret, wqe_index, pfault->token);

resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	free_page((unsigned long)wqe_start);
}

static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}

static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}

static void mlx5_ib_eqe_pf_action(struct work_struct *work)
{
	struct mlx5_pagefault *pfault = container_of(work,
						     struct mlx5_pagefault,
						     work);
	struct mlx5_ib_pf_eq *eq = pfault->eq;

	mlx5_ib_pfault(eq->dev, pfault);
	mempool_free(pfault, eq->pool);
}

static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eqe_page_fault *pf_eqe;
	struct mlx5_pagefault *pfault;
	struct mlx5_eqe *eqe;
	int cc = 0;

	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
		if (!pfault) {
			schedule_work(&eq->work);
			break;
		}

		pf_eqe = &eqe->data.page_fault;
		pfault->event_subtype = eqe->sub_type;
		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);

		mlx5_ib_dbg(eq->dev,
			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
			    eqe->sub_type, pfault->bytes_committed);

		switch (eqe->sub_type) {
		case MLX5_PFAULT_SUBTYPE_RDMA:
			/* RDMA based event */
			pfault->type =
				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
			pfault->token =
				be32_to_cpu(pf_eqe->rdma.pftype_token) &
				MLX5_24BIT_MASK;
			pfault->rdma.r_key =
				be32_to_cpu(pf_eqe->rdma.r_key);
			pfault->rdma.packet_size =
				be16_to_cpu(pf_eqe->rdma.packet_length);
			pfault->rdma.rdma_op_len =
				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
			pfault->rdma.rdma_va =
				be64_to_cpu(pf_eqe->rdma.rdma_va);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
				    pfault->type, pfault->token,
				    pfault->rdma.r_key);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
				    pfault->rdma.rdma_op_len,
				    pfault->rdma.rdma_va);
			break;

		case MLX5_PFAULT_SUBTYPE_WQE:
			/* WQE based event */
			pfault->type =
				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
			pfault->token =
				be32_to_cpu(pf_eqe->wqe.token);
			pfault->wqe.wq_num =
				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
				MLX5_24BIT_MASK;
			pfault->wqe.wqe_index =
				be16_to_cpu(pf_eqe->wqe.wqe_index);
			pfault->wqe.packet_size =
				be16_to_cpu(pf_eqe->wqe.packet_length);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
				    pfault->type, pfault->token,
				    pfault->wqe.wq_num,
				    pfault->wqe.wqe_index);
			break;

		default:
			mlx5_ib_warn(eq->dev,
				     "Unsupported page fault event sub-type: 0x%02hhx\n",
				     eqe->sub_type);
			/* Unsupported page faults should still be
			 * resolved by the page fault handler
			 */
		}

		pfault->eq = eq;
		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
		queue_work(eq->wq, &pfault->work);

		cc = mlx5_eq_update_cc(eq->core, ++cc);
	}
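
	/* Update the consumer index and re-arm the EQ */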
	mlx5_eq_update_ci(eq->core, cc, 1);
}

static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
			     void *data)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
	unsigned long flags;

	if (spin_trylock_irqsave(&eq->lock, flags)) {
		mlx5_ib_eq_pf_process(eq);
		spin_unlock_irqrestore(&eq->lock, flags);
	} else {
		schedule_work(&eq->work);
	}

	return IRQ_HANDLED;
}

/* mempool_refill() was proposed but unfortunately wasn't accepted
 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
 * Cheap workaround.
 */
static void mempool_refill(mempool_t *pool)
{
	while (pool->curr_nr < pool->min_nr)
		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}

static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(work, struct mlx5_ib_pf_eq, work);

	mempool_refill(eq->pool);

	spin_lock_irq(&eq->lock);
	mlx5_ib_eq_pf_process(eq);
	spin_unlock_irq(&eq->lock);
}

enum {
	MLX5_IB_NUM_PF_EQE	= 0x1000,
	MLX5_IB_NUM_PF_DRAIN	= 64,
};

static int
mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eq_param param = {};
	int err;

	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
	spin_lock_init(&eq->lock);
	eq->dev = dev;

	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
					       sizeof(struct mlx5_pagefault));
	if (!eq->pool)
		return -ENOMEM;

	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
				 MLX5_NUM_CMD_EQE);
	if (!eq->wq) {
		err = -ENOMEM;
		goto err_mempool;
	}

	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
	param = (struct mlx5_eq_param) {
		.irq_index = 0,
		.nent = MLX5_IB_NUM_PF_EQE,
	};
	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
	if (IS_ERR(eq->core)) {
		err = PTR_ERR(eq->core);
		goto err_wq;
	}
	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
	if (err) {
		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
		goto err_eq;
	}

	return 0;
err_eq:
	mlx5_eq_destroy_generic(dev->mdev, eq->core);
err_wq:
	destroy_workqueue(eq->wq);
err_mempool:
	mempool_destroy(eq->pool);
	return err;
}

static int
mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	int err;

	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
	cancel_work_sync(&eq->work);
	destroy_workqueue(eq->wq);
	mempool_destroy(eq->pool);

	return err;
}

void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
		return;

	switch (ent->order - 2) {
	case MLX5_IMR_MTT_CACHE_ENTRY:
		ent->page = PAGE_SHIFT;
		ent->xlt = MLX5_IMR_MTT_ENTRIES *
			   sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		ent->limit = 0;
		break;

	case MLX5_IMR_KSM_CACHE_ENTRY:
		ent->page = MLX5_KSM_PAGE_SHIFT;
		ent->xlt = mlx5_imr_ksm_entries *
			   sizeof(struct mlx5_klm) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
		ent->limit = 0;
		break;
	}
}

static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
	.advise_mr = mlx5_ib_advise_mr,
};

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	int ret = 0;

	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return ret;

	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);

	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
			return ret;
		}
	}

	ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);

	return ret;
}

void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return;

	mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
}

int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}

struct prefetch_mr_work {
	struct work_struct work;
	u32 pf_flags;
	u32 num_sge;
	struct {
		u64 io_virt;
		struct mlx5_ib_mr *mr;
		size_t length;
	} frags[];
};

static void destroy_prefetch_work(struct prefetch_mr_work *work)
{
	u32 i;

	for (i = 0; i < work->num_sge; ++i)
		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
			wake_up(&work->frags[i].mr->q_deferred_work);
	kvfree(work);
}

static struct mlx5_ib_mr *
get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
		    u32 lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_core_mkey *mmkey;
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;

	lockdep_assert_held(&dev->odp_srcu);

	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
		return NULL;

	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

	if (mr->ibmr.pd != pd)
		return NULL;

	odp = to_ib_umem_odp(mr->umem);

	/* prefetch with write-access must be supported by the MR */
	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    !odp->umem.writable)
		return NULL;

	return mr;
}
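
/*
 * Work handler that resolves the page faults for a deferred prefetch request
 * queued by mlx5_ib_advise_mr_prefetch(); it drops the per-MR deferred-work
 * references via destroy_prefetch_work() when done.
 */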
static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
{
	struct prefetch_mr_work *work =
		container_of(w, struct prefetch_mr_work, work);
	struct mlx5_ib_dev *dev;
	u32 bytes_mapped = 0;
	int srcu_key;
	int ret;
	u32 i;

	/* We rely on IB/core that work is executed only if we have num_sge != 0. */
	WARN_ON(!work->num_sge);
	dev = mr_to_mdev(work->frags[0].mr);
	/* SRCU should be held when calling to mlx5_odp_populate_xlt() */
	srcu_key = srcu_read_lock(&dev->odp_srcu);
	for (i = 0; i < work->num_sge; ++i) {
		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
				   work->frags[i].length, &bytes_mapped,
				   work->pf_flags);
		if (ret <= 0)
			continue;
		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
	}
	srcu_read_unlock(&dev->odp_srcu, srcu_key);

	destroy_prefetch_work(work);
}

static bool init_prefetch_work(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 pf_flags, struct prefetch_mr_work *work,
			       struct ib_sge *sg_list, u32 num_sge)
{
	u32 i;

	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
	work->pf_flags = pf_flags;

	for (i = 0; i < num_sge; ++i) {
		work->frags[i].io_virt = sg_list[i].addr;
		work->frags[i].length = sg_list[i].length;
		work->frags[i].mr =
			get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (!work->frags[i].mr) {
			work->num_sge = i;
			return false;
		}

		/* Keep the MR pointer valid outside the SRCU */
		atomic_inc(&work->frags[i].mr->num_deferred_work);
	}
	work->num_sge = num_sge;
	return true;
}

static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
				    enum ib_uverbs_advise_mr_advice advice,
				    u32 pf_flags, struct ib_sge *sg_list,
				    u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 bytes_mapped = 0;
	int srcu_key;
	int ret = 0;
	u32 i;

	srcu_key = srcu_read_lock(&dev->odp_srcu);
	for (i = 0; i < num_sge; ++i) {
		struct mlx5_ib_mr *mr;

		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (!mr) {
			ret = -ENOENT;
			goto out;
		}
		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
				   &bytes_mapped, pf_flags);
		if (ret < 0)
			goto out;
		mlx5_update_odp_stats(mr, prefetch, ret);
	}
	ret = 0;

out:
	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	return ret;
}

int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 pf_flags = 0;
	struct prefetch_mr_work *work;
	int srcu_key;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;

	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
						num_sge);

	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	srcu_key = srcu_read_lock(&dev->odp_srcu);
	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
		srcu_read_unlock(&dev->odp_srcu, srcu_key);
		destroy_prefetch_work(work);
		return -EINVAL;
	}
	queue_work(system_unbound_wq, &work->work);
	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	return 0;
}