1 /* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33 #include <rdma/ib_umem.h> 34 #include <rdma/ib_umem_odp.h> 35 #include <linux/kernel.h> 36 37 #include "mlx5_ib.h" 38 #include "cmd.h" 39 40 #include <linux/mlx5/eq.h> 41 42 /* Contains the details of a pagefault. */ 43 struct mlx5_pagefault { 44 u32 bytes_committed; 45 u32 token; 46 u8 event_subtype; 47 u8 type; 48 union { 49 /* Initiator or send message responder pagefault details. */ 50 struct { 51 /* Received packet size, only valid for responders. */ 52 u32 packet_size; 53 /* 54 * Number of resource holding WQE, depends on type. 55 */ 56 u32 wq_num; 57 /* 58 * WQE index. Refers to either the send queue or 59 * receive queue, according to event_subtype. 60 */ 61 u16 wqe_index; 62 } wqe; 63 /* RDMA responder pagefault details */ 64 struct { 65 u32 r_key; 66 /* 67 * Received packet size, minimal size page fault 68 * resolution required for forward progress. 69 */ 70 u32 packet_size; 71 u32 rdma_op_len; 72 u64 rdma_va; 73 } rdma; 74 }; 75 76 struct mlx5_ib_pf_eq *eq; 77 struct work_struct work; 78 }; 79 80 #define MAX_PREFETCH_LEN (4*1024*1024U) 81 82 /* Timeout in ms to wait for an active mmu notifier to complete when handling 83 * a pagefault. */ 84 #define MMU_NOTIFIER_TIMEOUT 1000 85 86 #define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) 87 #define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) 88 #define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) 89 #define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) 90 #define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) 91 92 #define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT 93 94 static u64 mlx5_imr_ksm_entries; 95 96 static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, 97 struct mlx5_ib_mr *imr, int flags) 98 { 99 struct mlx5_klm *end = pklm + nentries; 100 101 if (flags & MLX5_IB_UPD_XLT_ZAP) { 102 for (; pklm != end; pklm++, idx++) { 103 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); 104 pklm->key = cpu_to_be32(imr->dev->null_mkey); 105 pklm->va = 0; 106 } 107 return; 108 } 109 110 /* 111 * The locking here is pretty subtle. 
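	 * (The two sides of this pattern in this file are implicit_get_child_mr(),
	 * which stores a new child, and destroy_unused_implicit_child_mr() /
	 * free_implicit_child_mr(), which remove one and then refresh the KSM
	 * under the umem_mutex, as described below.)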
Ideally the implicit_children
	 * xarray would be protected by the umem_mutex, however that is not
	 * possible. Instead this uses a weaker update-then-lock pattern:
	 *
	 *  srcu_read_lock()
	 *    xa_store()
	 *    mutex_lock(umem_mutex)
	 *     mlx5_ib_update_xlt()
	 *    mutex_unlock(umem_mutex)
	 *    destroy lkey
	 *
	 * ie any change to the xarray must be followed by the locked update_xlt
	 * before destroying.
	 *
	 * The umem_mutex provides the acquire/release semantic needed to make
	 * the xa_store() visible to a racing thread. While SRCU is not
	 * technically required, using it gives consistent use of the SRCU
	 * locking around the xarray.
	 */
	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
	lockdep_assert_held(&imr->dev->odp_srcu);

	for (; pklm != end; pklm++, idx++) {
		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);

		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		if (mtt) {
			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
		} else {
			pklm->key = cpu_to_be32(imr->dev->null_mkey);
			pklm->va = 0;
		}
	}
}

static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;

	if (umem_dma & ODP_READ_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_READ;
	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_WRITE;

	return mtt_entry;
}

static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *mr, int flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	dma_addr_t pa;
	size_t i;

	if (flags & MLX5_IB_UPD_XLT_ZAP)
		return;

	for (i = 0; i < nentries; i++) {
		pa = odp->dma_list[idx + i];
		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
	}
}

void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
			   struct mlx5_ib_mr *mr, int flags)
{
	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
		populate_klm(xlt, idx, nentries, mr, flags);
	} else {
		populate_mtt(xlt, idx, nentries, mr, flags);
	}
}

static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	/* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
	mutex_lock(&odp->umem_mutex);
	if (odp->npages) {
		mlx5_mr_cache_invalidate(mr);
		ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
					    ib_umem_end(odp));
		WARN_ON(odp->npages);
	}
	odp->private = NULL;
	mutex_unlock(&odp->umem_mutex);

	if (!mr->cache_ent) {
		mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
		WARN_ON(mr->descs);
	}
}

/*
 * This must be called after the mr has been removed from implicit_children
 * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * added pages to it.
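 * When called with need_imr_xlt the parent's KSM entry for this index is
 * refreshed first (the child is already gone from implicit_children, so
 * populate_klm() writes the null_mkey there) before the child itself is
 * DMA-fenced and freed.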
211 */ 212 static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) 213 { 214 struct mlx5_ib_mr *imr = mr->parent; 215 struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); 216 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 217 unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; 218 int srcu_key; 219 220 /* implicit_child_mr's are not allowed to have deferred work */ 221 WARN_ON(atomic_read(&mr->num_deferred_work)); 222 223 if (need_imr_xlt) { 224 srcu_key = srcu_read_lock(&mr->dev->odp_srcu); 225 mutex_lock(&odp_imr->umem_mutex); 226 mlx5_ib_update_xlt(mr->parent, idx, 1, 0, 227 MLX5_IB_UPD_XLT_INDIRECT | 228 MLX5_IB_UPD_XLT_ATOMIC); 229 mutex_unlock(&odp_imr->umem_mutex); 230 srcu_read_unlock(&mr->dev->odp_srcu, srcu_key); 231 } 232 233 dma_fence_odp_mr(mr); 234 235 mr->parent = NULL; 236 mlx5_mr_cache_free(mr->dev, mr); 237 ib_umem_odp_release(odp); 238 if (atomic_dec_and_test(&imr->num_deferred_work)) 239 wake_up(&imr->q_deferred_work); 240 } 241 242 static void free_implicit_child_mr_work(struct work_struct *work) 243 { 244 struct mlx5_ib_mr *mr = 245 container_of(work, struct mlx5_ib_mr, odp_destroy.work); 246 247 free_implicit_child_mr(mr, true); 248 } 249 250 static void free_implicit_child_mr_rcu(struct rcu_head *head) 251 { 252 struct mlx5_ib_mr *mr = 253 container_of(head, struct mlx5_ib_mr, odp_destroy.rcu); 254 255 /* Freeing a MR is a sleeping operation, so bounce to a work queue */ 256 INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); 257 queue_work(system_unbound_wq, &mr->odp_destroy.work); 258 } 259 260 static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) 261 { 262 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 263 unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; 264 struct mlx5_ib_mr *imr = mr->parent; 265 266 xa_lock(&imr->implicit_children); 267 /* 268 * This can race with mlx5_ib_free_implicit_mr(), the first one to 269 * reach the xa lock wins the race and destroys the MR. 270 */ 271 if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) != 272 mr) 273 goto out_unlock; 274 275 atomic_inc(&imr->num_deferred_work); 276 call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu, 277 free_implicit_child_mr_rcu); 278 279 out_unlock: 280 xa_unlock(&imr->implicit_children); 281 } 282 283 static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, 284 const struct mmu_notifier_range *range, 285 unsigned long cur_seq) 286 { 287 struct ib_umem_odp *umem_odp = 288 container_of(mni, struct ib_umem_odp, notifier); 289 struct mlx5_ib_mr *mr; 290 const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / 291 sizeof(struct mlx5_mtt)) - 1; 292 u64 idx = 0, blk_start_idx = 0; 293 u64 invalidations = 0; 294 unsigned long start; 295 unsigned long end; 296 int in_block = 0; 297 u64 addr; 298 299 if (!mmu_notifier_range_blockable(range)) 300 return false; 301 302 mutex_lock(&umem_odp->umem_mutex); 303 mmu_interval_set_seq(mni, cur_seq); 304 /* 305 * If npages is zero then umem_odp->private may not be setup yet. This 306 * does not complete until after the first page is mapped for DMA. 307 */ 308 if (!umem_odp->npages) 309 goto out; 310 mr = umem_odp->private; 311 312 start = max_t(u64, ib_umem_start(umem_odp), range->start); 313 end = min_t(u64, ib_umem_end(umem_odp), range->end); 314 315 /* 316 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that 317 * while we are doing the invalidation, no page fault will attempt to 318 * overwrite the same MTTs. 
Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}

			/* Count page invalidations */
			invalidations += idx - blk_start_idx + 1;
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);

	mlx5_update_odp_stats(mr, invalidations, invalidations);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	if (unlikely(!umem_odp->npages && mr->parent))
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
	    !mlx5_ib_can_use_umr(dev, true, 0))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if
(MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) 428 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; 429 430 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) 431 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; 432 433 if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) 434 caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; 435 436 if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && 437 MLX5_CAP_GEN(dev->mdev, null_mkey) && 438 MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && 439 !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) 440 caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; 441 } 442 443 static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, 444 struct mlx5_pagefault *pfault, 445 int error) 446 { 447 int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? 448 pfault->wqe.wq_num : pfault->token; 449 u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { }; 450 u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = { }; 451 int err; 452 453 MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); 454 MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); 455 MLX5_SET(page_fault_resume_in, in, token, pfault->token); 456 MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); 457 MLX5_SET(page_fault_resume_in, in, error, !!error); 458 459 err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); 460 if (err) 461 mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n", 462 wq_num, err); 463 } 464 465 static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, 466 unsigned long idx) 467 { 468 struct ib_umem_odp *odp; 469 struct mlx5_ib_mr *mr; 470 struct mlx5_ib_mr *ret; 471 int err; 472 473 odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), 474 idx * MLX5_IMR_MTT_SIZE, 475 MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); 476 if (IS_ERR(odp)) 477 return ERR_CAST(odp); 478 479 ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY); 480 if (IS_ERR(mr)) 481 goto out_umem; 482 483 mr->ibmr.pd = imr->ibmr.pd; 484 mr->access_flags = imr->access_flags; 485 mr->umem = &odp->umem; 486 mr->ibmr.lkey = mr->mmkey.key; 487 mr->ibmr.rkey = mr->mmkey.key; 488 mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; 489 mr->parent = imr; 490 odp->private = mr; 491 492 err = mlx5_ib_update_xlt(mr, 0, 493 MLX5_IMR_MTT_ENTRIES, 494 PAGE_SHIFT, 495 MLX5_IB_UPD_XLT_ZAP | 496 MLX5_IB_UPD_XLT_ENABLE); 497 if (err) { 498 ret = ERR_PTR(err); 499 goto out_mr; 500 } 501 502 /* 503 * Once the store to either xarray completes any error unwind has to 504 * use synchronize_srcu(). Avoid this with xa_reserve() 505 */ 506 ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, 507 GFP_KERNEL); 508 if (unlikely(ret)) { 509 if (xa_is_err(ret)) { 510 ret = ERR_PTR(xa_err(ret)); 511 goto out_mr; 512 } 513 /* 514 * Another thread beat us to creating the child mr, use 515 * theirs. 
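		 * The existing child found by xa_cmpxchg() is handed back to
		 * the caller; falling through to out_mr releases the MR and
		 * umem that were just created here.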
516 */ 517 goto out_mr; 518 } 519 520 mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr); 521 return mr; 522 523 out_mr: 524 mlx5_mr_cache_free(imr->dev, mr); 525 out_umem: 526 ib_umem_odp_release(odp); 527 return ret; 528 } 529 530 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, 531 struct ib_udata *udata, 532 int access_flags) 533 { 534 struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); 535 struct ib_umem_odp *umem_odp; 536 struct mlx5_ib_mr *imr; 537 int err; 538 539 umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); 540 if (IS_ERR(umem_odp)) 541 return ERR_CAST(umem_odp); 542 543 imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY); 544 if (IS_ERR(imr)) { 545 err = PTR_ERR(imr); 546 goto out_umem; 547 } 548 549 imr->ibmr.pd = &pd->ibpd; 550 imr->access_flags = access_flags; 551 imr->mmkey.iova = 0; 552 imr->umem = &umem_odp->umem; 553 imr->ibmr.lkey = imr->mmkey.key; 554 imr->ibmr.rkey = imr->mmkey.key; 555 imr->umem = &umem_odp->umem; 556 imr->is_odp_implicit = true; 557 atomic_set(&imr->num_deferred_work, 0); 558 init_waitqueue_head(&imr->q_deferred_work); 559 xa_init(&imr->implicit_children); 560 561 err = mlx5_ib_update_xlt(imr, 0, 562 mlx5_imr_ksm_entries, 563 MLX5_KSM_PAGE_SHIFT, 564 MLX5_IB_UPD_XLT_INDIRECT | 565 MLX5_IB_UPD_XLT_ZAP | 566 MLX5_IB_UPD_XLT_ENABLE); 567 if (err) 568 goto out_mr; 569 570 err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key), 571 &imr->mmkey, GFP_KERNEL)); 572 if (err) 573 goto out_mr; 574 575 mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr); 576 return imr; 577 out_mr: 578 mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); 579 mlx5_mr_cache_free(dev, imr); 580 out_umem: 581 ib_umem_odp_release(umem_odp); 582 return ERR_PTR(err); 583 } 584 585 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) 586 { 587 struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); 588 struct mlx5_ib_dev *dev = imr->dev; 589 struct list_head destroy_list; 590 struct mlx5_ib_mr *mtt; 591 struct mlx5_ib_mr *tmp; 592 unsigned long idx; 593 594 INIT_LIST_HEAD(&destroy_list); 595 596 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key)); 597 /* 598 * This stops the SRCU protected page fault path from touching either 599 * the imr or any children. The page fault path can only reach the 600 * children xarray via the imr. 601 */ 602 synchronize_srcu(&dev->odp_srcu); 603 604 xa_lock(&imr->implicit_children); 605 xa_for_each (&imr->implicit_children, idx, mtt) { 606 __xa_erase(&imr->implicit_children, idx); 607 list_add(&mtt->odp_destroy.elm, &destroy_list); 608 } 609 xa_unlock(&imr->implicit_children); 610 611 /* 612 * num_deferred_work can only be incremented inside the odp_srcu, or 613 * under xa_lock while the child is in the xarray. Thus at this point 614 * it is only decreasing, and all work holding it is now on the wq. 615 */ 616 wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work)); 617 618 /* 619 * Fence the imr before we destroy the children. This allows us to 620 * skip updating the XLT of the imr during destroy of the child mkey 621 * the imr points to. 622 */ 623 mlx5_mr_cache_invalidate(imr); 624 625 list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm) 626 free_implicit_child_mr(mtt, false); 627 628 mlx5_mr_cache_free(dev, imr); 629 ib_umem_odp_release(odp_imr); 630 } 631 632 /** 633 * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR 634 * @mr: to fence 635 * 636 * On return no parallel threads will be touching this MR and no DMA will be 637 * active. 
638 */ 639 void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr) 640 { 641 /* Prevent new page faults and prefetch requests from succeeding */ 642 xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); 643 644 /* Wait for all running page-fault handlers to finish. */ 645 synchronize_srcu(&mr->dev->odp_srcu); 646 647 wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work)); 648 649 dma_fence_odp_mr(mr); 650 } 651 652 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) 653 static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, 654 u64 user_va, size_t bcnt, u32 *bytes_mapped, 655 u32 flags) 656 { 657 int page_shift, ret, np; 658 bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; 659 unsigned long current_seq; 660 u64 access_mask; 661 u64 start_idx; 662 663 page_shift = odp->page_shift; 664 start_idx = (user_va - ib_umem_start(odp)) >> page_shift; 665 access_mask = ODP_READ_ALLOWED_BIT; 666 667 if (odp->umem.writable && !downgrade) 668 access_mask |= ODP_WRITE_ALLOWED_BIT; 669 670 current_seq = mmu_interval_read_begin(&odp->notifier); 671 672 np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask, 673 current_seq); 674 if (np < 0) 675 return np; 676 677 mutex_lock(&odp->umem_mutex); 678 if (!mmu_interval_read_retry(&odp->notifier, current_seq)) { 679 /* 680 * No need to check whether the MTTs really belong to 681 * this MR, since ib_umem_odp_map_dma_pages already 682 * checks this. 683 */ 684 ret = mlx5_ib_update_xlt(mr, start_idx, np, 685 page_shift, MLX5_IB_UPD_XLT_ATOMIC); 686 } else { 687 ret = -EAGAIN; 688 } 689 mutex_unlock(&odp->umem_mutex); 690 691 if (ret < 0) { 692 if (ret != -EAGAIN) 693 mlx5_ib_err(mr->dev, 694 "Failed to update mkey page tables\n"); 695 goto out; 696 } 697 698 if (bytes_mapped) { 699 u32 new_mappings = (np << page_shift) - 700 (user_va - round_down(user_va, 1 << page_shift)); 701 702 *bytes_mapped += min_t(u32, new_mappings, bcnt); 703 } 704 705 return np << (page_shift - PAGE_SHIFT); 706 707 out: 708 return ret; 709 } 710 711 static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, 712 struct ib_umem_odp *odp_imr, u64 user_va, 713 size_t bcnt, u32 *bytes_mapped, u32 flags) 714 { 715 unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; 716 unsigned long upd_start_idx = end_idx + 1; 717 unsigned long upd_len = 0; 718 unsigned long npages = 0; 719 int err; 720 int ret; 721 722 if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || 723 mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) 724 return -EFAULT; 725 726 /* Fault each child mr that intersects with our interval. */ 727 while (bcnt) { 728 unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; 729 struct ib_umem_odp *umem_odp; 730 struct mlx5_ib_mr *mtt; 731 u64 len; 732 733 mtt = xa_load(&imr->implicit_children, idx); 734 if (unlikely(!mtt)) { 735 mtt = implicit_get_child_mr(imr, idx); 736 if (IS_ERR(mtt)) { 737 ret = PTR_ERR(mtt); 738 goto out; 739 } 740 upd_start_idx = min(upd_start_idx, idx); 741 upd_len = idx - upd_start_idx + 1; 742 } 743 744 umem_odp = to_ib_umem_odp(mtt->umem); 745 len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) - 746 user_va; 747 748 ret = pagefault_real_mr(mtt, umem_odp, user_va, len, 749 bytes_mapped, flags); 750 if (ret < 0) 751 goto out; 752 user_va += len; 753 bcnt -= len; 754 npages += ret; 755 } 756 757 ret = npages; 758 759 /* 760 * Any time the implicit_children are changed we must perform an 761 * update of the xlt before exiting to ensure the HW and the 762 * implicit_children remains synchronized. 
763 */ 764 out: 765 if (likely(!upd_len)) 766 return ret; 767 768 /* 769 * Notice this is not strictly ordered right, the KSM is updated after 770 * the implicit_children is updated, so a parallel page fault could 771 * see a MR that is not yet visible in the KSM. This is similar to a 772 * parallel page fault seeing a MR that is being concurrently removed 773 * from the KSM. Both of these improbable situations are resolved 774 * safely by resuming the HW and then taking another page fault. The 775 * next pagefault handler will see the new information. 776 */ 777 mutex_lock(&odp_imr->umem_mutex); 778 err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0, 779 MLX5_IB_UPD_XLT_INDIRECT | 780 MLX5_IB_UPD_XLT_ATOMIC); 781 mutex_unlock(&odp_imr->umem_mutex); 782 if (err) { 783 mlx5_ib_err(imr->dev, "Failed to update PAS\n"); 784 return err; 785 } 786 return ret; 787 } 788 789 /* 790 * Returns: 791 * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are 792 * not accessible, or the MR is no longer valid. 793 * -EAGAIN/-ENOMEM: The operation should be retried 794 * 795 * -EINVAL/others: General internal malfunction 796 * >0: Number of pages mapped 797 */ 798 static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, 799 u32 *bytes_mapped, u32 flags) 800 { 801 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 802 803 if (unlikely(io_virt < mr->mmkey.iova)) 804 return -EFAULT; 805 806 if (!odp->is_implicit_odp) { 807 u64 user_va; 808 809 if (check_add_overflow(io_virt - mr->mmkey.iova, 810 (u64)odp->umem.address, &user_va)) 811 return -EFAULT; 812 if (unlikely(user_va >= ib_umem_end(odp) || 813 ib_umem_end(odp) - user_va < bcnt)) 814 return -EFAULT; 815 return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, 816 flags); 817 } 818 return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, 819 flags); 820 } 821 822 struct pf_frame { 823 struct pf_frame *next; 824 u32 key; 825 u64 io_virt; 826 size_t bcnt; 827 int depth; 828 }; 829 830 static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key) 831 { 832 if (!mmkey) 833 return false; 834 if (mmkey->type == MLX5_MKEY_MW) 835 return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key); 836 return mmkey->key == key; 837 } 838 839 static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey) 840 { 841 struct mlx5_ib_mw *mw; 842 struct mlx5_ib_devx_mr *devx_mr; 843 844 if (mmkey->type == MLX5_MKEY_MW) { 845 mw = container_of(mmkey, struct mlx5_ib_mw, mmkey); 846 return mw->ndescs; 847 } 848 849 devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr, 850 mmkey); 851 return devx_mr->ndescs; 852 } 853 854 /* 855 * Handle a single data segment in a page-fault WQE or RDMA region. 856 * 857 * Returns number of OS pages retrieved on success. The caller may continue to 858 * the next data segment. 859 * Can return the following error codes: 860 * -EAGAIN to designate a temporary error. The caller will abort handling the 861 * page fault and resolve it. 862 * -EFAULT when there's an error mapping the requested pages. The caller will 863 * abort the page fault handling. 
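 * Indirect mkeys (memory windows and DEVX indirect mkeys) are resolved by
 * querying their KLM list and walking it with an explicit stack of
 * struct pf_frame entries rather than by recursion.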
864 */ 865 static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, 866 struct ib_pd *pd, u32 key, 867 u64 io_virt, size_t bcnt, 868 u32 *bytes_committed, 869 u32 *bytes_mapped) 870 { 871 int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; 872 struct pf_frame *head = NULL, *frame; 873 struct mlx5_core_mkey *mmkey; 874 struct mlx5_ib_mr *mr; 875 struct mlx5_klm *pklm; 876 u32 *out = NULL; 877 size_t offset; 878 int ndescs; 879 880 srcu_key = srcu_read_lock(&dev->odp_srcu); 881 882 io_virt += *bytes_committed; 883 bcnt -= *bytes_committed; 884 885 next_mr: 886 mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); 887 if (!mmkey) { 888 mlx5_ib_dbg( 889 dev, 890 "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", 891 key); 892 if (bytes_mapped) 893 *bytes_mapped += bcnt; 894 /* 895 * The user could specify a SGL with multiple lkeys and only 896 * some of them are ODP. Treat the non-ODP ones as fully 897 * faulted. 898 */ 899 ret = 0; 900 goto srcu_unlock; 901 } 902 if (!mkey_is_eq(mmkey, key)) { 903 mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); 904 ret = -EFAULT; 905 goto srcu_unlock; 906 } 907 908 switch (mmkey->type) { 909 case MLX5_MKEY_MR: 910 mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); 911 912 ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); 913 if (ret < 0) 914 goto srcu_unlock; 915 916 /* 917 * When prefetching a page, page fault is generated 918 * in order to bring the page to the main memory. 919 * In the current flow, page faults are being counted. 920 */ 921 mlx5_update_odp_stats(mr, faults, ret); 922 923 npages += ret; 924 ret = 0; 925 break; 926 927 case MLX5_MKEY_MW: 928 case MLX5_MKEY_INDIRECT_DEVX: 929 ndescs = get_indirect_num_descs(mmkey); 930 931 if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) { 932 mlx5_ib_dbg(dev, "indirection level exceeded\n"); 933 ret = -EFAULT; 934 goto srcu_unlock; 935 } 936 937 outlen = MLX5_ST_SZ_BYTES(query_mkey_out) + 938 sizeof(*pklm) * (ndescs - 2); 939 940 if (outlen > cur_outlen) { 941 kfree(out); 942 out = kzalloc(outlen, GFP_KERNEL); 943 if (!out) { 944 ret = -ENOMEM; 945 goto srcu_unlock; 946 } 947 cur_outlen = outlen; 948 } 949 950 pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out, 951 bsf0_klm0_pas_mtt0_1); 952 953 ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen); 954 if (ret) 955 goto srcu_unlock; 956 957 offset = io_virt - MLX5_GET64(query_mkey_out, out, 958 memory_key_mkey_entry.start_addr); 959 960 for (i = 0; bcnt && i < ndescs; i++, pklm++) { 961 if (offset >= be32_to_cpu(pklm->bcount)) { 962 offset -= be32_to_cpu(pklm->bcount); 963 continue; 964 } 965 966 frame = kzalloc(sizeof(*frame), GFP_KERNEL); 967 if (!frame) { 968 ret = -ENOMEM; 969 goto srcu_unlock; 970 } 971 972 frame->key = be32_to_cpu(pklm->key); 973 frame->io_virt = be64_to_cpu(pklm->va) + offset; 974 frame->bcnt = min_t(size_t, bcnt, 975 be32_to_cpu(pklm->bcount) - offset); 976 frame->depth = depth + 1; 977 frame->next = head; 978 head = frame; 979 980 bcnt -= frame->bcnt; 981 offset = 0; 982 } 983 break; 984 985 default: 986 mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type); 987 ret = -EFAULT; 988 goto srcu_unlock; 989 } 990 991 if (head) { 992 frame = head; 993 head = frame->next; 994 995 key = frame->key; 996 io_virt = frame->io_virt; 997 bcnt = frame->bcnt; 998 depth = frame->depth; 999 kfree(frame); 1000 1001 goto next_mr; 1002 } 1003 1004 srcu_unlock: 1005 while (head) { 1006 frame = head; 1007 head = frame->next; 1008 kfree(frame); 1009 } 1010 kfree(out); 1011 1012 
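	/*
	 * The mkeys looked up above are only guaranteed to stay alive while
	 * the SRCU read lock taken at the top of this function is held; they
	 * must not be touched once it is dropped below.
	 */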
srcu_read_unlock(&dev->odp_srcu, srcu_key); 1013 *bytes_committed = 0; 1014 return ret ? ret : npages; 1015 } 1016 1017 /** 1018 * Parse a series of data segments for page fault handling. 1019 * 1020 * @pfault contains page fault information. 1021 * @wqe points at the first data segment in the WQE. 1022 * @wqe_end points after the end of the WQE. 1023 * @bytes_mapped receives the number of bytes that the function was able to 1024 * map. This allows the caller to decide intelligently whether 1025 * enough memory was mapped to resolve the page fault 1026 * successfully (e.g. enough for the next MTU, or the entire 1027 * WQE). 1028 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus 1029 * the committed bytes). 1030 * 1031 * Returns the number of pages loaded if positive, zero for an empty WQE, or a 1032 * negative error code. 1033 */ 1034 static int pagefault_data_segments(struct mlx5_ib_dev *dev, 1035 struct mlx5_pagefault *pfault, 1036 void *wqe, 1037 void *wqe_end, u32 *bytes_mapped, 1038 u32 *total_wqe_bytes, bool receive_queue) 1039 { 1040 int ret = 0, npages = 0; 1041 u64 io_virt; 1042 u32 key; 1043 u32 byte_count; 1044 size_t bcnt; 1045 int inline_segment; 1046 1047 if (bytes_mapped) 1048 *bytes_mapped = 0; 1049 if (total_wqe_bytes) 1050 *total_wqe_bytes = 0; 1051 1052 while (wqe < wqe_end) { 1053 struct mlx5_wqe_data_seg *dseg = wqe; 1054 1055 io_virt = be64_to_cpu(dseg->addr); 1056 key = be32_to_cpu(dseg->lkey); 1057 byte_count = be32_to_cpu(dseg->byte_count); 1058 inline_segment = !!(byte_count & MLX5_INLINE_SEG); 1059 bcnt = byte_count & ~MLX5_INLINE_SEG; 1060 1061 if (inline_segment) { 1062 bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; 1063 wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, 1064 16); 1065 } else { 1066 wqe += sizeof(*dseg); 1067 } 1068 1069 /* receive WQE end of sg list. */ 1070 if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && 1071 io_virt == 0) 1072 break; 1073 1074 if (!inline_segment && total_wqe_bytes) { 1075 *total_wqe_bytes += bcnt - min_t(size_t, bcnt, 1076 pfault->bytes_committed); 1077 } 1078 1079 /* A zero length data segment designates a length of 2GB. */ 1080 if (bcnt == 0) 1081 bcnt = 1U << 31; 1082 1083 if (inline_segment || bcnt <= pfault->bytes_committed) { 1084 pfault->bytes_committed -= 1085 min_t(size_t, bcnt, 1086 pfault->bytes_committed); 1087 continue; 1088 } 1089 1090 ret = pagefault_single_data_segment(dev, NULL, key, 1091 io_virt, bcnt, 1092 &pfault->bytes_committed, 1093 bytes_mapped); 1094 if (ret < 0) 1095 break; 1096 npages += ret; 1097 } 1098 1099 return ret < 0 ? ret : npages; 1100 } 1101 1102 /* 1103 * Parse initiator WQE. Advances the wqe pointer to point at the 1104 * scatter-gather list, and set wqe_end to the end of the WQE. 1105 */ 1106 static int mlx5_ib_mr_initiator_pfault_handler( 1107 struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, 1108 struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) 1109 { 1110 struct mlx5_wqe_ctrl_seg *ctrl = *wqe; 1111 u16 wqe_index = pfault->wqe.wqe_index; 1112 struct mlx5_base_av *av; 1113 unsigned ds, opcode; 1114 u32 qpn = qp->trans_qp.base.mqp.qpn; 1115 1116 ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; 1117 if (ds * MLX5_WQE_DS_UNITS > wqe_length) { 1118 mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", 1119 ds, wqe_length); 1120 return -EFAULT; 1121 } 1122 1123 if (ds == 0) { 1124 mlx5_ib_err(dev, "Got WQE with zero DS. 
wqe_index=%x, qpn=%x\n", 1125 wqe_index, qpn); 1126 return -EFAULT; 1127 } 1128 1129 *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; 1130 *wqe += sizeof(*ctrl); 1131 1132 opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & 1133 MLX5_WQE_CTRL_OPCODE_MASK; 1134 1135 if (qp->ibqp.qp_type == IB_QPT_XRC_INI) 1136 *wqe += sizeof(struct mlx5_wqe_xrc_seg); 1137 1138 if (qp->ibqp.qp_type == IB_QPT_UD || 1139 qp->qp_sub_type == MLX5_IB_QPT_DCI) { 1140 av = *wqe; 1141 if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) 1142 *wqe += sizeof(struct mlx5_av); 1143 else 1144 *wqe += sizeof(struct mlx5_base_av); 1145 } 1146 1147 switch (opcode) { 1148 case MLX5_OPCODE_RDMA_WRITE: 1149 case MLX5_OPCODE_RDMA_WRITE_IMM: 1150 case MLX5_OPCODE_RDMA_READ: 1151 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 1152 break; 1153 case MLX5_OPCODE_ATOMIC_CS: 1154 case MLX5_OPCODE_ATOMIC_FA: 1155 *wqe += sizeof(struct mlx5_wqe_raddr_seg); 1156 *wqe += sizeof(struct mlx5_wqe_atomic_seg); 1157 break; 1158 } 1159 1160 return 0; 1161 } 1162 1163 /* 1164 * Parse responder WQE and set wqe_end to the end of the WQE. 1165 */ 1166 static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev, 1167 struct mlx5_ib_srq *srq, 1168 void **wqe, void **wqe_end, 1169 int wqe_length) 1170 { 1171 int wqe_size = 1 << srq->msrq.wqe_shift; 1172 1173 if (wqe_size > wqe_length) { 1174 mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); 1175 return -EFAULT; 1176 } 1177 1178 *wqe_end = *wqe + wqe_size; 1179 *wqe += sizeof(struct mlx5_wqe_srq_next_seg); 1180 1181 return 0; 1182 } 1183 1184 static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev, 1185 struct mlx5_ib_qp *qp, 1186 void *wqe, void **wqe_end, 1187 int wqe_length) 1188 { 1189 struct mlx5_ib_wq *wq = &qp->rq; 1190 int wqe_size = 1 << wq->wqe_shift; 1191 1192 if (qp->wq_sig) { 1193 mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); 1194 return -EFAULT; 1195 } 1196 1197 if (wqe_size > wqe_length) { 1198 mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); 1199 return -EFAULT; 1200 } 1201 1202 *wqe_end = wqe + wqe_size; 1203 1204 return 0; 1205 } 1206 1207 static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev, 1208 u32 wq_num, int pf_type) 1209 { 1210 struct mlx5_core_rsc_common *common = NULL; 1211 struct mlx5_core_srq *srq; 1212 1213 switch (pf_type) { 1214 case MLX5_WQE_PF_TYPE_RMP: 1215 srq = mlx5_cmd_get_srq(dev, wq_num); 1216 if (srq) 1217 common = &srq->common; 1218 break; 1219 case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE: 1220 case MLX5_WQE_PF_TYPE_RESP: 1221 case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC: 1222 common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP); 1223 break; 1224 default: 1225 break; 1226 } 1227 1228 return common; 1229 } 1230 1231 static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res) 1232 { 1233 struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res; 1234 1235 return to_mibqp(mqp); 1236 } 1237 1238 static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res) 1239 { 1240 struct mlx5_core_srq *msrq = 1241 container_of(res, struct mlx5_core_srq, common); 1242 1243 return to_mibsrq(msrq); 1244 } 1245 1246 static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, 1247 struct mlx5_pagefault *pfault) 1248 { 1249 bool sq = pfault->type & MLX5_PFAULT_REQUESTOR; 1250 u16 wqe_index = pfault->wqe.wqe_index; 1251 void *wqe, *wqe_start = NULL, *wqe_end = NULL; 1252 u32 bytes_mapped, total_wqe_bytes; 1253 struct mlx5_core_rsc_common *res; 
1254 int resume_with_error = 1; 1255 struct mlx5_ib_qp *qp; 1256 size_t bytes_copied; 1257 int ret = 0; 1258 1259 res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type); 1260 if (!res) { 1261 mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num); 1262 return; 1263 } 1264 1265 if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ && 1266 res->res != MLX5_RES_XSRQ) { 1267 mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n", 1268 pfault->type); 1269 goto resolve_page_fault; 1270 } 1271 1272 wqe_start = (void *)__get_free_page(GFP_KERNEL); 1273 if (!wqe_start) { 1274 mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); 1275 goto resolve_page_fault; 1276 } 1277 1278 wqe = wqe_start; 1279 qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL; 1280 if (qp && sq) { 1281 ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, 1282 &bytes_copied); 1283 if (ret) 1284 goto read_user; 1285 ret = mlx5_ib_mr_initiator_pfault_handler( 1286 dev, pfault, qp, &wqe, &wqe_end, bytes_copied); 1287 } else if (qp && !sq) { 1288 ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, 1289 &bytes_copied); 1290 if (ret) 1291 goto read_user; 1292 ret = mlx5_ib_mr_responder_pfault_handler_rq( 1293 dev, qp, wqe, &wqe_end, bytes_copied); 1294 } else if (!qp) { 1295 struct mlx5_ib_srq *srq = res_to_srq(res); 1296 1297 ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, 1298 &bytes_copied); 1299 if (ret) 1300 goto read_user; 1301 ret = mlx5_ib_mr_responder_pfault_handler_srq( 1302 dev, srq, &wqe, &wqe_end, bytes_copied); 1303 } 1304 1305 if (ret < 0 || wqe >= wqe_end) 1306 goto resolve_page_fault; 1307 1308 ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped, 1309 &total_wqe_bytes, !sq); 1310 if (ret == -EAGAIN) 1311 goto out; 1312 1313 if (ret < 0 || total_wqe_bytes > bytes_mapped) 1314 goto resolve_page_fault; 1315 1316 out: 1317 ret = 0; 1318 resume_with_error = 0; 1319 1320 read_user: 1321 if (ret) 1322 mlx5_ib_err( 1323 dev, 1324 "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", 1325 ret, wqe_index, pfault->token); 1326 1327 resolve_page_fault: 1328 mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); 1329 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", 1330 pfault->wqe.wq_num, resume_with_error, 1331 pfault->type); 1332 mlx5_core_res_put(res); 1333 free_page((unsigned long)wqe_start); 1334 } 1335 1336 static int pages_in_range(u64 address, u32 length) 1337 { 1338 return (ALIGN(address + length, PAGE_SIZE) - 1339 (address & PAGE_MASK)) >> PAGE_SHIFT; 1340 } 1341 1342 static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, 1343 struct mlx5_pagefault *pfault) 1344 { 1345 u64 address; 1346 u32 length; 1347 u32 prefetch_len = pfault->bytes_committed; 1348 int prefetch_activated = 0; 1349 u32 rkey = pfault->rdma.r_key; 1350 int ret; 1351 1352 /* The RDMA responder handler handles the page fault in two parts. 1353 * First it brings the necessary pages for the current packet 1354 * (and uses the pfault context), and then (after resuming the QP) 1355 * prefetches more pages. 
The second operation cannot use the pfault 1356 * context and therefore uses the dummy_pfault context allocated on 1357 * the stack */ 1358 pfault->rdma.rdma_va += pfault->bytes_committed; 1359 pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, 1360 pfault->rdma.rdma_op_len); 1361 pfault->bytes_committed = 0; 1362 1363 address = pfault->rdma.rdma_va; 1364 length = pfault->rdma.rdma_op_len; 1365 1366 /* For some operations, the hardware cannot tell the exact message 1367 * length, and in those cases it reports zero. Use prefetch 1368 * logic. */ 1369 if (length == 0) { 1370 prefetch_activated = 1; 1371 length = pfault->rdma.packet_size; 1372 prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); 1373 } 1374 1375 ret = pagefault_single_data_segment(dev, NULL, rkey, address, length, 1376 &pfault->bytes_committed, NULL); 1377 if (ret == -EAGAIN) { 1378 /* We're racing with an invalidation, don't prefetch */ 1379 prefetch_activated = 0; 1380 } else if (ret < 0 || pages_in_range(address, length) > ret) { 1381 mlx5_ib_page_fault_resume(dev, pfault, 1); 1382 if (ret != -ENOENT) 1383 mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", 1384 ret, pfault->token, pfault->type); 1385 return; 1386 } 1387 1388 mlx5_ib_page_fault_resume(dev, pfault, 0); 1389 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", 1390 pfault->token, pfault->type, 1391 prefetch_activated); 1392 1393 /* At this point, there might be a new pagefault already arriving in 1394 * the eq, switch to the dummy pagefault for the rest of the 1395 * processing. We're still OK with the objects being alive as the 1396 * work-queue is being fenced. */ 1397 1398 if (prefetch_activated) { 1399 u32 bytes_committed = 0; 1400 1401 ret = pagefault_single_data_segment(dev, NULL, rkey, address, 1402 prefetch_len, 1403 &bytes_committed, NULL); 1404 if (ret < 0 && ret != -EAGAIN) { 1405 mlx5_ib_dbg(dev, "Prefetch failed. 
ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", 1406 ret, pfault->token, address, prefetch_len); 1407 } 1408 } 1409 } 1410 1411 static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) 1412 { 1413 u8 event_subtype = pfault->event_subtype; 1414 1415 switch (event_subtype) { 1416 case MLX5_PFAULT_SUBTYPE_WQE: 1417 mlx5_ib_mr_wqe_pfault_handler(dev, pfault); 1418 break; 1419 case MLX5_PFAULT_SUBTYPE_RDMA: 1420 mlx5_ib_mr_rdma_pfault_handler(dev, pfault); 1421 break; 1422 default: 1423 mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", 1424 event_subtype); 1425 mlx5_ib_page_fault_resume(dev, pfault, 1); 1426 } 1427 } 1428 1429 static void mlx5_ib_eqe_pf_action(struct work_struct *work) 1430 { 1431 struct mlx5_pagefault *pfault = container_of(work, 1432 struct mlx5_pagefault, 1433 work); 1434 struct mlx5_ib_pf_eq *eq = pfault->eq; 1435 1436 mlx5_ib_pfault(eq->dev, pfault); 1437 mempool_free(pfault, eq->pool); 1438 } 1439 1440 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) 1441 { 1442 struct mlx5_eqe_page_fault *pf_eqe; 1443 struct mlx5_pagefault *pfault; 1444 struct mlx5_eqe *eqe; 1445 int cc = 0; 1446 1447 while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) { 1448 pfault = mempool_alloc(eq->pool, GFP_ATOMIC); 1449 if (!pfault) { 1450 schedule_work(&eq->work); 1451 break; 1452 } 1453 1454 pf_eqe = &eqe->data.page_fault; 1455 pfault->event_subtype = eqe->sub_type; 1456 pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); 1457 1458 mlx5_ib_dbg(eq->dev, 1459 "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", 1460 eqe->sub_type, pfault->bytes_committed); 1461 1462 switch (eqe->sub_type) { 1463 case MLX5_PFAULT_SUBTYPE_RDMA: 1464 /* RDMA based event */ 1465 pfault->type = 1466 be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; 1467 pfault->token = 1468 be32_to_cpu(pf_eqe->rdma.pftype_token) & 1469 MLX5_24BIT_MASK; 1470 pfault->rdma.r_key = 1471 be32_to_cpu(pf_eqe->rdma.r_key); 1472 pfault->rdma.packet_size = 1473 be16_to_cpu(pf_eqe->rdma.packet_length); 1474 pfault->rdma.rdma_op_len = 1475 be32_to_cpu(pf_eqe->rdma.rdma_op_len); 1476 pfault->rdma.rdma_va = 1477 be64_to_cpu(pf_eqe->rdma.rdma_va); 1478 mlx5_ib_dbg(eq->dev, 1479 "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", 1480 pfault->type, pfault->token, 1481 pfault->rdma.r_key); 1482 mlx5_ib_dbg(eq->dev, 1483 "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", 1484 pfault->rdma.rdma_op_len, 1485 pfault->rdma.rdma_va); 1486 break; 1487 1488 case MLX5_PFAULT_SUBTYPE_WQE: 1489 /* WQE based event */ 1490 pfault->type = 1491 (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; 1492 pfault->token = 1493 be32_to_cpu(pf_eqe->wqe.token); 1494 pfault->wqe.wq_num = 1495 be32_to_cpu(pf_eqe->wqe.pftype_wq) & 1496 MLX5_24BIT_MASK; 1497 pfault->wqe.wqe_index = 1498 be16_to_cpu(pf_eqe->wqe.wqe_index); 1499 pfault->wqe.packet_size = 1500 be16_to_cpu(pf_eqe->wqe.packet_length); 1501 mlx5_ib_dbg(eq->dev, 1502 "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", 1503 pfault->type, pfault->token, 1504 pfault->wqe.wq_num, 1505 pfault->wqe.wqe_index); 1506 break; 1507 1508 default: 1509 mlx5_ib_warn(eq->dev, 1510 "Unsupported page fault event sub-type: 0x%02hhx\n", 1511 eqe->sub_type); 1512 /* Unsupported page faults should still be 1513 * resolved by the page fault handler 1514 */ 1515 } 1516 1517 pfault->eq = eq; 1518 INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action); 1519 queue_work(eq->wq, &pfault->work); 1520 1521 cc = mlx5_eq_update_cc(eq->core, ++cc); 1522 } 1523 
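	/* Publish the consumer counter for the polled EQEs and re-arm the EQ */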
1524 mlx5_eq_update_ci(eq->core, cc, 1); 1525 } 1526 1527 static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type, 1528 void *data) 1529 { 1530 struct mlx5_ib_pf_eq *eq = 1531 container_of(nb, struct mlx5_ib_pf_eq, irq_nb); 1532 unsigned long flags; 1533 1534 if (spin_trylock_irqsave(&eq->lock, flags)) { 1535 mlx5_ib_eq_pf_process(eq); 1536 spin_unlock_irqrestore(&eq->lock, flags); 1537 } else { 1538 schedule_work(&eq->work); 1539 } 1540 1541 return IRQ_HANDLED; 1542 } 1543 1544 /* mempool_refill() was proposed but unfortunately wasn't accepted 1545 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html 1546 * Cheap workaround. 1547 */ 1548 static void mempool_refill(mempool_t *pool) 1549 { 1550 while (pool->curr_nr < pool->min_nr) 1551 mempool_free(mempool_alloc(pool, GFP_KERNEL), pool); 1552 } 1553 1554 static void mlx5_ib_eq_pf_action(struct work_struct *work) 1555 { 1556 struct mlx5_ib_pf_eq *eq = 1557 container_of(work, struct mlx5_ib_pf_eq, work); 1558 1559 mempool_refill(eq->pool); 1560 1561 spin_lock_irq(&eq->lock); 1562 mlx5_ib_eq_pf_process(eq); 1563 spin_unlock_irq(&eq->lock); 1564 } 1565 1566 enum { 1567 MLX5_IB_NUM_PF_EQE = 0x1000, 1568 MLX5_IB_NUM_PF_DRAIN = 64, 1569 }; 1570 1571 static int 1572 mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) 1573 { 1574 struct mlx5_eq_param param = {}; 1575 int err; 1576 1577 INIT_WORK(&eq->work, mlx5_ib_eq_pf_action); 1578 spin_lock_init(&eq->lock); 1579 eq->dev = dev; 1580 1581 eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN, 1582 sizeof(struct mlx5_pagefault)); 1583 if (!eq->pool) 1584 return -ENOMEM; 1585 1586 eq->wq = alloc_workqueue("mlx5_ib_page_fault", 1587 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, 1588 MLX5_NUM_CMD_EQE); 1589 if (!eq->wq) { 1590 err = -ENOMEM; 1591 goto err_mempool; 1592 } 1593 1594 eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int; 1595 param = (struct mlx5_eq_param) { 1596 .irq_index = 0, 1597 .nent = MLX5_IB_NUM_PF_EQE, 1598 }; 1599 param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT; 1600 eq->core = mlx5_eq_create_generic(dev->mdev, ¶m); 1601 if (IS_ERR(eq->core)) { 1602 err = PTR_ERR(eq->core); 1603 goto err_wq; 1604 } 1605 err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb); 1606 if (err) { 1607 mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err); 1608 goto err_eq; 1609 } 1610 1611 return 0; 1612 err_eq: 1613 mlx5_eq_destroy_generic(dev->mdev, eq->core); 1614 err_wq: 1615 destroy_workqueue(eq->wq); 1616 err_mempool: 1617 mempool_destroy(eq->pool); 1618 return err; 1619 } 1620 1621 static int 1622 mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) 1623 { 1624 int err; 1625 1626 mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb); 1627 err = mlx5_eq_destroy_generic(dev->mdev, eq->core); 1628 cancel_work_sync(&eq->work); 1629 destroy_workqueue(eq->wq); 1630 mempool_destroy(eq->pool); 1631 1632 return err; 1633 } 1634 1635 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) 1636 { 1637 if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1638 return; 1639 1640 switch (ent->order - 2) { 1641 case MLX5_IMR_MTT_CACHE_ENTRY: 1642 ent->page = PAGE_SHIFT; 1643 ent->xlt = MLX5_IMR_MTT_ENTRIES * 1644 sizeof(struct mlx5_mtt) / 1645 MLX5_IB_UMR_OCTOWORD; 1646 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; 1647 ent->limit = 0; 1648 break; 1649 1650 case MLX5_IMR_KSM_CACHE_ENTRY: 1651 ent->page = MLX5_KSM_PAGE_SHIFT; 1652 ent->xlt = mlx5_imr_ksm_entries * 1653 sizeof(struct mlx5_klm) / 1654 MLX5_IB_UMR_OCTOWORD; 1655 
		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
		ent->limit = 0;
		break;
	}
}

static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
	.advise_mr = mlx5_ib_advise_mr,
};

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	int ret = 0;

	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return ret;

	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);

	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
			return ret;
		}
	}

	ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);

	return ret;
}

void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return;

	mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
}

int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}

struct prefetch_mr_work {
	struct work_struct work;
	u32 pf_flags;
	u32 num_sge;
	struct {
		u64 io_virt;
		struct mlx5_ib_mr *mr;
		size_t length;
	} frags[];
};

static void destroy_prefetch_work(struct prefetch_mr_work *work)
{
	u32 i;

	for (i = 0; i < work->num_sge; ++i)
		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
			wake_up(&work->frags[i].mr->q_deferred_work);
	kvfree(work);
}

static struct mlx5_ib_mr *
get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
		    u32 lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_core_mkey *mmkey;
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;

	lockdep_assert_held(&dev->odp_srcu);

	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
		return NULL;

	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

	if (mr->ibmr.pd != pd)
		return NULL;

	odp = to_ib_umem_odp(mr->umem);

	/* prefetch with write-access must be supported by the MR */
	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    !odp->umem.writable)
		return NULL;

	return mr;
}

static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
{
	struct prefetch_mr_work *work =
		container_of(w, struct prefetch_mr_work, work);
	u32 bytes_mapped = 0;
	u32 i;

	for (i = 0; i < work->num_sge; ++i)
		pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
			     work->frags[i].length, &bytes_mapped,
			     work->pf_flags);

	destroy_prefetch_work(work);
}

static bool init_prefetch_work(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 pf_flags, struct prefetch_mr_work *work,
			       struct ib_sge *sg_list, u32 num_sge)
{
	u32 i;

	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
	work->pf_flags = pf_flags;

	for (i = 0; i < num_sge; ++i) {
		work->frags[i].io_virt = sg_list[i].addr;
		work->frags[i].length = sg_list[i].length;
		work->frags[i].mr =
			get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (!work->frags[i].mr) {
			/*
			 * Drop the num_deferred_work references taken for
			 * frags[0..i-1] and free the work itself; the caller
			 * only unlocks the SRCU and returns an error.
			 */
			work->num_sge = i;
			destroy_prefetch_work(work);
			return false;
		}

		/* Keep the MR pointer valid outside the SRCU */
		atomic_inc(&work->frags[i].mr->num_deferred_work);
	}
	work->num_sge = num_sge;
	return true;
}

static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
				    enum ib_uverbs_advise_mr_advice advice,
				    u32 pf_flags, struct ib_sge *sg_list,
				    u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 bytes_mapped = 0;
	int srcu_key;
	int ret = 0;
	u32 i;

	srcu_key = srcu_read_lock(&dev->odp_srcu);
	for (i = 0; i < num_sge; ++i) {
		struct mlx5_ib_mr *mr;

		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (!mr) {
			ret = -ENOENT;
			goto out;
		}
		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
				   &bytes_mapped, pf_flags);
		if (ret < 0)
			goto out;
	}
	ret = 0;

out:
	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	return ret;
}

int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 pf_flags = 0;
	struct prefetch_mr_work *work;
	int srcu_key;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;

	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
						num_sge);

	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	srcu_key = srcu_read_lock(&dev->odp_srcu);
	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
		srcu_read_unlock(&dev->odp_srcu, srcu_key);
		return -EINVAL;
	}
	queue_work(system_unbound_wq, &work->work);
	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	return 0;
}
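
/*
 * Summary of the page-fault flow implemented above:
 *
 *   mlx5_ib_eq_pf_int() / mlx5_ib_eq_pf_action()
 *     -> mlx5_ib_eq_pf_process()        parses EQEs, queues per-fault work
 *       -> mlx5_ib_eqe_pf_action()
 *         -> mlx5_ib_pfault()
 *           -> mlx5_ib_mr_wqe_pfault_handler()
 *                -> pagefault_data_segments()
 *                     -> pagefault_single_data_segment()
 *              or mlx5_ib_mr_rdma_pfault_handler()
 *                -> pagefault_single_data_segment()
 *                     -> pagefault_mr()
 *                          -> pagefault_real_mr() / pagefault_implicit_mr()
 *           both handlers finish with mlx5_ib_page_fault_resume()
 *
 * Prefetch requests arrive through mlx5_ib_advise_mr_prefetch() (wired up via
 * mlx5_ib_dev_odp_ops.advise_mr) and reuse pagefault_mr() either directly or
 * from the deferred prefetch_mr_work.
 */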