/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "mlx5_ib.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	int in_block = 0;
	u64 addr;

	if (!umem || !umem->odp_data) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}

	mr = umem->odp_data->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem), start);
	end = min_t(u64, ib_umem_end(umem), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */

	for (addr = start; addr < end; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem->odp_data->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx,
						   PAGE_SHIFT,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1,
				   PAGE_SHIFT,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);
	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem, start, end);
}

/* Report the device's ODP capabilities to the IB core. */
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
}

/*
 * Look up a live ODP MR by its lkey. Returns NULL if the key does not match
 * an MR, or if the MR is being destroyed.
 */
static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
						   u32 key)
{
	u32 base_key = mlx5_base_mkey(key);
	struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key);
	struct mlx5_ib_mr *mr;

	if (!mmkey || mmkey->key != key || mmkey->type != MLX5_MKEY_MR)
		return NULL;

	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

	if (!mr->live)
		return NULL;

	return mr;
}

/* Report to the device whether the page fault was resolved or not. */
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	int ret = mlx5_core_page_fault_resume(dev->mdev,
					      pfault->token,
					      wq_num,
					      pfault->type,
					      error);
	if (ret)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
			    wq_num);
}

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
					 u32 key, u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int srcu_key;
	unsigned int current_seq;
	u64 start_idx;
	int npages = 0, ret = 0;
	struct mlx5_ib_mr *mr;
	u64 access_mask = ODP_READ_ALLOWED_BIT;

	srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
	mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
	/*
	 * If we didn't find the MR, it means the MR was closed while we were
	 * handling the ODP event. In this case we return -EFAULT so that the
	 * QP will be closed.
	 */
	if (!mr || !mr->ibmr.pd) {
		pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
		       key);
		ret = -EFAULT;
		goto srcu_unlock;
	}
	if (!mr->umem->odp_data) {
		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
			 key);
		if (bytes_mapped)
			*bytes_mapped +=
				(bcnt - *bytes_committed);
		goto srcu_unlock;
	}

	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	/*
	 * Avoid branches - this code will perform correctly
	 * in all iterations (in iteration 2 and above,
	 * bytes_committed == 0).
	 */
	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;

	if (mr->umem->writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;
	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
					   access_mask, current_seq);
	if (npages < 0) {
		ret = npages;
		goto srcu_unlock;
	}

	if (npages > 0) {
		mutex_lock(&mr->umem->odp_data->umem_mutex);
		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
			/*
			 * No need to check whether the MTTs really belong to
			 * this MR, since ib_umem_odp_map_dma_pages already
			 * checks this.
			 */
			ret = mlx5_ib_update_xlt(mr, start_idx, npages,
						 PAGE_SHIFT,
						 MLX5_IB_UPD_XLT_ATOMIC);
		} else {
			ret = -EAGAIN;
		}
		mutex_unlock(&mr->umem->odp_data->umem_mutex);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_err("Failed to update mkey page tables\n");
			goto srcu_unlock;
		}

		if (bytes_mapped) {
			u32 new_mappings = npages * PAGE_SIZE -
				(io_virt - round_down(io_virt, PAGE_SIZE));
			*bytes_mapped += min_t(u32, new_mappings, bcnt);
		}
	}

srcu_unlock:
	if (ret == -EAGAIN) {
		if (!mr->umem->odp_data->dying) {
			struct ib_umem_odp *odp_data = mr->umem->odp_data;
			unsigned long timeout =
				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

			if (!wait_for_completion_timeout(
					&odp_data->notifier_completion,
					timeout)) {
				pr_warn("timeout waiting for mmu notifier completion\n");
			}
		} else {
			/* The MR is being killed, kill the QP as well. */
			ret = -EFAULT;
		}
	}
	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *		map. This allows the caller to decide intelligently whether
 *		enough memory was mapped to resolve the page fault
 *		successfully (e.g. enough for the next MTU, or the entire
 *		WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *		the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   struct mlx5_ib_qp *qp, void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, int receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	/* Skip SRQ next-WQE segment. */
	if (receive_queue && qp->ibqp.srq)
		wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

static const u32 mlx5_ib_odp_opcode_cap[] = {
	[MLX5_OPCODE_SEND]		= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_IMM]		= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_INVAL]	= IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_RDMA_WRITE]	= IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_WRITE_IMM]	= IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_READ]		= IB_ODP_SUPPORT_READ,
	[MLX5_OPCODE_ATOMIC_CS]		= IB_ODP_SUPPORT_ATOMIC,
	[MLX5_OPCODE_ATOMIC_FA]		= IB_ODP_SUPPORT_ATOMIC,
};

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	u32 transport_caps;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			  MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			  MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qpn,
			    ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		   MLX5_WQE_CTRL_QPN_SHIFT;
	if (qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qpn,
			    ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
		break;
	case IB_QPT_UD:
		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
		break;
	default:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) /
	    sizeof(mlx5_ib_odp_opcode_cap[0]) ||
	    !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
			    opcode);
		return -EFAULT;
	}

	if (qp->ibqp.qp_type != IB_QPT_RC) {
		av = *wqe;
		if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->ibqp.srq) {
		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
		return -EFAULT;
	}

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;

	return 0;
}

/* Look up the QP on which a WQE page fault occurred by its WQ number. */
static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
					      u32 wq_num)
{
	struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);

	if (!mqp) {
		mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
		return NULL;
	}

	return to_mibqp(mqp);
}

/*
 * Handle a WQE page fault: read the faulting WQE from user memory, parse its
 * data segments, and fault in the referenced pages before resuming the QP.
 */
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	int ret;
	void *wqe, *wqe_end;
	u32 bytes_mapped, total_wqe_bytes;
	char *buffer = NULL;
	int resume_with_error = 1;
	u16 wqe_index = pfault->wqe.wqe_index;
	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
	struct mlx5_ib_qp *qp;

	buffer = (char *)__get_free_page(GFP_KERNEL);
	if (!buffer) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
	if (!qp)
		goto resolve_page_fault;

	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
				    PAGE_SIZE, &qp->trans_qp.base);
	if (ret < 0) {
		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
			    ret, wqe_index, pfault->token);
		goto resolve_page_fault;
	}

	wqe = buffer;
	if (requestor)
		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	else
		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
							  &wqe_end, ret);
	if (ret < 0)
		goto resolve_page_fault;

	if (wqe >= wqe_end) {
		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
		goto resolve_page_fault;
	}

	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
				      &bytes_mapped, &total_wqe_bytes,
				      !requestor);
	if (ret == -EAGAIN) {
		resume_with_error = 0;
		goto resolve_page_fault;
	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
		if (ret != -ENOENT)
			mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n",
				    ret);
		goto resolve_page_fault;
	}

	resume_with_error = 0;
resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->token, resume_with_error,
		    pfault->type);
	free_page((unsigned long)buffer);
}

/* Number of pages spanned by [address, address + length). */
static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
The second operation cannot use the pfault 655 * context and therefore uses the dummy_pfault context allocated on 656 * the stack */ 657 pfault->rdma.rdma_va += pfault->bytes_committed; 658 pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, 659 pfault->rdma.rdma_op_len); 660 pfault->bytes_committed = 0; 661 662 address = pfault->rdma.rdma_va; 663 length = pfault->rdma.rdma_op_len; 664 665 /* For some operations, the hardware cannot tell the exact message 666 * length, and in those cases it reports zero. Use prefetch 667 * logic. */ 668 if (length == 0) { 669 prefetch_activated = 1; 670 length = pfault->rdma.packet_size; 671 prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); 672 } 673 674 ret = pagefault_single_data_segment(dev, rkey, address, length, 675 &pfault->bytes_committed, NULL); 676 if (ret == -EAGAIN) { 677 /* We're racing with an invalidation, don't prefetch */ 678 prefetch_activated = 0; 679 } else if (ret < 0 || pages_in_range(address, length) > ret) { 680 mlx5_ib_page_fault_resume(dev, pfault, 1); 681 if (ret != -ENOENT) 682 mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", 683 ret, pfault->token, pfault->type); 684 return; 685 } 686 687 mlx5_ib_page_fault_resume(dev, pfault, 0); 688 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", 689 pfault->token, pfault->type, 690 prefetch_activated); 691 692 /* At this point, there might be a new pagefault already arriving in 693 * the eq, switch to the dummy pagefault for the rest of the 694 * processing. We're still OK with the objects being alive as the 695 * work-queue is being fenced. */ 696 697 if (prefetch_activated) { 698 u32 bytes_committed = 0; 699 700 ret = pagefault_single_data_segment(dev, rkey, address, 701 prefetch_len, 702 &bytes_committed, NULL); 703 if (ret < 0) { 704 mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", 705 ret, pfault->token, address, 706 prefetch_len); 707 } 708 } 709 } 710 711 void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, 712 struct mlx5_pagefault *pfault) 713 { 714 struct mlx5_ib_dev *dev = context; 715 u8 event_subtype = pfault->event_subtype; 716 717 switch (event_subtype) { 718 case MLX5_PFAULT_SUBTYPE_WQE: 719 mlx5_ib_mr_wqe_pfault_handler(dev, pfault); 720 break; 721 case MLX5_PFAULT_SUBTYPE_RDMA: 722 mlx5_ib_mr_rdma_pfault_handler(dev, pfault); 723 break; 724 default: 725 mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", 726 event_subtype); 727 mlx5_ib_page_fault_resume(dev, pfault, 1); 728 } 729 } 730 731 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) 732 { 733 int ret; 734 735 ret = init_srcu_struct(&ibdev->mr_srcu); 736 if (ret) 737 return ret; 738 739 return 0; 740 } 741 742 void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) 743 { 744 cleanup_srcu_struct(&ibdev->mr_srcu); 745 } 746 747