/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "mlx5_ib.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

struct workqueue_struct *mlx5_ib_page_fault_wq;

void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	int in_block = 0;
	u64 addr;

	if (!umem || !umem->odp_data) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}

	mr = umem->odp_data->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem), start);
	end = min_t(u64, ib_umem_end(umem), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */

	for (addr = start; addr < end; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem->odp_data->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_mtt(mr, blk_start_idx,
						   idx - blk_start_idx, 1);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
				   1);

	/*
	 * We are now sure that the device will not access the
	 * memory.
	 * We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem, start, end);
}

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	return;
}

static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
						   u32 key)
{
	u32 base_key = mlx5_base_mkey(key);
	struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
	struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);

	if (!mmr || mmr->key != key || !mr->live)
		return NULL;

	return container_of(mmr, struct mlx5_ib_mr, mmr);
}

static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
				      struct mlx5_ib_pfault *pfault,
				      int error) {
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
					      pfault->mpfault.flags,
					      error);
	if (ret)
		pr_err("Failed to resolve the page fault on QP 0x%x\n",
		       qp->mqp.qpn);
}

/*
 * Handle a single data segment in a page-fault WQE.
 *
 * Returns number of pages retrieved on success. The caller will continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling and possibly move the QP to an error state.
 * On other errors the QP should also be closed with an error.
 */
static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
					 struct mlx5_ib_pfault *pfault,
					 u32 key, u64 io_virt, size_t bcnt,
					 u32 *bytes_mapped)
{
	struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
	int srcu_key;
	unsigned int current_seq;
	u64 start_idx;
	int npages = 0, ret = 0;
	struct mlx5_ib_mr *mr;
	u64 access_mask = ODP_READ_ALLOWED_BIT;

	srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
	mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
	/*
	 * If we didn't find the MR, it means the MR was closed while we were
	 * handling the ODP event. In this case we return -EFAULT so that the
	 * QP will be closed.
	 */
	if (!mr || !mr->ibmr.pd) {
		pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
		       key);
		ret = -EFAULT;
		goto srcu_unlock;
	}
	if (!mr->umem->odp_data) {
		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
			 key);
		if (bytes_mapped)
			*bytes_mapped +=
				(bcnt - pfault->mpfault.bytes_committed);
		goto srcu_unlock;
	}
	if (mr->ibmr.pd != qp->ibqp.pd) {
		pr_err("Page-fault with different PDs for QP and MR.\n");
		ret = -EFAULT;
		goto srcu_unlock;
	}

	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	/*
	 * Avoid branches - this code will perform correctly
	 * in all iterations (in iteration 2 and above,
	 * bytes_committed == 0).
	 */
	io_virt += pfault->mpfault.bytes_committed;
	bcnt -= pfault->mpfault.bytes_committed;

	start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT;

	if (mr->umem->writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;
	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
					   access_mask, current_seq);
	if (npages < 0) {
		ret = npages;
		goto srcu_unlock;
	}

	if (npages > 0) {
		mutex_lock(&mr->umem->odp_data->umem_mutex);
		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
			/*
			 * No need to check whether the MTTs really belong to
			 * this MR, since ib_umem_odp_map_dma_pages already
			 * checks this.
			 */
			ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
		} else {
			ret = -EAGAIN;
		}
		mutex_unlock(&mr->umem->odp_data->umem_mutex);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_err("Failed to update mkey page tables\n");
			goto srcu_unlock;
		}

		if (bytes_mapped) {
			u32 new_mappings = npages * PAGE_SIZE -
				(io_virt - round_down(io_virt, PAGE_SIZE));
			*bytes_mapped += min_t(u32, new_mappings, bcnt);
		}
	}

srcu_unlock:
	if (ret == -EAGAIN) {
		if (!mr->umem->odp_data->dying) {
			struct ib_umem_odp *odp_data = mr->umem->odp_data;
			unsigned long timeout =
				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

			if (!wait_for_completion_timeout(
					&odp_data->notifier_completion,
					timeout)) {
				pr_warn("timeout waiting for mmu notifier completion\n");
			}
		} else {
			/* The MR is being killed, kill the QP as well. */
			ret = -EFAULT;
		}
	}
	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
	pfault->mpfault.bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_qp *qp,
				   struct mlx5_ib_pfault *pfault, void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, int receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	/* Skip SRQ next-WQE segment. */
	if (receive_queue && qp->ibqp.srq)
		wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->mpfault.bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
			pfault->mpfault.bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->mpfault.bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
						    bcnt, bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
	void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qp->mqp.qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qp->mqp.qpn,
			    ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		MLX5_WQE_CTRL_QPN_SHIFT;
	if (qp->mqp.qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qp->mqp.qpn,
			    ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;
	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		switch (opcode) {
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_IMM:
		case MLX5_OPCODE_SEND_INVAL:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_SEND))
				goto invalid_transport_or_opcode;
			break;
		case MLX5_OPCODE_RDMA_WRITE:
		case MLX5_OPCODE_RDMA_WRITE_IMM:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_WRITE))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
			break;
		case MLX5_OPCODE_RDMA_READ:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_READ))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
			break;
		default:
			goto invalid_transport_or_opcode;
		}
		break;
	case IB_QPT_UD:
		switch (opcode) {
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_IMM:
			if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
			      IB_ODP_SUPPORT_SEND))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_datagram_seg);
			break;
		default:
			goto invalid_transport_or_opcode;
		}
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
			    qp->ibqp.qp_type, opcode);
		return -EFAULT;
	}

	return 0;
}

/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
	void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->ibqp.srq) {
		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
		return -EFAULT;
	}

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;

	return 0;
}

static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
					  struct mlx5_ib_pfault *pfault)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	int ret;
	void *wqe, *wqe_end;
	u32 bytes_mapped, total_wqe_bytes;
	char *buffer = NULL;
	int resume_with_error = 0;
	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
	int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;

	buffer = (char *)__get_free_page(GFP_KERNEL);
	if (!buffer) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
				    PAGE_SIZE);
	if (ret < 0) {
		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
			    -ret, wqe_index, qp->mqp.qpn);
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	wqe = buffer;
	if (requestor)
		ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
							  &wqe_end, ret);
	else
		ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
							  &wqe_end, ret);
	if (ret < 0) {
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	if (wqe >= wqe_end) {
		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !requestor);
	if (ret == -EAGAIN) {
		goto resolve_page_fault;
	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
		mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
			    -ret);
		resume_with_error = 1;
		goto resolve_page_fault;
	}

resolve_page_fault:
	mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
		    qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);

	free_page((unsigned long)buffer);
}

static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
					   struct mlx5_ib_pfault *pfault)
{
	struct mlx5_pagefault *mpfault = &pfault->mpfault;
	u64 address;
	u32 length;
	u32 prefetch_len = mpfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = mpfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack. */
	struct mlx5_ib_pfault dummy_pfault = {};

	dummy_pfault.mpfault.bytes_committed = 0;

	mpfault->rdma.rdma_va += mpfault->bytes_committed;
	mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
					 mpfault->rdma.rdma_op_len);
	mpfault->bytes_committed = 0;

	address = mpfault->rdma.rdma_va;
	length  = mpfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero.
	 * Use the prefetch logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = mpfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
					    NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(qp, pfault, 1);
		return;
	}

	mlx5_ib_page_fault_resume(qp, pfault, 0);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
						    address,
						    prefetch_len,
						    NULL);
		if (ret < 0) {
			pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
				ret, prefetch_activated,
				qp->ibqp.qp_num, address, prefetch_len);
		}
	}
}

void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
			       struct mlx5_ib_pfault *pfault)
{
	u8 event_subtype = pfault->mpfault.event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
		break;
	default:
		pr_warn("Invalid page fault event subtype: 0x%x\n",
			event_subtype);
		mlx5_ib_page_fault_resume(qp, pfault, 1);
		break;
	}
}

static void mlx5_ib_qp_pfault_action(struct work_struct *work)
{
	struct mlx5_ib_pfault *pfault = container_of(work,
						     struct mlx5_ib_pfault,
						     work);
	enum mlx5_ib_pagefault_context context =
		mlx5_ib_get_pagefault_context(&pfault->mpfault);
	struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
					     pagefaults[context]);
	mlx5_ib_mr_pfault_handler(qp, pfault);
}

void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
{
	unsigned long flags;

	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
	qp->disable_page_faults = 1;
	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);

	/*
	 * Note that at this point, we are guaranteed that no more
	 * work queue elements will be posted to the work queue with
	 * the QP we are closing.
	 */
	flush_workqueue(mlx5_ib_page_fault_wq);
}

void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
{
	unsigned long flags;

	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
	qp->disable_page_faults = 0;
	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
}

static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
				   struct mlx5_pagefault *pfault)
{
	/*
	 * Note that we will only get one fault event per QP per context
	 * (responder/initiator, read/write), until we resolve the page fault
	 * with the mlx5_ib_page_fault_resume command. Since this function is
	 * called from within the work element, there is no risk of missing
	 * events.
	 */
	struct mlx5_ib_qp *mibqp = to_mibqp(qp);
	enum mlx5_ib_pagefault_context context =
		mlx5_ib_get_pagefault_context(pfault);
	struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];

	qp_pfault->mpfault = *pfault;

	/* No need to stop interrupts here since we are in an interrupt */
	spin_lock(&mibqp->disable_page_faults_lock);
	if (!mibqp->disable_page_faults)
		queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
	spin_unlock(&mibqp->disable_page_faults_lock);
}

void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
{
	int i;

	qp->disable_page_faults = 1;
	spin_lock_init(&qp->disable_page_faults_lock);

	qp->mqp.pfault_handler = mlx5_ib_pfault_handler;

	for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
		INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
}

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
{
	int ret;

	ret = init_srcu_struct(&ibdev->mr_srcu);
	if (ret)
		return ret;

	return 0;
}

void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
{
	cleanup_srcu_struct(&ibdev->mr_srcu);
}

int __init mlx5_ib_odp_init(void)
{
	mlx5_ib_page_fault_wq =
		create_singlethread_workqueue("mlx5_ib_page_faults");
	if (!mlx5_ib_page_fault_wq)
		return -ENOMEM;

	return 0;
}

void mlx5_ib_odp_cleanup(void)
{
	destroy_workqueue(mlx5_ib_page_fault_wq);
}