/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/pagemap.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
{
	mutex_lock(&umem_odp->umem_mutex);
	if (umem_odp->notifiers_count++ == 0)
		/*
		 * Initialize the completion object for waiting on
		 * notifiers. Since notifiers_count is zero, no one should be
		 * waiting right now.
		 */
		reinit_completion(&umem_odp->notifier_completion);
	mutex_unlock(&umem_odp->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
{
	mutex_lock(&umem_odp->umem_mutex);
	/*
	 * This sequence increase will notify the QP page fault handler that
	 * the page it is about to map into the spte could have been freed.
	 */
	++umem_odp->notifiers_seq;
	if (--umem_odp->notifiers_count == 0)
		complete_all(&umem_odp->notifier_completion);
	mutex_unlock(&umem_odp->umem_mutex);
}

static void ib_umem_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);
	struct rb_node *node;

	down_read(&per_mm->umem_rwsem);
	if (!per_mm->active)
		goto out;

	for (node = rb_first_cached(&per_mm->umem_tree); node;
	     node = rb_next(node)) {
		struct ib_umem_odp *umem_odp =
			rb_entry(node, struct ib_umem_odp, interval_tree.rb);

		/*
		 * Increase the number of notifiers running, to prevent any
		 * further fault handling on this MR.
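		 * Completing notifier_completion here also releases any
		 * page-fault handler that is already blocked waiting for a
		 * running invalidation to finish; the owning process is
		 * exiting, so such faults cannot succeed anyway.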
		 */
		ib_umem_notifier_start_account(umem_odp);
		complete_all(&umem_odp->notifier_completion);
		umem_odp->umem.context->invalidate_range(
			umem_odp, ib_umem_start(umem_odp),
			ib_umem_end(umem_odp));
	}

out:
	up_read(&per_mm->umem_rwsem);
}

static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
					     u64 start, u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->umem.context->invalidate_range(item, start, end);
	return 0;
}

static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);
	int rc;

	if (mmu_notifier_range_blockable(range))
		down_read(&per_mm->umem_rwsem);
	else if (!down_read_trylock(&per_mm->umem_rwsem))
		return -EAGAIN;

	if (!per_mm->active) {
		up_read(&per_mm->umem_rwsem);
		/*
		 * At this point active is permanently set and visible to this
		 * CPU without a lock; that fact is relied on to skip the
		 * unlock in range_end.
		 */
		return 0;
	}

	rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
					   range->end,
					   invalidate_range_start_trampoline,
					   mmu_notifier_range_blockable(range),
					   NULL);
	if (rc)
		up_read(&per_mm->umem_rwsem);
	return rc;
}

static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
					   u64 end, void *cookie)
{
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
{
	struct ib_ucontext_per_mm *per_mm =
		container_of(mn, struct ib_ucontext_per_mm, mn);

	if (unlikely(!per_mm->active))
		return;

	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
				      range->end,
				      invalidate_range_end_trampoline, true,
				      NULL);
	up_read(&per_mm->umem_rwsem);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
	.release                    = ib_umem_notifier_release,
	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
};

static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;

	down_write(&per_mm->umem_rwsem);
	interval_tree_remove(&umem_odp->interval_tree, &per_mm->umem_tree);
	complete_all(&umem_odp->notifier_completion);
	up_write(&per_mm->umem_rwsem);
}

static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
					       struct mm_struct *mm)
{
	struct ib_ucontext_per_mm *per_mm;
	int ret;

	per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
	if (!per_mm)
		return ERR_PTR(-ENOMEM);

	per_mm->context = ctx;
	per_mm->mm = mm;
	per_mm->umem_tree = RB_ROOT_CACHED;
	init_rwsem(&per_mm->umem_rwsem);
	per_mm->active = true;

	rcu_read_lock();
	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	rcu_read_unlock();

	WARN_ON(mm != current->mm);

	per_mm->mn.ops = &ib_umem_notifiers;
	ret = mmu_notifier_register(&per_mm->mn, per_mm->mm);
	if (ret) {
		dev_err(&ctx->device->dev,
			"Failed to register mmu_notifier %d\n", ret);
		goto out_pid;
	}

	list_add(&per_mm->ucontext_list, &ctx->per_mm_list);
	return per_mm;

out_pid:
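	/* Error unwind: drop the tgid reference taken above. */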
	put_pid(per_mm->tgid);
	kfree(per_mm);
	return ERR_PTR(ret);
}

static struct ib_ucontext_per_mm *get_per_mm(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext *ctx = umem_odp->umem.context;
	struct ib_ucontext_per_mm *per_mm;

	lockdep_assert_held(&ctx->per_mm_list_lock);

	/*
	 * Generally speaking we expect only one or two per_mm in this list,
	 * so no reason to optimize this search today.
	 */
	list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
		if (per_mm->mm == umem_odp->umem.owning_mm)
			return per_mm;
	}

	return alloc_per_mm(ctx, umem_odp->umem.owning_mm);
}

static void free_per_mm(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
}

static void put_per_mm(struct ib_umem_odp *umem_odp)
{
	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
	struct ib_ucontext *ctx = umem_odp->umem.context;
	bool need_free;

	mutex_lock(&ctx->per_mm_list_lock);
	umem_odp->per_mm = NULL;
	per_mm->odp_mrs_count--;
	need_free = per_mm->odp_mrs_count == 0;
	if (need_free)
		list_del(&per_mm->ucontext_list);
	mutex_unlock(&ctx->per_mm_list_lock);

	if (!need_free)
		return;

	/*
	 * NOTE! mmu_notifier_unregister() can happen between a start/end
	 * callback, resulting in a start without a matching end, and thus an
	 * unbalanced lock. This doesn't really matter to us since we are
	 * about to kfree the memory that holds the lock, however LOCKDEP
	 * doesn't like this.
	 */
	down_write(&per_mm->umem_rwsem);
	per_mm->active = false;
	up_write(&per_mm->umem_rwsem);

	WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
	mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
	put_pid(per_mm->tgid);
	mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
}

static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
				   struct ib_ucontext_per_mm *per_mm)
{
	struct ib_ucontext *ctx = umem_odp->umem.context;
	int ret;

	umem_odp->umem.is_odp = 1;
	if (!umem_odp->is_implicit_odp) {
		size_t pages = ib_umem_odp_num_pages(umem_odp);

		if (!pages)
			return -EINVAL;

		/*
		 * Note that the representation of the intervals in the
		 * interval tree considers the ending point as contained in
		 * the interval, while the function ib_umem_end returns the
		 * first address which is not contained in the umem.
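		 * This is why one is subtracted from ib_umem_end() when
		 * setting interval_tree.last below.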
		 */
		umem_odp->interval_tree.start = ib_umem_start(umem_odp);
		umem_odp->interval_tree.last = ib_umem_end(umem_odp) - 1;

		umem_odp->page_list = vzalloc(
			array_size(sizeof(*umem_odp->page_list), pages));
		if (!umem_odp->page_list)
			return -ENOMEM;

		umem_odp->dma_list =
			vzalloc(array_size(sizeof(*umem_odp->dma_list), pages));
		if (!umem_odp->dma_list) {
			ret = -ENOMEM;
			goto out_page_list;
		}
	}

	mutex_lock(&ctx->per_mm_list_lock);
	if (!per_mm) {
		per_mm = get_per_mm(umem_odp);
		if (IS_ERR(per_mm)) {
			ret = PTR_ERR(per_mm);
			goto out_unlock;
		}
	}
	umem_odp->per_mm = per_mm;
	per_mm->odp_mrs_count++;
	mutex_unlock(&ctx->per_mm_list_lock);

	mutex_init(&umem_odp->umem_mutex);
	init_completion(&umem_odp->notifier_completion);

	if (!umem_odp->is_implicit_odp) {
		down_write(&per_mm->umem_rwsem);
		interval_tree_insert(&umem_odp->interval_tree,
				     &per_mm->umem_tree);
		up_write(&per_mm->umem_rwsem);
	}

	return 0;

out_unlock:
	mutex_unlock(&ctx->per_mm_list_lock);
	vfree(umem_odp->dma_list);
out_page_list:
	vfree(umem_odp->page_list);
	return ret;
}

struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
				      unsigned long addr, size_t size)
{
	/*
	 * Caller must ensure that root cannot be freed during the call to
	 * ib_alloc_odp_umem.
	 */
	struct ib_umem_odp *odp_data;
	struct ib_umem *umem;
	int ret;

	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
		return ERR_PTR(-ENOMEM);
	umem = &odp_data->umem;
	umem->context = root->umem.context;
	umem->length = size;
	umem->address = addr;
	umem->writable = root->umem.writable;
	umem->owning_mm = root->umem.owning_mm;
	odp_data->page_shift = PAGE_SHIFT;

	ret = ib_init_umem_odp(odp_data, root->per_mm);
	if (ret) {
		kfree(odp_data);
		return ERR_PTR(ret);
	}

	mmgrab(umem->owning_mm);

	return odp_data;
}
EXPORT_SYMBOL(ib_alloc_odp_umem);

int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
{
	/*
	 * NOTE: This must be called in a process context where
	 * umem->owning_mm == current->mm
	 */
	struct mm_struct *mm = umem_odp->umem.owning_mm;

	if (umem_odp->umem.address == 0 && umem_odp->umem.length == 0)
		umem_odp->is_implicit_odp = 1;

	umem_odp->page_shift = PAGE_SHIFT;
	if (access & IB_ACCESS_HUGETLB) {
		struct vm_area_struct *vma;
		struct hstate *h;

		down_read(&mm->mmap_sem);
		vma = find_vma(mm, ib_umem_start(umem_odp));
		if (!vma || !is_vm_hugetlb_page(vma)) {
			up_read(&mm->mmap_sem);
			return -EINVAL;
		}
		h = hstate_vma(vma);
		umem_odp->page_shift = huge_page_shift(h);
		up_read(&mm->mmap_sem);
	}

	return ib_init_umem_odp(umem_odp, NULL);
}

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	if (!umem_odp->is_implicit_odp) {
		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
					    ib_umem_end(umem_odp));
		remove_umem_from_per_mm(umem_odp);
		vfree(umem_odp->dma_list);
		vfree(umem_odp->page_list);
	}
	put_per_mm(umem_odp);
}

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
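 * Called with umem_odp->umem_mutex held (taken by ib_umem_odp_map_dma_pages()
 * before invoking this helper), so the notifier seq/count check below is
 * serialized against ib_umem_notifier_start/end_account().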
 *
 * @umem: the umem to insert the page to.
 * @page_index: index in the umem to add the page to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               the sequence number is taken from
 *               umem_odp->notifiers_seq.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_user_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem_odp *umem_odp,
		int page_index,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
{
	struct ib_ucontext *context = umem_odp->umem.context;
	struct ib_device *dev = context->device;
	dma_addr_t dma_addr;
	int remove_existing_mapping = 0;
	int ret = 0;

	/*
	 * Note: we avoid writing if seq is different from the initial seq, to
	 * handle case of a racing notifier. This check also allows us to bail
	 * early if we have a notifier running in parallel with us.
	 */
	if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
	if (!(umem_odp->dma_list[page_index])) {
		dma_addr =
			ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
					DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(dev, dma_addr)) {
			ret = -EFAULT;
			goto out;
		}
		umem_odp->dma_list[page_index] = dma_addr | access_mask;
		umem_odp->page_list[page_index] = page;
		umem_odp->npages++;
	} else if (umem_odp->page_list[page_index] == page) {
		umem_odp->dma_list[page_index] |= access_mask;
	} else {
		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
		       umem_odp->page_list[page_index], page);
		/* Better remove the mapping now, to prevent any further
		 * damage. */
		remove_existing_mapping = 1;
	}

out:
	put_user_page(page);

	if (remove_existing_mapping) {
		ib_umem_notifier_start_account(umem_odp);
		context->invalidate_range(
			umem_odp,
			ib_umem_start(umem_odp) +
				(page_index << umem_odp->page_shift),
			ib_umem_start(umem_odp) +
				((page_index + 1) << umem_odp->page_shift));
		ib_umem_notifier_end_account(umem_odp);
		ret = -EAGAIN;
	}

	return ret;
}

/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem_odp->dma_list.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 * An -ENOENT error code indicates that the userspace process is being
 * terminated and the mm was already destroyed.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an
 *        error pinning or mapping a page. The actual number of pages mapped
 *        is returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifier sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem_odp->notifiers_seq before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
			      u64 bcnt, u64 access_mask,
			      unsigned long current_seq)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	struct page **local_page_list = NULL;
	u64 page_mask, off;
	int j, k, ret = 0, start_idx, npages = 0;
	unsigned int flags = 0, page_shift;
	phys_addr_t p = 0;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;

	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	page_shift = umem_odp->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	off = user_virt & (~page_mask);
	user_virt = user_virt & page_mask;
	bcnt += off; /* Charge for the first page offset as well. */

	/*
	 * owning_process is allowed to be NULL; this means the mm somehow
	 * still exists beyond the lifetime of the originating process.
	 * Presumably mmget_not_zero will fail in this case.
	 */
	owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
	}

	if (access_mask & ODP_WRITE_ALLOWED_BIT)
		flags |= FOLL_WRITE;

	start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
	k = start_idx;

	while (bcnt > 0) {
		const size_t gup_num_pages = min_t(size_t,
				(bcnt + BIT(page_shift) - 1) >> page_shift,
				PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list for 0 before calling
		 * get_user_pages. However, this makes the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
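		 *
		 * get_user_pages_remote() is used with the owning task and mm
		 * rather than get_user_pages() because a page fault may be
		 * served from a context whose current->mm is not owning_mm.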
		 */
		npages = get_user_pages_remote(owning_process, owning_mm,
				user_virt, gup_num_pages,
				flags, local_page_list, NULL, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0) {
			if (npages != -EAGAIN)
				pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
			else
				pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
			break;
		}

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		mutex_lock(&umem_odp->umem_mutex);
		for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
			if (user_virt & ~page_mask) {
				p += PAGE_SIZE;
				if (page_to_phys(local_page_list[j]) != p) {
					ret = -EFAULT;
					break;
				}
				put_user_page(local_page_list[j]);
				continue;
			}

			ret = ib_umem_odp_map_dma_single_page(
					umem_odp, k, local_page_list[j],
					access_mask, current_seq);
			if (ret < 0) {
				if (ret != -EAGAIN)
					pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
				else
					pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
				break;
			}

			p = page_to_phys(local_page_list[j]);
			k++;
		}
		mutex_unlock(&umem_odp->umem_mutex);

		if (ret < 0) {
			/*
			 * Release pages, remembering that the first page
			 * to hit an error was already released by
			 * ib_umem_odp_map_dma_single_page().
			 */
			if (npages - (j + 1) > 0)
				put_user_pages(&local_page_list[j + 1],
					       npages - (j + 1));
			break;
		}
	}

	if (ret >= 0) {
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	free_page((unsigned long)local_page_list);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);

void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	int idx;
	u64 addr;
	struct ib_device *dev = umem_odp->umem.context->device;

	virt = max_t(u64, virt, ib_umem_start(umem_odp));
	bound = min_t(u64, bound, ib_umem_end(umem_odp));
	/* Note that during the run of this function, the
	 * notifiers_count of the MR is > 0, preventing any racing
	 * faults from completing. We might be racing with other
	 * invalidations, so we must make sure we free each page only
	 * once. */
	mutex_lock(&umem_odp->umem_mutex);
	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		if (umem_odp->page_list[idx]) {
			struct page *page = umem_odp->page_list[idx];
			dma_addr_t dma = umem_odp->dma_list[idx];
			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);

			ib_dma_unmap_page(dev, dma_addr,
					  BIT(umem_odp->page_shift),
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
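				 *
				 * The dirty bit is set on the compound head
				 * page, which is where it is tracked for
				 * huge pages.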
				 */
				set_page_dirty(head_page);
			}
			umem_odp->page_list[idx] = NULL;
			umem_odp->dma_list[idx] = 0;
			umem_odp->npages--;
		}
	}
	mutex_unlock(&umem_odp->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

/* @last is not a part of the interval. See comment for function
 * node_last.
 */
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
				  u64 start, u64 last,
				  umem_call_back cb,
				  bool blockable,
				  void *cookie)
{
	int ret_val = 0;
	struct interval_tree_node *node, *next;
	struct ib_umem_odp *umem;

	if (unlikely(start == last))
		return ret_val;

	for (node = interval_tree_iter_first(root, start, last - 1);
			node; node = next) {
		/* TODO move the blockable decision up to the callback */
		if (!blockable)
			return -EAGAIN;
		next = interval_tree_iter_next(node, start, last - 1);
		umem = container_of(node, struct ib_umem_odp, interval_tree);
		ret_val = cb(umem, start, last, cookie) || ret_val;
	}

	return ret_val;
}
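
/*
 * Usage sketch (illustrative only, not part of this file's API): a driver
 * fault handler typically samples umem_odp->notifiers_seq, calls
 * ib_umem_odp_map_dma_pages() with that value, and retries when -EAGAIN
 * indicates that a concurrent invalidation ran in the meantime:
 *
 *	unsigned long seq = READ_ONCE(umem_odp->notifiers_seq);
 *	int npages;
 *
 *	do {
 *		npages = ib_umem_odp_map_dma_pages(umem_odp, user_virt, bcnt,
 *						   access_mask, seq);
 *		if (npages == -EAGAIN)
 *			seq = READ_ONCE(umem_odp->notifiers_seq);
 *	} while (npages == -EAGAIN);
 */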