// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
 * PFNs can be placed into an iommu_domain, or returned to the caller as a page
 * list for access by an in-kernel user.
 *
 * The datastructure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and xarray.
 */
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>

#include "io_pagetable.h"
#include "double_span.h"

/*
 * One element of a caller-supplied mapping list: a slice of an iopt_pages
 * starting at start_byte for length bytes, paired with the iopt_area that
 * will hold it once inserted.
 */
struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

/*
 * Begin iterating over the areas that contiguously cover [iova, last_iova].
 * Returns NULL if no fully-initialized area covers the starting iova.
 */
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	/* area->pages == NULL means the area is not fully initialized yet */
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

/*
 * Advance to the next area. Returns NULL either when the range is fully
 * covered (iterator is "done") or when a gap/uninitialized area breaks
 * contiguity; iopt_area_contig_done() distinguishes the two.
 */
struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	/* A gap before the next area, or a half-initialized area, ends it */
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

/*
 * Check whether a free span can hold an aligned allocation of length bytes.
 * On success the span's start is advanced to the aligned position carrying
 * the required sub-page offset.
 */
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	/* ALIGN may have pushed the start past the end or shrunk the hole */
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

/*
 * Same check as __alloc_iova_check_hole() but for a span of allowed
 * (used) IOVA from the allowed_itree.
 */
static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in the uptr when building the IOVA, this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		/* An empty allowed_itree means the whole range is allowed */
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		/* Within the allowed span, find a hole in both the reserved
		 * ranges and the existing areas.
		 */
		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

/*
 * Validate a caller-chosen IOVA range: properly aligned, non-overflowing,
 * not intersecting any reserved range, and not already mapped.
 */
static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

/* Allocate a zeroed area with both interval tree nodes marked empty */
static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

/*
 * Allocate (or validate) the IOVA span and insert a placeholder area for
 * every element of pages_list. On success *dst_iova is the start of the
 * whole mapping. The areas are left with NULL pages; the caller completes
 * them under the domains_rwsem.
 */
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

/* Undo iopt_insert_area()/iopt_area_alloc() for a never-completed area */
static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

/* Free every element of a pages_list, aborting any not-yet-completed areas */
void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

/*
 * Fill every area in the list into all attached domains; on failure
 * unfill the elements that had already succeeded so nothing is left
 * half-done.
 */
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

/*
 * Map a list of iopt_pages slices into the io_pagetable as one contiguous
 * IOVA range of the given length. On success ownership of the list
 * elements' pages/area references moves into the areas.
 */
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

/*
 * Build a pages_list describing the iopt_pages slices that back the IOVA
 * range [iova, iova + length - 1]. Each element holds a kref on its pages.
 * Fails with -ENOENT if the range is not fully covered by contiguous areas.
 */
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

/*
 * Unmap every area fully contained in [start, last]. Areas with active
 * accesses are notified to drop them and the walk is retried from the
 * start of that area.
 */
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* Splitting/truncating a mapping is not permitted */
		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		/* Reset the retry counter when we make forward progress */
		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

/* Unmap every area in the io_pagetable */
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	/* The new allowed ranges must not intersect any reserved range */
	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			/* Roll back to the original tree on failure */
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

/*
 * Reserve [start, last] so it can never be allocated or mapped. owner is
 * an opaque tag used later to remove only this caller's reservations.
 */
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

/* Remove and free every reserved range tagged with owner */
static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		/* Advance before a possible erase of the current node */
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

/* Locked wrapper for __iopt_remove_reserved_iova() */
void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

/* Initialize a freshly allocated io_pagetable to its empty state */
void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopt's start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

/*
 * Tear down an io_pagetable. All areas, domains and accesses must already
 * be gone; only allowed ranges (and, under test, leftover reservations)
 * are freed here.
 */
void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from the
 * domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	/* This was the last domain; fully unfill and unpin every area */
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		/* The first domain becomes the storage domain for the area */
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	/* Undo only the areas filled before the failure point */
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

/*
 * Attach a new iommu_domain to the io_pagetable: check alignment
 * compatibility, reserve the IOVA outside the domain's aperture, and fill
 * the domain with all existing mappings.
 */
int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * A iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

/*
 * Recompute iova_alignment as the maximum required by all attached
 * domains and registered accesses. Raising the alignment is only allowed
 * if all existing areas already conform to it.
 */
static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

/* Detach a domain from the io_pagetable and unfill all its mappings */
void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	/* Splitting at the edge is a no-op */
	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists, we don't track enough
	 * information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	/* lhs keeps the original area's pages reference; rhs needs its own */
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages haven't been
	 * changed
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

/* Split any areas that contain one of the given iovas (VFIO compat) */
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

/* Re-enable huge-page mapping and recompute the resulting alignment */
void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Force PAGE_SIZE granular mappings (VFIO compat); see iopt_area_split() */
int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

/* Register an in-kernel access and fold its alignment into the iopt */
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
		      xa_limit_16b, GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, access->iopt_access_list_id);
		goto out_unlock;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

/* Unregister an access; relaxing the alignment can never fail */
void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}