// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>

#include "io_pagetable.h"
#include "double_span.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
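
/*
 * Helpers for iopt_alloc_iova(): check whether the free span described by the
 * iterator can hold an allocation of @length at the requested alignment while
 * preserving the sub-page offset of the user pointer. On success the span
 * start is advanced to a suitably aligned candidate IOVA.
 */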
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep the alignment present in the uptr when building the IOVA; this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages pointer, indicating it is not
	 * fully initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}
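
/*
 * Reserve IOVA space and insert an area for every entry on the pages list.
 * The areas are inserted with a NULL pages pointer; the caller completes the
 * initialization after the domains have been filled.
 */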
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}
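
/*
 * Map every entry on @pages_list into the io_pagetable, starting at *dst_iova
 * or at an automatically chosen IOVA when IOPT_ALLOC_IOVA is set. On success
 * the new areas take over the page references and the list entries are
 * emptied; on failure the caller still owns the list and must free it with
 * iopt_free_pages_list().
 */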
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}
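
/*
 * Unmap every area that falls entirely within [start, last]. Areas with an
 * active access are asked to stop via iommufd_access_notify_unmap() and the
 * walk is restarted, since the notification cannot be delivered while holding
 * the locks.
 */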
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}
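
/*
 * Reserve [start, last] so it can never be allocated or mapped. @owner is an
 * opaque cookie used by __iopt_remove_reserved_iova() to drop the reservation
 * again. Fails if the range overlaps an existing area or allowed range.
 */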
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}
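
/*
 * Free the allowed ranges and verify that all domains, accesses, areas and
 * reservations have already been removed before the table goes away.
 */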
void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is still holding all the pfns, so rapidly unmap
	 * this domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to the increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}
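
/*
 * Attach @domain to the io_pagetable: the IOVA outside the domain's aperture
 * is reserved, iova_alignment is raised to the domain's minimum page size if
 * needed, and every existing area is mapped into the new domain.
 */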
int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
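
/*
 * Recompute iova_alignment from the attached domains and accesses. Raising the
 * alignment is only permitted when every existing area already conforms to it.
 */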
static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last IOVA of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages haven't been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}
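
/*
 * Split the containing area at each IOVA in @iovas so the IOVA becomes the
 * last address of an area. This lets the VFIO compatibility layer later unmap
 * exactly up to that boundary.
 */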
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
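
/*
 * Register an in-kernel access with the io_pagetable and raise iova_alignment
 * to honour the access's alignment requirement.
 */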
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the usable IOVA space by reserving a device's reserved regions. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}