// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dma-iommu.h>
#include <linux/irqdomain.h>
#include "vfio.h"

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

static unsigned int dma_entry_limit __read_mostly = U16_MAX;
module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
MODULE_PARM_DESC(dma_entry_limit,
		 "Maximum number of user DMA mappings per container (65535).");

struct vfio_iommu {
	struct list_head	domain_list;
	struct list_head	iova_list;
	struct mutex		lock;
	struct rb_root		dma_list;
	struct list_head	device_list;
	struct mutex		device_list_lock;
	unsigned int		dma_avail;
	unsigned int		vaddr_invalid_count;
	uint64_t		pgsize_bitmap;
	uint64_t		num_non_pinned_groups;
	wait_queue_head_t	vaddr_wait;
	bool			v2;
	bool			nesting;
	bool			dirty_page_tracking;
	bool			container_open;
	struct list_head	emulated_iommu_groups;
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	bool			fgsp : 1;	/* Fine-grained super pages */
	bool			enforce_cache_coherency : 1;
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
	bool			iommu_mapped;
	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
	bool			vaddr_invalid;
	struct
task_struct *task; 102 struct rb_root pfn_list; /* Ex-user pinned pfn list */ 103 unsigned long *bitmap; 104 }; 105 106 struct vfio_batch { 107 struct page **pages; /* for pin_user_pages_remote */ 108 struct page *fallback_page; /* if pages alloc fails */ 109 int capacity; /* length of pages array */ 110 int size; /* of batch currently */ 111 int offset; /* of next entry in pages */ 112 }; 113 114 struct vfio_iommu_group { 115 struct iommu_group *iommu_group; 116 struct list_head next; 117 bool pinned_page_dirty_scope; 118 }; 119 120 struct vfio_iova { 121 struct list_head list; 122 dma_addr_t start; 123 dma_addr_t end; 124 }; 125 126 /* 127 * Guest RAM pinning working set or DMA target 128 */ 129 struct vfio_pfn { 130 struct rb_node node; 131 dma_addr_t iova; /* Device address */ 132 unsigned long pfn; /* Host pfn */ 133 unsigned int ref_count; 134 }; 135 136 struct vfio_regions { 137 struct list_head list; 138 dma_addr_t iova; 139 phys_addr_t phys; 140 size_t len; 141 }; 142 143 #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) 144 145 /* 146 * Input argument of number of bits to bitmap_set() is unsigned integer, which 147 * further casts to signed integer for unaligned multi-bit operation, 148 * __bitmap_set(). 149 * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte, 150 * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page 151 * system. 152 */ 153 #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) 154 #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) 155 156 #define WAITED 1 157 158 static int put_pfn(unsigned long pfn, int prot); 159 160 static struct vfio_iommu_group* 161 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, 162 struct iommu_group *iommu_group); 163 164 /* 165 * This code handles mapping and unmapping of user data buffers 166 * into DMA'ble space using the IOMMU 167 */ 168 169 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, 170 dma_addr_t start, size_t size) 171 { 172 struct rb_node *node = iommu->dma_list.rb_node; 173 174 while (node) { 175 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 176 177 if (start + size <= dma->iova) 178 node = node->rb_left; 179 else if (start >= dma->iova + dma->size) 180 node = node->rb_right; 181 else 182 return dma; 183 } 184 185 return NULL; 186 } 187 188 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu, 189 dma_addr_t start, u64 size) 190 { 191 struct rb_node *res = NULL; 192 struct rb_node *node = iommu->dma_list.rb_node; 193 struct vfio_dma *dma_res = NULL; 194 195 while (node) { 196 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 197 198 if (start < dma->iova + dma->size) { 199 res = node; 200 dma_res = dma; 201 if (start >= dma->iova) 202 break; 203 node = node->rb_left; 204 } else { 205 node = node->rb_right; 206 } 207 } 208 if (res && size && dma_res->iova >= start + size) 209 res = NULL; 210 return res; 211 } 212 213 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new) 214 { 215 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; 216 struct vfio_dma *dma; 217 218 while (*link) { 219 parent = *link; 220 dma = rb_entry(parent, struct vfio_dma, node); 221 222 if (new->iova + new->size <= dma->iova) 223 link = &(*link)->rb_left; 224 else 225 link = &(*link)->rb_right; 226 } 227 228 rb_link_node(&new->node, parent, link); 229 rb_insert_color(&new->node, &iommu->dma_list); 230 } 231 232 static void vfio_unlink_dma(struct vfio_iommu *iommu, 
			    struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}


static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
{
	uint64_t npages = dma->size / pgsize;

	if (npages > DIRTY_BITMAP_PAGES_MAX)
		return -EINVAL;

	/*
	 * Allocate extra 64 bits that are used to calculate shift required for
	 * bitmap_shift_left() to manipulate and club unaligned number of pages
	 * in adjacent vfio_dma ranges.
	 */
	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
			       GFP_KERNEL);
	if (!dma->bitmap)
		return -ENOMEM;

	return 0;
}

static void vfio_dma_bitmap_free(struct vfio_dma *dma)
{
	kvfree(dma->bitmap);
	dma->bitmap = NULL;
}

static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
{
	struct rb_node *p;
	unsigned long pgshift = __ffs(pgsize);

	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);

		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
	}
}

static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
{
	struct rb_node *n;
	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
	}
}

static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
		int ret;

		ret = vfio_dma_bitmap_alloc(dma, pgsize);
		if (ret) {
			struct rb_node *p;

			for (p = rb_prev(n); p; p = rb_prev(p)) {
				struct vfio_dma *dma = rb_entry(p,
							struct vfio_dma, node);

				vfio_dma_bitmap_free(dma);
			}
			return ret;
		}
		vfio_dma_populate_bitmap(dma, pgsize);
	}
	return 0;
}

static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		vfio_dma_bitmap_free(dma);
	}
}

/*
 * Helper Functions for host iova-pfn list
 */
static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
{
	struct vfio_pfn *vpfn;
	struct rb_node *node = dma->pfn_list.rb_node;

	while (node) {
		vpfn = rb_entry(node, struct vfio_pfn, node);

		if (iova < vpfn->iova)
			node = node->rb_left;
		else if (iova > vpfn->iova)
			node = node->rb_right;
		else
			return vpfn;
	}
	return NULL;
}

static void vfio_link_pfn(struct vfio_dma *dma,
			  struct vfio_pfn *new)
{
	struct rb_node **link, *parent = NULL;
	struct vfio_pfn *vpfn;

	link = &dma->pfn_list.rb_node;
	while (*link) {
		parent = *link;
		vpfn = rb_entry(parent, struct vfio_pfn, node);

		if (new->iova < vpfn->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &dma->pfn_list);
}

static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
{
	rb_erase(&old->node, &dma->pfn_list);
}

static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
				unsigned
long pfn) 373 { 374 struct vfio_pfn *vpfn; 375 376 vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL); 377 if (!vpfn) 378 return -ENOMEM; 379 380 vpfn->iova = iova; 381 vpfn->pfn = pfn; 382 vpfn->ref_count = 1; 383 vfio_link_pfn(dma, vpfn); 384 return 0; 385 } 386 387 static void vfio_remove_from_pfn_list(struct vfio_dma *dma, 388 struct vfio_pfn *vpfn) 389 { 390 vfio_unlink_pfn(dma, vpfn); 391 kfree(vpfn); 392 } 393 394 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, 395 unsigned long iova) 396 { 397 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); 398 399 if (vpfn) 400 vpfn->ref_count++; 401 return vpfn; 402 } 403 404 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) 405 { 406 int ret = 0; 407 408 vpfn->ref_count--; 409 if (!vpfn->ref_count) { 410 ret = put_pfn(vpfn->pfn, dma->prot); 411 vfio_remove_from_pfn_list(dma, vpfn); 412 } 413 return ret; 414 } 415 416 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) 417 { 418 struct mm_struct *mm; 419 int ret; 420 421 if (!npage) 422 return 0; 423 424 mm = async ? get_task_mm(dma->task) : dma->task->mm; 425 if (!mm) 426 return -ESRCH; /* process exited */ 427 428 ret = mmap_write_lock_killable(mm); 429 if (!ret) { 430 ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task, 431 dma->lock_cap); 432 mmap_write_unlock(mm); 433 } 434 435 if (async) 436 mmput(mm); 437 438 return ret; 439 } 440 441 /* 442 * Some mappings aren't backed by a struct page, for example an mmap'd 443 * MMIO range for our own or another device. These use a different 444 * pfn conversion and shouldn't be tracked as locked pages. 445 * For compound pages, any driver that sets the reserved bit in head 446 * page needs to set the reserved bit in all subpages to be safe. 447 */ 448 static bool is_invalid_reserved_pfn(unsigned long pfn) 449 { 450 if (pfn_valid(pfn)) 451 return PageReserved(pfn_to_page(pfn)); 452 453 return true; 454 } 455 456 static int put_pfn(unsigned long pfn, int prot) 457 { 458 if (!is_invalid_reserved_pfn(pfn)) { 459 struct page *page = pfn_to_page(pfn); 460 461 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE); 462 return 1; 463 } 464 return 0; 465 } 466 467 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) 468 469 static void vfio_batch_init(struct vfio_batch *batch) 470 { 471 batch->size = 0; 472 batch->offset = 0; 473 474 if (unlikely(disable_hugepages)) 475 goto fallback; 476 477 batch->pages = (struct page **) __get_free_page(GFP_KERNEL); 478 if (!batch->pages) 479 goto fallback; 480 481 batch->capacity = VFIO_BATCH_MAX_CAPACITY; 482 return; 483 484 fallback: 485 batch->pages = &batch->fallback_page; 486 batch->capacity = 1; 487 } 488 489 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) 490 { 491 while (batch->size) { 492 unsigned long pfn = page_to_pfn(batch->pages[batch->offset]); 493 494 put_pfn(pfn, dma->prot); 495 batch->offset++; 496 batch->size--; 497 } 498 } 499 500 static void vfio_batch_fini(struct vfio_batch *batch) 501 { 502 if (batch->capacity == VFIO_BATCH_MAX_CAPACITY) 503 free_page((unsigned long)batch->pages); 504 } 505 506 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, 507 unsigned long vaddr, unsigned long *pfn, 508 bool write_fault) 509 { 510 pte_t *ptep; 511 spinlock_t *ptl; 512 int ret; 513 514 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); 515 if (ret) { 516 bool unlocked = false; 517 518 ret = fixup_user_fault(mm, vaddr, 519 FAULT_FLAG_REMOTE | 520 (write_fault ? 
FAULT_FLAG_WRITE : 0), 521 &unlocked); 522 if (unlocked) 523 return -EAGAIN; 524 525 if (ret) 526 return ret; 527 528 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); 529 if (ret) 530 return ret; 531 } 532 533 if (write_fault && !pte_write(*ptep)) 534 ret = -EFAULT; 535 else 536 *pfn = pte_pfn(*ptep); 537 538 pte_unmap_unlock(ptep, ptl); 539 return ret; 540 } 541 542 /* 543 * Returns the positive number of pfns successfully obtained or a negative 544 * error code. 545 */ 546 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, 547 long npages, int prot, unsigned long *pfn, 548 struct page **pages) 549 { 550 struct vm_area_struct *vma; 551 unsigned int flags = 0; 552 int ret; 553 554 if (prot & IOMMU_WRITE) 555 flags |= FOLL_WRITE; 556 557 mmap_read_lock(mm); 558 ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, 559 pages, NULL, NULL); 560 if (ret > 0) { 561 int i; 562 563 /* 564 * The zero page is always resident, we don't need to pin it 565 * and it falls into our invalid/reserved test so we don't 566 * unpin in put_pfn(). Unpin all zero pages in the batch here. 567 */ 568 for (i = 0 ; i < ret; i++) { 569 if (unlikely(is_zero_pfn(page_to_pfn(pages[i])))) 570 unpin_user_page(pages[i]); 571 } 572 573 *pfn = page_to_pfn(pages[0]); 574 goto done; 575 } 576 577 vaddr = untagged_addr(vaddr); 578 579 retry: 580 vma = vma_lookup(mm, vaddr); 581 582 if (vma && vma->vm_flags & VM_PFNMAP) { 583 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); 584 if (ret == -EAGAIN) 585 goto retry; 586 587 if (!ret) { 588 if (is_invalid_reserved_pfn(*pfn)) 589 ret = 1; 590 else 591 ret = -EFAULT; 592 } 593 } 594 done: 595 mmap_read_unlock(mm); 596 return ret; 597 } 598 599 static int vfio_wait(struct vfio_iommu *iommu) 600 { 601 DEFINE_WAIT(wait); 602 603 prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); 604 mutex_unlock(&iommu->lock); 605 schedule(); 606 mutex_lock(&iommu->lock); 607 finish_wait(&iommu->vaddr_wait, &wait); 608 if (kthread_should_stop() || !iommu->container_open || 609 fatal_signal_pending(current)) { 610 return -EFAULT; 611 } 612 return WAITED; 613 } 614 615 /* 616 * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped 617 * if the task waits, but is re-locked on return. Return result in *dma_p. 618 * Return 0 on success with no waiting, WAITED on success if waited, and -errno 619 * on error. 620 */ 621 static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start, 622 size_t size, struct vfio_dma **dma_p) 623 { 624 int ret = 0; 625 626 do { 627 *dma_p = vfio_find_dma(iommu, start, size); 628 if (!*dma_p) 629 return -EINVAL; 630 else if (!(*dma_p)->vaddr_invalid) 631 return ret; 632 else 633 ret = vfio_wait(iommu); 634 } while (ret == WAITED); 635 636 return ret; 637 } 638 639 /* 640 * Wait for all vaddr in the dma_list to become valid. iommu lock is dropped 641 * if the task waits, but is re-locked on return. Return 0 on success with no 642 * waiting, WAITED on success if waited, and -errno on error. 643 */ 644 static int vfio_wait_all_valid(struct vfio_iommu *iommu) 645 { 646 int ret = 0; 647 648 while (iommu->vaddr_invalid_count && ret >= 0) 649 ret = vfio_wait(iommu); 650 651 return ret; 652 } 653 654 /* 655 * Attempt to pin pages. We really don't want to track all the pfns and 656 * the iommu can only map chunks of consecutive pfns anyway, so get the 657 * first page and all consecutive pages with the same locking. 
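 *
 * Illustrative numbers only: on a 64-bit kernel with 4K pages the batch
 * below holds PAGE_SIZE / sizeof(struct page *) = 512 entries, so a
 * vaddr range backed by a 2MB THP is typically satisfied by a single
 * pin_user_pages_remote() refill and returned to the caller as one
 * physically contiguous 512-page chunk, suitable for a single
 * iommu_map() call.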
658 */ 659 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, 660 long npage, unsigned long *pfn_base, 661 unsigned long limit, struct vfio_batch *batch) 662 { 663 unsigned long pfn; 664 struct mm_struct *mm = current->mm; 665 long ret, pinned = 0, lock_acct = 0; 666 bool rsvd; 667 dma_addr_t iova = vaddr - dma->vaddr + dma->iova; 668 669 /* This code path is only user initiated */ 670 if (!mm) 671 return -ENODEV; 672 673 if (batch->size) { 674 /* Leftover pages in batch from an earlier call. */ 675 *pfn_base = page_to_pfn(batch->pages[batch->offset]); 676 pfn = *pfn_base; 677 rsvd = is_invalid_reserved_pfn(*pfn_base); 678 } else { 679 *pfn_base = 0; 680 } 681 682 while (npage) { 683 if (!batch->size) { 684 /* Empty batch, so refill it. */ 685 long req_pages = min_t(long, npage, batch->capacity); 686 687 ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, 688 &pfn, batch->pages); 689 if (ret < 0) 690 goto unpin_out; 691 692 batch->size = ret; 693 batch->offset = 0; 694 695 if (!*pfn_base) { 696 *pfn_base = pfn; 697 rsvd = is_invalid_reserved_pfn(*pfn_base); 698 } 699 } 700 701 /* 702 * pfn is preset for the first iteration of this inner loop and 703 * updated at the end to handle a VM_PFNMAP pfn. In that case, 704 * batch->pages isn't valid (there's no struct page), so allow 705 * batch->pages to be touched only when there's more than one 706 * pfn to check, which guarantees the pfns are from a 707 * !VM_PFNMAP vma. 708 */ 709 while (true) { 710 if (pfn != *pfn_base + pinned || 711 rsvd != is_invalid_reserved_pfn(pfn)) 712 goto out; 713 714 /* 715 * Reserved pages aren't counted against the user, 716 * externally pinned pages are already counted against 717 * the user. 718 */ 719 if (!rsvd && !vfio_find_vpfn(dma, iova)) { 720 if (!dma->lock_cap && 721 mm->locked_vm + lock_acct + 1 > limit) { 722 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", 723 __func__, limit << PAGE_SHIFT); 724 ret = -ENOMEM; 725 goto unpin_out; 726 } 727 lock_acct++; 728 } 729 730 pinned++; 731 npage--; 732 vaddr += PAGE_SIZE; 733 iova += PAGE_SIZE; 734 batch->offset++; 735 batch->size--; 736 737 if (!batch->size) 738 break; 739 740 pfn = page_to_pfn(batch->pages[batch->offset]); 741 } 742 743 if (unlikely(disable_hugepages)) 744 break; 745 } 746 747 out: 748 ret = vfio_lock_acct(dma, lock_acct, false); 749 750 unpin_out: 751 if (batch->size == 1 && !batch->offset) { 752 /* May be a VM_PFNMAP pfn, which the batch can't remember. 
*/ 753 put_pfn(pfn, dma->prot); 754 batch->size = 0; 755 } 756 757 if (ret < 0) { 758 if (pinned && !rsvd) { 759 for (pfn = *pfn_base ; pinned ; pfn++, pinned--) 760 put_pfn(pfn, dma->prot); 761 } 762 vfio_batch_unpin(batch, dma); 763 764 return ret; 765 } 766 767 return pinned; 768 } 769 770 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, 771 unsigned long pfn, long npage, 772 bool do_accounting) 773 { 774 long unlocked = 0, locked = 0; 775 long i; 776 777 for (i = 0; i < npage; i++, iova += PAGE_SIZE) { 778 if (put_pfn(pfn++, dma->prot)) { 779 unlocked++; 780 if (vfio_find_vpfn(dma, iova)) 781 locked++; 782 } 783 } 784 785 if (do_accounting) 786 vfio_lock_acct(dma, locked - unlocked, true); 787 788 return unlocked; 789 } 790 791 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, 792 unsigned long *pfn_base, bool do_accounting) 793 { 794 struct page *pages[1]; 795 struct mm_struct *mm; 796 int ret; 797 798 mm = get_task_mm(dma->task); 799 if (!mm) 800 return -ENODEV; 801 802 ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); 803 if (ret != 1) 804 goto out; 805 806 ret = 0; 807 808 if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { 809 ret = vfio_lock_acct(dma, 1, true); 810 if (ret) { 811 put_pfn(*pfn_base, dma->prot); 812 if (ret == -ENOMEM) 813 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK " 814 "(%ld) exceeded\n", __func__, 815 dma->task->comm, task_pid_nr(dma->task), 816 task_rlimit(dma->task, RLIMIT_MEMLOCK)); 817 } 818 } 819 820 out: 821 mmput(mm); 822 return ret; 823 } 824 825 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, 826 bool do_accounting) 827 { 828 int unlocked; 829 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); 830 831 if (!vpfn) 832 return 0; 833 834 unlocked = vfio_iova_put_vfio_pfn(dma, vpfn); 835 836 if (do_accounting) 837 vfio_lock_acct(dma, -unlocked, true); 838 839 return unlocked; 840 } 841 842 static int vfio_iommu_type1_pin_pages(void *iommu_data, 843 struct iommu_group *iommu_group, 844 dma_addr_t user_iova, 845 int npage, int prot, 846 struct page **pages) 847 { 848 struct vfio_iommu *iommu = iommu_data; 849 struct vfio_iommu_group *group; 850 int i, j, ret; 851 unsigned long remote_vaddr; 852 struct vfio_dma *dma; 853 bool do_accounting; 854 dma_addr_t iova; 855 856 if (!iommu || !pages) 857 return -EINVAL; 858 859 /* Supported for v2 version only */ 860 if (!iommu->v2) 861 return -EACCES; 862 863 mutex_lock(&iommu->lock); 864 865 /* 866 * Wait for all necessary vaddr's to be valid so they can be used in 867 * the main loop without dropping the lock, to avoid racing vs unmap. 868 */ 869 again: 870 if (iommu->vaddr_invalid_count) { 871 for (i = 0; i < npage; i++) { 872 iova = user_iova + PAGE_SIZE * i; 873 ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma); 874 if (ret < 0) 875 goto pin_done; 876 if (ret == WAITED) 877 goto again; 878 } 879 } 880 881 /* Fail if no dma_umap notifier is registered */ 882 if (list_empty(&iommu->device_list)) { 883 ret = -EINVAL; 884 goto pin_done; 885 } 886 887 /* 888 * If iommu capable domain exist in the container then all pages are 889 * already pinned and accounted. Accounting should be done if there is no 890 * iommu capable domain in the container. 
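 *
 * This path backs the vfio_pin_pages() kAPI used by emulated (mdev)
 * drivers.  A minimal caller sketch, assuming the current vfio_pin_pages()
 * signature and a hypothetical mdev_state holding the vfio_device:
 *
 *	struct page *page;
 *	int ret = vfio_pin_pages(&mdev_state->vdev, iova, 1,
 *				 IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret == 1) {
 *		// ... access the guest page ...
 *		vfio_unpin_pages(&mdev_state->vdev, iova, 1);
 *	}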
891 */ 892 do_accounting = list_empty(&iommu->domain_list); 893 894 for (i = 0; i < npage; i++) { 895 unsigned long phys_pfn; 896 struct vfio_pfn *vpfn; 897 898 iova = user_iova + PAGE_SIZE * i; 899 dma = vfio_find_dma(iommu, iova, PAGE_SIZE); 900 if (!dma) { 901 ret = -EINVAL; 902 goto pin_unwind; 903 } 904 905 if ((dma->prot & prot) != prot) { 906 ret = -EPERM; 907 goto pin_unwind; 908 } 909 910 vpfn = vfio_iova_get_vfio_pfn(dma, iova); 911 if (vpfn) { 912 pages[i] = pfn_to_page(vpfn->pfn); 913 continue; 914 } 915 916 remote_vaddr = dma->vaddr + (iova - dma->iova); 917 ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn, 918 do_accounting); 919 if (ret) 920 goto pin_unwind; 921 922 ret = vfio_add_to_pfn_list(dma, iova, phys_pfn); 923 if (ret) { 924 if (put_pfn(phys_pfn, dma->prot) && do_accounting) 925 vfio_lock_acct(dma, -1, true); 926 goto pin_unwind; 927 } 928 929 pages[i] = pfn_to_page(phys_pfn); 930 931 if (iommu->dirty_page_tracking) { 932 unsigned long pgshift = __ffs(iommu->pgsize_bitmap); 933 934 /* 935 * Bitmap populated with the smallest supported page 936 * size 937 */ 938 bitmap_set(dma->bitmap, 939 (iova - dma->iova) >> pgshift, 1); 940 } 941 } 942 ret = i; 943 944 group = vfio_iommu_find_iommu_group(iommu, iommu_group); 945 if (!group->pinned_page_dirty_scope) { 946 group->pinned_page_dirty_scope = true; 947 iommu->num_non_pinned_groups--; 948 } 949 950 goto pin_done; 951 952 pin_unwind: 953 pages[i] = NULL; 954 for (j = 0; j < i; j++) { 955 dma_addr_t iova; 956 957 iova = user_iova + PAGE_SIZE * j; 958 dma = vfio_find_dma(iommu, iova, PAGE_SIZE); 959 vfio_unpin_page_external(dma, iova, do_accounting); 960 pages[j] = NULL; 961 } 962 pin_done: 963 mutex_unlock(&iommu->lock); 964 return ret; 965 } 966 967 static void vfio_iommu_type1_unpin_pages(void *iommu_data, 968 dma_addr_t user_iova, int npage) 969 { 970 struct vfio_iommu *iommu = iommu_data; 971 bool do_accounting; 972 int i; 973 974 /* Supported for v2 version only */ 975 if (WARN_ON(!iommu->v2)) 976 return; 977 978 mutex_lock(&iommu->lock); 979 980 do_accounting = list_empty(&iommu->domain_list); 981 for (i = 0; i < npage; i++) { 982 dma_addr_t iova = user_iova + PAGE_SIZE * i; 983 struct vfio_dma *dma; 984 985 dma = vfio_find_dma(iommu, iova, PAGE_SIZE); 986 if (!dma) 987 break; 988 989 vfio_unpin_page_external(dma, iova, do_accounting); 990 } 991 992 mutex_unlock(&iommu->lock); 993 994 WARN_ON(i != npage); 995 } 996 997 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, 998 struct list_head *regions, 999 struct iommu_iotlb_gather *iotlb_gather) 1000 { 1001 long unlocked = 0; 1002 struct vfio_regions *entry, *next; 1003 1004 iommu_iotlb_sync(domain->domain, iotlb_gather); 1005 1006 list_for_each_entry_safe(entry, next, regions, list) { 1007 unlocked += vfio_unpin_pages_remote(dma, 1008 entry->iova, 1009 entry->phys >> PAGE_SHIFT, 1010 entry->len >> PAGE_SHIFT, 1011 false); 1012 list_del(&entry->list); 1013 kfree(entry); 1014 } 1015 1016 cond_resched(); 1017 1018 return unlocked; 1019 } 1020 1021 /* 1022 * Generally, VFIO needs to unpin remote pages after each IOTLB flush. 1023 * Therefore, when using IOTLB flush sync interface, VFIO need to keep track 1024 * of these regions (currently using a list). 1025 * 1026 * This value specifies maximum number of regions for each IOTLB flush sync. 
1027 */ 1028 #define VFIO_IOMMU_TLB_SYNC_MAX 512 1029 1030 static size_t unmap_unpin_fast(struct vfio_domain *domain, 1031 struct vfio_dma *dma, dma_addr_t *iova, 1032 size_t len, phys_addr_t phys, long *unlocked, 1033 struct list_head *unmapped_list, 1034 int *unmapped_cnt, 1035 struct iommu_iotlb_gather *iotlb_gather) 1036 { 1037 size_t unmapped = 0; 1038 struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL); 1039 1040 if (entry) { 1041 unmapped = iommu_unmap_fast(domain->domain, *iova, len, 1042 iotlb_gather); 1043 1044 if (!unmapped) { 1045 kfree(entry); 1046 } else { 1047 entry->iova = *iova; 1048 entry->phys = phys; 1049 entry->len = unmapped; 1050 list_add_tail(&entry->list, unmapped_list); 1051 1052 *iova += unmapped; 1053 (*unmapped_cnt)++; 1054 } 1055 } 1056 1057 /* 1058 * Sync if the number of fast-unmap regions hits the limit 1059 * or in case of errors. 1060 */ 1061 if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) { 1062 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list, 1063 iotlb_gather); 1064 *unmapped_cnt = 0; 1065 } 1066 1067 return unmapped; 1068 } 1069 1070 static size_t unmap_unpin_slow(struct vfio_domain *domain, 1071 struct vfio_dma *dma, dma_addr_t *iova, 1072 size_t len, phys_addr_t phys, 1073 long *unlocked) 1074 { 1075 size_t unmapped = iommu_unmap(domain->domain, *iova, len); 1076 1077 if (unmapped) { 1078 *unlocked += vfio_unpin_pages_remote(dma, *iova, 1079 phys >> PAGE_SHIFT, 1080 unmapped >> PAGE_SHIFT, 1081 false); 1082 *iova += unmapped; 1083 cond_resched(); 1084 } 1085 return unmapped; 1086 } 1087 1088 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, 1089 bool do_accounting) 1090 { 1091 dma_addr_t iova = dma->iova, end = dma->iova + dma->size; 1092 struct vfio_domain *domain, *d; 1093 LIST_HEAD(unmapped_region_list); 1094 struct iommu_iotlb_gather iotlb_gather; 1095 int unmapped_region_cnt = 0; 1096 long unlocked = 0; 1097 1098 if (!dma->size) 1099 return 0; 1100 1101 if (list_empty(&iommu->domain_list)) 1102 return 0; 1103 1104 /* 1105 * We use the IOMMU to track the physical addresses, otherwise we'd 1106 * need a much more complicated tracking system. Unfortunately that 1107 * means we need to use one of the iommu domains to figure out the 1108 * pfns to unpin. The rest need to be unmapped in advance so we have 1109 * no iommu translations remaining when the pages are unpinned. 1110 */ 1111 domain = d = list_first_entry(&iommu->domain_list, 1112 struct vfio_domain, next); 1113 1114 list_for_each_entry_continue(d, &iommu->domain_list, next) { 1115 iommu_unmap(d->domain, dma->iova, dma->size); 1116 cond_resched(); 1117 } 1118 1119 iommu_iotlb_gather_init(&iotlb_gather); 1120 while (iova < end) { 1121 size_t unmapped, len; 1122 phys_addr_t phys, next; 1123 1124 phys = iommu_iova_to_phys(domain->domain, iova); 1125 if (WARN_ON(!phys)) { 1126 iova += PAGE_SIZE; 1127 continue; 1128 } 1129 1130 /* 1131 * To optimize for fewer iommu_unmap() calls, each of which 1132 * may require hardware cache flushing, try to find the 1133 * largest contiguous physical memory chunk to unmap. 1134 */ 1135 for (len = PAGE_SIZE; 1136 !domain->fgsp && iova + len < end; len += PAGE_SIZE) { 1137 next = iommu_iova_to_phys(domain->domain, iova + len); 1138 if (next != phys + len) 1139 break; 1140 } 1141 1142 /* 1143 * First, try to use fast unmap/unpin. In case of failure, 1144 * switch to slow unmap/unpin path. 
1145 */ 1146 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys, 1147 &unlocked, &unmapped_region_list, 1148 &unmapped_region_cnt, 1149 &iotlb_gather); 1150 if (!unmapped) { 1151 unmapped = unmap_unpin_slow(domain, dma, &iova, len, 1152 phys, &unlocked); 1153 if (WARN_ON(!unmapped)) 1154 break; 1155 } 1156 } 1157 1158 dma->iommu_mapped = false; 1159 1160 if (unmapped_region_cnt) { 1161 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list, 1162 &iotlb_gather); 1163 } 1164 1165 if (do_accounting) { 1166 vfio_lock_acct(dma, -unlocked, true); 1167 return 0; 1168 } 1169 return unlocked; 1170 } 1171 1172 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) 1173 { 1174 WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)); 1175 vfio_unmap_unpin(iommu, dma, true); 1176 vfio_unlink_dma(iommu, dma); 1177 put_task_struct(dma->task); 1178 vfio_dma_bitmap_free(dma); 1179 if (dma->vaddr_invalid) { 1180 iommu->vaddr_invalid_count--; 1181 wake_up_all(&iommu->vaddr_wait); 1182 } 1183 kfree(dma); 1184 iommu->dma_avail++; 1185 } 1186 1187 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) 1188 { 1189 struct vfio_domain *domain; 1190 1191 iommu->pgsize_bitmap = ULONG_MAX; 1192 1193 list_for_each_entry(domain, &iommu->domain_list, next) 1194 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap; 1195 1196 /* 1197 * In case the IOMMU supports page sizes smaller than PAGE_SIZE 1198 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes. 1199 * That way the user will be able to map/unmap buffers whose size/ 1200 * start address is aligned with PAGE_SIZE. Pinning code uses that 1201 * granularity while iommu driver can use the sub-PAGE_SIZE size 1202 * to map the buffer. 1203 */ 1204 if (iommu->pgsize_bitmap & ~PAGE_MASK) { 1205 iommu->pgsize_bitmap &= PAGE_MASK; 1206 iommu->pgsize_bitmap |= PAGE_SIZE; 1207 } 1208 } 1209 1210 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, 1211 struct vfio_dma *dma, dma_addr_t base_iova, 1212 size_t pgsize) 1213 { 1214 unsigned long pgshift = __ffs(pgsize); 1215 unsigned long nbits = dma->size >> pgshift; 1216 unsigned long bit_offset = (dma->iova - base_iova) >> pgshift; 1217 unsigned long copy_offset = bit_offset / BITS_PER_LONG; 1218 unsigned long shift = bit_offset % BITS_PER_LONG; 1219 unsigned long leftover; 1220 1221 /* 1222 * mark all pages dirty if any IOMMU capable device is not able 1223 * to report dirty pages and all pages are pinned and mapped. 1224 */ 1225 if (iommu->num_non_pinned_groups && dma->iommu_mapped) 1226 bitmap_set(dma->bitmap, 0, nbits); 1227 1228 if (shift) { 1229 bitmap_shift_left(dma->bitmap, dma->bitmap, shift, 1230 nbits + shift); 1231 1232 if (copy_from_user(&leftover, 1233 (void __user *)(bitmap + copy_offset), 1234 sizeof(leftover))) 1235 return -EFAULT; 1236 1237 bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift); 1238 } 1239 1240 if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap, 1241 DIRTY_BITMAP_BYTES(nbits + shift))) 1242 return -EFAULT; 1243 1244 return 0; 1245 } 1246 1247 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, 1248 dma_addr_t iova, size_t size, size_t pgsize) 1249 { 1250 struct vfio_dma *dma; 1251 struct rb_node *n; 1252 unsigned long pgshift = __ffs(pgsize); 1253 int ret; 1254 1255 /* 1256 * GET_BITMAP request must fully cover vfio_dma mappings. Multiple 1257 * vfio_dma mappings may be clubbed by specifying large ranges, but 1258 * there must not be any previous mappings bisected by the range. 
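 *
 * For illustration only, a hypothetical userspace request (uapi structs
 * from <linux/vfio.h>, embedded after a struct vfio_iommu_type1_dirty_bitmap
 * header carrying VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) covering a single
 * 1MB mapping at iova 1MB, reported at 4K granularity, could look like:
 *
 *	struct vfio_iommu_type1_dirty_bitmap_get range = {
 *		.iova   = 0x100000,
 *		.size   = 0x100000,
 *		.bitmap = { .pgsize = 4096, .size = 32, .data = user_buf },
 *	};
 *
 * where 32 bytes = 256 bits covers the 256 4K pages in the range.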
1259 * An error will be returned if these conditions are not met. 1260 */ 1261 dma = vfio_find_dma(iommu, iova, 1); 1262 if (dma && dma->iova != iova) 1263 return -EINVAL; 1264 1265 dma = vfio_find_dma(iommu, iova + size - 1, 0); 1266 if (dma && dma->iova + dma->size != iova + size) 1267 return -EINVAL; 1268 1269 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { 1270 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); 1271 1272 if (dma->iova < iova) 1273 continue; 1274 1275 if (dma->iova > iova + size - 1) 1276 break; 1277 1278 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize); 1279 if (ret) 1280 return ret; 1281 1282 /* 1283 * Re-populate bitmap to include all pinned pages which are 1284 * considered as dirty but exclude pages which are unpinned and 1285 * pages which are marked dirty by vfio_dma_rw() 1286 */ 1287 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); 1288 vfio_dma_populate_bitmap(dma, pgsize); 1289 } 1290 return 0; 1291 } 1292 1293 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) 1294 { 1295 if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) || 1296 (bitmap_size < DIRTY_BITMAP_BYTES(npages))) 1297 return -EINVAL; 1298 1299 return 0; 1300 } 1301 1302 /* 1303 * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate 1304 * and unmap iovas within the range we're about to unmap. Drivers MUST unpin 1305 * pages in response to an invalidation. 1306 */ 1307 static void vfio_notify_dma_unmap(struct vfio_iommu *iommu, 1308 struct vfio_dma *dma) 1309 { 1310 struct vfio_device *device; 1311 1312 if (list_empty(&iommu->device_list)) 1313 return; 1314 1315 /* 1316 * The device is expected to call vfio_unpin_pages() for any IOVA it has 1317 * pinned within the range. Since vfio_unpin_pages() will eventually 1318 * call back down to this code and try to obtain the iommu->lock we must 1319 * drop it. 1320 */ 1321 mutex_lock(&iommu->device_list_lock); 1322 mutex_unlock(&iommu->lock); 1323 1324 list_for_each_entry(device, &iommu->device_list, iommu_entry) 1325 device->ops->dma_unmap(device, dma->iova, dma->size); 1326 1327 mutex_unlock(&iommu->device_list_lock); 1328 mutex_lock(&iommu->lock); 1329 } 1330 1331 static int vfio_dma_do_unmap(struct vfio_iommu *iommu, 1332 struct vfio_iommu_type1_dma_unmap *unmap, 1333 struct vfio_bitmap *bitmap) 1334 { 1335 struct vfio_dma *dma, *dma_last = NULL; 1336 size_t unmapped = 0, pgsize; 1337 int ret = -EINVAL, retries = 0; 1338 unsigned long pgshift; 1339 dma_addr_t iova = unmap->iova; 1340 u64 size = unmap->size; 1341 bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; 1342 bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR; 1343 struct rb_node *n, *first_n; 1344 1345 mutex_lock(&iommu->lock); 1346 1347 pgshift = __ffs(iommu->pgsize_bitmap); 1348 pgsize = (size_t)1 << pgshift; 1349 1350 if (iova & (pgsize - 1)) 1351 goto unlock; 1352 1353 if (unmap_all) { 1354 if (iova || size) 1355 goto unlock; 1356 size = U64_MAX; 1357 } else if (!size || size & (pgsize - 1) || 1358 iova + size - 1 < iova || size > SIZE_MAX) { 1359 goto unlock; 1360 } 1361 1362 /* When dirty tracking is enabled, allow only min supported pgsize */ 1363 if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && 1364 (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { 1365 goto unlock; 1366 } 1367 1368 WARN_ON((pgsize - 1) & PAGE_MASK); 1369 again: 1370 /* 1371 * vfio-iommu-type1 (v1) - User mappings were coalesced together to 1372 * avoid tracking individual mappings. 
This means that the granularity 1373 * of the original mapping was lost and the user was allowed to attempt 1374 * to unmap any range. Depending on the contiguousness of physical 1375 * memory and page sizes supported by the IOMMU, arbitrary unmaps may 1376 * or may not have worked. We only guaranteed unmap granularity 1377 * matching the original mapping; even though it was untracked here, 1378 * the original mappings are reflected in IOMMU mappings. This 1379 * resulted in a couple unusual behaviors. First, if a range is not 1380 * able to be unmapped, ex. a set of 4k pages that was mapped as a 1381 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with 1382 * a zero sized unmap. Also, if an unmap request overlaps the first 1383 * address of a hugepage, the IOMMU will unmap the entire hugepage. 1384 * This also returns success and the returned unmap size reflects the 1385 * actual size unmapped. 1386 * 1387 * We attempt to maintain compatibility with this "v1" interface, but 1388 * we take control out of the hands of the IOMMU. Therefore, an unmap 1389 * request offset from the beginning of the original mapping will 1390 * return success with zero sized unmap. And an unmap request covering 1391 * the first iova of mapping will unmap the entire range. 1392 * 1393 * The v2 version of this interface intends to be more deterministic. 1394 * Unmap requests must fully cover previous mappings. Multiple 1395 * mappings may still be unmaped by specifying large ranges, but there 1396 * must not be any previous mappings bisected by the range. An error 1397 * will be returned if these conditions are not met. The v2 interface 1398 * will only return success and a size of zero if there were no 1399 * mappings within the range. 1400 */ 1401 if (iommu->v2 && !unmap_all) { 1402 dma = vfio_find_dma(iommu, iova, 1); 1403 if (dma && dma->iova != iova) 1404 goto unlock; 1405 1406 dma = vfio_find_dma(iommu, iova + size - 1, 0); 1407 if (dma && dma->iova + dma->size != iova + size) 1408 goto unlock; 1409 } 1410 1411 ret = 0; 1412 n = first_n = vfio_find_dma_first_node(iommu, iova, size); 1413 1414 while (n) { 1415 dma = rb_entry(n, struct vfio_dma, node); 1416 if (dma->iova >= iova + size) 1417 break; 1418 1419 if (!iommu->v2 && iova > dma->iova) 1420 break; 1421 1422 if (invalidate_vaddr) { 1423 if (dma->vaddr_invalid) { 1424 struct rb_node *last_n = n; 1425 1426 for (n = first_n; n != last_n; n = rb_next(n)) { 1427 dma = rb_entry(n, 1428 struct vfio_dma, node); 1429 dma->vaddr_invalid = false; 1430 iommu->vaddr_invalid_count--; 1431 } 1432 ret = -EINVAL; 1433 unmapped = 0; 1434 break; 1435 } 1436 dma->vaddr_invalid = true; 1437 iommu->vaddr_invalid_count++; 1438 unmapped += dma->size; 1439 n = rb_next(n); 1440 continue; 1441 } 1442 1443 if (!RB_EMPTY_ROOT(&dma->pfn_list)) { 1444 if (dma_last == dma) { 1445 BUG_ON(++retries > 10); 1446 } else { 1447 dma_last = dma; 1448 retries = 0; 1449 } 1450 1451 vfio_notify_dma_unmap(iommu, dma); 1452 goto again; 1453 } 1454 1455 if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { 1456 ret = update_user_bitmap(bitmap->data, iommu, dma, 1457 iova, pgsize); 1458 if (ret) 1459 break; 1460 } 1461 1462 unmapped += dma->size; 1463 n = rb_next(n); 1464 vfio_remove_dma(iommu, dma); 1465 } 1466 1467 unlock: 1468 mutex_unlock(&iommu->lock); 1469 1470 /* Report how much was unmapped */ 1471 unmap->size = unmapped; 1472 1473 return ret; 1474 } 1475 1476 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, 1477 unsigned long pfn, long npage, int 
prot) 1478 { 1479 struct vfio_domain *d; 1480 int ret; 1481 1482 list_for_each_entry(d, &iommu->domain_list, next) { 1483 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, 1484 npage << PAGE_SHIFT, prot | IOMMU_CACHE); 1485 if (ret) 1486 goto unwind; 1487 1488 cond_resched(); 1489 } 1490 1491 return 0; 1492 1493 unwind: 1494 list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) { 1495 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT); 1496 cond_resched(); 1497 } 1498 1499 return ret; 1500 } 1501 1502 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, 1503 size_t map_size) 1504 { 1505 dma_addr_t iova = dma->iova; 1506 unsigned long vaddr = dma->vaddr; 1507 struct vfio_batch batch; 1508 size_t size = map_size; 1509 long npage; 1510 unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 1511 int ret = 0; 1512 1513 vfio_batch_init(&batch); 1514 1515 while (size) { 1516 /* Pin a contiguous chunk of memory */ 1517 npage = vfio_pin_pages_remote(dma, vaddr + dma->size, 1518 size >> PAGE_SHIFT, &pfn, limit, 1519 &batch); 1520 if (npage <= 0) { 1521 WARN_ON(!npage); 1522 ret = (int)npage; 1523 break; 1524 } 1525 1526 /* Map it! */ 1527 ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, 1528 dma->prot); 1529 if (ret) { 1530 vfio_unpin_pages_remote(dma, iova + dma->size, pfn, 1531 npage, true); 1532 vfio_batch_unpin(&batch, dma); 1533 break; 1534 } 1535 1536 size -= npage << PAGE_SHIFT; 1537 dma->size += npage << PAGE_SHIFT; 1538 } 1539 1540 vfio_batch_fini(&batch); 1541 dma->iommu_mapped = true; 1542 1543 if (ret) 1544 vfio_remove_dma(iommu, dma); 1545 1546 return ret; 1547 } 1548 1549 /* 1550 * Check dma map request is within a valid iova range 1551 */ 1552 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu, 1553 dma_addr_t start, dma_addr_t end) 1554 { 1555 struct list_head *iova = &iommu->iova_list; 1556 struct vfio_iova *node; 1557 1558 list_for_each_entry(node, iova, list) { 1559 if (start >= node->start && end <= node->end) 1560 return true; 1561 } 1562 1563 /* 1564 * Check for list_empty() as well since a container with 1565 * a single mdev device will have an empty list. 
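 *
 * The ranges checked here bound what a userspace VFIO_IOMMU_MAP_DMA
 * request may ask for.  A hypothetical sketch of such a request, using
 * the uapi from <linux/vfio.h>, with buf and container_fd supplied by
 * the caller:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = 0x100000,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 *
 * vaddr, iova and size must all be aligned to the minimum supported
 * IOMMU page size, as enforced in vfio_dma_do_map() below.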
1566 */ 1567 return list_empty(iova); 1568 } 1569 1570 static int vfio_dma_do_map(struct vfio_iommu *iommu, 1571 struct vfio_iommu_type1_dma_map *map) 1572 { 1573 bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR; 1574 dma_addr_t iova = map->iova; 1575 unsigned long vaddr = map->vaddr; 1576 size_t size = map->size; 1577 int ret = 0, prot = 0; 1578 size_t pgsize; 1579 struct vfio_dma *dma; 1580 1581 /* Verify that none of our __u64 fields overflow */ 1582 if (map->size != size || map->vaddr != vaddr || map->iova != iova) 1583 return -EINVAL; 1584 1585 /* READ/WRITE from device perspective */ 1586 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) 1587 prot |= IOMMU_WRITE; 1588 if (map->flags & VFIO_DMA_MAP_FLAG_READ) 1589 prot |= IOMMU_READ; 1590 1591 if ((prot && set_vaddr) || (!prot && !set_vaddr)) 1592 return -EINVAL; 1593 1594 mutex_lock(&iommu->lock); 1595 1596 pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); 1597 1598 WARN_ON((pgsize - 1) & PAGE_MASK); 1599 1600 if (!size || (size | iova | vaddr) & (pgsize - 1)) { 1601 ret = -EINVAL; 1602 goto out_unlock; 1603 } 1604 1605 /* Don't allow IOVA or virtual address wrap */ 1606 if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) { 1607 ret = -EINVAL; 1608 goto out_unlock; 1609 } 1610 1611 dma = vfio_find_dma(iommu, iova, size); 1612 if (set_vaddr) { 1613 if (!dma) { 1614 ret = -ENOENT; 1615 } else if (!dma->vaddr_invalid || dma->iova != iova || 1616 dma->size != size) { 1617 ret = -EINVAL; 1618 } else { 1619 dma->vaddr = vaddr; 1620 dma->vaddr_invalid = false; 1621 iommu->vaddr_invalid_count--; 1622 wake_up_all(&iommu->vaddr_wait); 1623 } 1624 goto out_unlock; 1625 } else if (dma) { 1626 ret = -EEXIST; 1627 goto out_unlock; 1628 } 1629 1630 if (!iommu->dma_avail) { 1631 ret = -ENOSPC; 1632 goto out_unlock; 1633 } 1634 1635 if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) { 1636 ret = -EINVAL; 1637 goto out_unlock; 1638 } 1639 1640 dma = kzalloc(sizeof(*dma), GFP_KERNEL); 1641 if (!dma) { 1642 ret = -ENOMEM; 1643 goto out_unlock; 1644 } 1645 1646 iommu->dma_avail--; 1647 dma->iova = iova; 1648 dma->vaddr = vaddr; 1649 dma->prot = prot; 1650 1651 /* 1652 * We need to be able to both add to a task's locked memory and test 1653 * against the locked memory limit and we need to be able to do both 1654 * outside of this call path as pinning can be asynchronous via the 1655 * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a 1656 * task_struct and VM locked pages requires an mm_struct, however 1657 * holding an indefinite mm reference is not recommended, therefore we 1658 * only hold a reference to a task. We could hold a reference to 1659 * current, however QEMU uses this call path through vCPU threads, 1660 * which can be killed resulting in a NULL mm and failure in the unmap 1661 * path when called via a different thread. Avoid this problem by 1662 * using the group_leader as threads within the same group require 1663 * both CLONE_THREAD and CLONE_VM and will therefore use the same 1664 * mm_struct. 1665 * 1666 * Previously we also used the task for testing CAP_IPC_LOCK at the 1667 * time of pinning and accounting, however has_capability() makes use 1668 * of real_cred, a copy-on-write field, so we can't guarantee that it 1669 * matches group_leader, or in fact that it might not change by the 1670 * time it's evaluated. 
If a process were to call MAP_DMA with 1671 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they 1672 * possibly see different results for an iommu_mapped vfio_dma vs 1673 * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the 1674 * time of calling MAP_DMA. 1675 */ 1676 get_task_struct(current->group_leader); 1677 dma->task = current->group_leader; 1678 dma->lock_cap = capable(CAP_IPC_LOCK); 1679 1680 dma->pfn_list = RB_ROOT; 1681 1682 /* Insert zero-sized and grow as we map chunks of it */ 1683 vfio_link_dma(iommu, dma); 1684 1685 /* Don't pin and map if container doesn't contain IOMMU capable domain*/ 1686 if (list_empty(&iommu->domain_list)) 1687 dma->size = size; 1688 else 1689 ret = vfio_pin_map_dma(iommu, dma, size); 1690 1691 if (!ret && iommu->dirty_page_tracking) { 1692 ret = vfio_dma_bitmap_alloc(dma, pgsize); 1693 if (ret) 1694 vfio_remove_dma(iommu, dma); 1695 } 1696 1697 out_unlock: 1698 mutex_unlock(&iommu->lock); 1699 return ret; 1700 } 1701 1702 static int vfio_iommu_replay(struct vfio_iommu *iommu, 1703 struct vfio_domain *domain) 1704 { 1705 struct vfio_batch batch; 1706 struct vfio_domain *d = NULL; 1707 struct rb_node *n; 1708 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 1709 int ret; 1710 1711 ret = vfio_wait_all_valid(iommu); 1712 if (ret < 0) 1713 return ret; 1714 1715 /* Arbitrarily pick the first domain in the list for lookups */ 1716 if (!list_empty(&iommu->domain_list)) 1717 d = list_first_entry(&iommu->domain_list, 1718 struct vfio_domain, next); 1719 1720 vfio_batch_init(&batch); 1721 1722 n = rb_first(&iommu->dma_list); 1723 1724 for (; n; n = rb_next(n)) { 1725 struct vfio_dma *dma; 1726 dma_addr_t iova; 1727 1728 dma = rb_entry(n, struct vfio_dma, node); 1729 iova = dma->iova; 1730 1731 while (iova < dma->iova + dma->size) { 1732 phys_addr_t phys; 1733 size_t size; 1734 1735 if (dma->iommu_mapped) { 1736 phys_addr_t p; 1737 dma_addr_t i; 1738 1739 if (WARN_ON(!d)) { /* mapped w/o a domain?! 
*/ 1740 ret = -EINVAL; 1741 goto unwind; 1742 } 1743 1744 phys = iommu_iova_to_phys(d->domain, iova); 1745 1746 if (WARN_ON(!phys)) { 1747 iova += PAGE_SIZE; 1748 continue; 1749 } 1750 1751 size = PAGE_SIZE; 1752 p = phys + size; 1753 i = iova + size; 1754 while (i < dma->iova + dma->size && 1755 p == iommu_iova_to_phys(d->domain, i)) { 1756 size += PAGE_SIZE; 1757 p += PAGE_SIZE; 1758 i += PAGE_SIZE; 1759 } 1760 } else { 1761 unsigned long pfn; 1762 unsigned long vaddr = dma->vaddr + 1763 (iova - dma->iova); 1764 size_t n = dma->iova + dma->size - iova; 1765 long npage; 1766 1767 npage = vfio_pin_pages_remote(dma, vaddr, 1768 n >> PAGE_SHIFT, 1769 &pfn, limit, 1770 &batch); 1771 if (npage <= 0) { 1772 WARN_ON(!npage); 1773 ret = (int)npage; 1774 goto unwind; 1775 } 1776 1777 phys = pfn << PAGE_SHIFT; 1778 size = npage << PAGE_SHIFT; 1779 } 1780 1781 ret = iommu_map(domain->domain, iova, phys, 1782 size, dma->prot | IOMMU_CACHE); 1783 if (ret) { 1784 if (!dma->iommu_mapped) { 1785 vfio_unpin_pages_remote(dma, iova, 1786 phys >> PAGE_SHIFT, 1787 size >> PAGE_SHIFT, 1788 true); 1789 vfio_batch_unpin(&batch, dma); 1790 } 1791 goto unwind; 1792 } 1793 1794 iova += size; 1795 } 1796 } 1797 1798 /* All dmas are now mapped, defer to second tree walk for unwind */ 1799 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { 1800 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); 1801 1802 dma->iommu_mapped = true; 1803 } 1804 1805 vfio_batch_fini(&batch); 1806 return 0; 1807 1808 unwind: 1809 for (; n; n = rb_prev(n)) { 1810 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); 1811 dma_addr_t iova; 1812 1813 if (dma->iommu_mapped) { 1814 iommu_unmap(domain->domain, dma->iova, dma->size); 1815 continue; 1816 } 1817 1818 iova = dma->iova; 1819 while (iova < dma->iova + dma->size) { 1820 phys_addr_t phys, p; 1821 size_t size; 1822 dma_addr_t i; 1823 1824 phys = iommu_iova_to_phys(domain->domain, iova); 1825 if (!phys) { 1826 iova += PAGE_SIZE; 1827 continue; 1828 } 1829 1830 size = PAGE_SIZE; 1831 p = phys + size; 1832 i = iova + size; 1833 while (i < dma->iova + dma->size && 1834 p == iommu_iova_to_phys(domain->domain, i)) { 1835 size += PAGE_SIZE; 1836 p += PAGE_SIZE; 1837 i += PAGE_SIZE; 1838 } 1839 1840 iommu_unmap(domain->domain, iova, size); 1841 vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT, 1842 size >> PAGE_SHIFT, true); 1843 } 1844 } 1845 1846 vfio_batch_fini(&batch); 1847 return ret; 1848 } 1849 1850 /* 1851 * We change our unmap behavior slightly depending on whether the IOMMU 1852 * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage 1853 * for practically any contiguous power-of-two mapping we give it. This means 1854 * we don't need to look for contiguous chunks ourselves to make unmapping 1855 * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d 1856 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks 1857 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when 1858 * hugetlbfs is in use. 
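 *
 * The probe below infers this empirically: it maps two contiguous pages
 * with a single 8KiB (2 * PAGE_SIZE) iommu_map() call and then asks the
 * driver to unmap only the first PAGE_SIZE.  If exactly PAGE_SIZE comes
 * back, the driver used discrete PTEs and the second page is unmapped
 * explicitly; any other result means the driver merged the pair into a
 * single superpage that was torn down as a whole, so fgsp is set and
 * vfio_unmap_unpin() skips its own contiguous-chunk search.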
1859 */ 1860 static void vfio_test_domain_fgsp(struct vfio_domain *domain) 1861 { 1862 struct page *pages; 1863 int ret, order = get_order(PAGE_SIZE * 2); 1864 1865 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); 1866 if (!pages) 1867 return; 1868 1869 ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2, 1870 IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE); 1871 if (!ret) { 1872 size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE); 1873 1874 if (unmapped == PAGE_SIZE) 1875 iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE); 1876 else 1877 domain->fgsp = true; 1878 } 1879 1880 __free_pages(pages, order); 1881 } 1882 1883 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain, 1884 struct iommu_group *iommu_group) 1885 { 1886 struct vfio_iommu_group *g; 1887 1888 list_for_each_entry(g, &domain->group_list, next) { 1889 if (g->iommu_group == iommu_group) 1890 return g; 1891 } 1892 1893 return NULL; 1894 } 1895 1896 static struct vfio_iommu_group* 1897 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, 1898 struct iommu_group *iommu_group) 1899 { 1900 struct vfio_iommu_group *group; 1901 struct vfio_domain *domain; 1902 1903 list_for_each_entry(domain, &iommu->domain_list, next) { 1904 group = find_iommu_group(domain, iommu_group); 1905 if (group) 1906 return group; 1907 } 1908 1909 list_for_each_entry(group, &iommu->emulated_iommu_groups, next) 1910 if (group->iommu_group == iommu_group) 1911 return group; 1912 return NULL; 1913 } 1914 1915 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, 1916 phys_addr_t *base) 1917 { 1918 struct iommu_resv_region *region; 1919 bool ret = false; 1920 1921 list_for_each_entry(region, group_resv_regions, list) { 1922 /* 1923 * The presence of any 'real' MSI regions should take 1924 * precedence over the software-managed one if the 1925 * IOMMU driver happens to advertise both types. 1926 */ 1927 if (region->type == IOMMU_RESV_MSI) { 1928 ret = false; 1929 break; 1930 } 1931 1932 if (region->type == IOMMU_RESV_SW_MSI) { 1933 *base = region->start; 1934 ret = true; 1935 } 1936 } 1937 1938 return ret; 1939 } 1940 1941 /* 1942 * This is a helper function to insert an address range to iova list. 1943 * The list is initially created with a single entry corresponding to 1944 * the IOMMU domain geometry to which the device group is attached. 1945 * The list aperture gets modified when a new domain is added to the 1946 * container if the new aperture doesn't conflict with the current one 1947 * or with any existing dma mappings. The list is also modified to 1948 * exclude any reserved regions associated with the device group. 1949 */ 1950 static int vfio_iommu_iova_insert(struct list_head *head, 1951 dma_addr_t start, dma_addr_t end) 1952 { 1953 struct vfio_iova *region; 1954 1955 region = kmalloc(sizeof(*region), GFP_KERNEL); 1956 if (!region) 1957 return -ENOMEM; 1958 1959 INIT_LIST_HEAD(®ion->list); 1960 region->start = start; 1961 region->end = end; 1962 1963 list_add_tail(®ion->list, head); 1964 return 0; 1965 } 1966 1967 /* 1968 * Check the new iommu aperture conflicts with existing aper or with any 1969 * existing dma mappings. 
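 *
 * For instance (hypothetical geometry), if the current list spans
 * [0, 2^48 - 1] and a newly attached domain only reports an aperture of
 * [0, 2^39 - 1], the attach conflicts only if some existing mapping lies
 * above 2^39 - 1; otherwise the list is simply narrowed later by
 * vfio_iommu_aper_resize().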
1970 */ 1971 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu, 1972 dma_addr_t start, dma_addr_t end) 1973 { 1974 struct vfio_iova *first, *last; 1975 struct list_head *iova = &iommu->iova_list; 1976 1977 if (list_empty(iova)) 1978 return false; 1979 1980 /* Disjoint sets, return conflict */ 1981 first = list_first_entry(iova, struct vfio_iova, list); 1982 last = list_last_entry(iova, struct vfio_iova, list); 1983 if (start > last->end || end < first->start) 1984 return true; 1985 1986 /* Check for any existing dma mappings below the new start */ 1987 if (start > first->start) { 1988 if (vfio_find_dma(iommu, first->start, start - first->start)) 1989 return true; 1990 } 1991 1992 /* Check for any existing dma mappings beyond the new end */ 1993 if (end < last->end) { 1994 if (vfio_find_dma(iommu, end + 1, last->end - end)) 1995 return true; 1996 } 1997 1998 return false; 1999 } 2000 2001 /* 2002 * Resize iommu iova aperture window. This is called only if the new 2003 * aperture has no conflict with existing aperture and dma mappings. 2004 */ 2005 static int vfio_iommu_aper_resize(struct list_head *iova, 2006 dma_addr_t start, dma_addr_t end) 2007 { 2008 struct vfio_iova *node, *next; 2009 2010 if (list_empty(iova)) 2011 return vfio_iommu_iova_insert(iova, start, end); 2012 2013 /* Adjust iova list start */ 2014 list_for_each_entry_safe(node, next, iova, list) { 2015 if (start < node->start) 2016 break; 2017 if (start >= node->start && start < node->end) { 2018 node->start = start; 2019 break; 2020 } 2021 /* Delete nodes before new start */ 2022 list_del(&node->list); 2023 kfree(node); 2024 } 2025 2026 /* Adjust iova list end */ 2027 list_for_each_entry_safe(node, next, iova, list) { 2028 if (end > node->end) 2029 continue; 2030 if (end > node->start && end <= node->end) { 2031 node->end = end; 2032 continue; 2033 } 2034 /* Delete nodes after new end */ 2035 list_del(&node->list); 2036 kfree(node); 2037 } 2038 2039 return 0; 2040 } 2041 2042 /* 2043 * Check reserved region conflicts with existing dma mappings 2044 */ 2045 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu, 2046 struct list_head *resv_regions) 2047 { 2048 struct iommu_resv_region *region; 2049 2050 /* Check for conflict with existing dma mappings */ 2051 list_for_each_entry(region, resv_regions, list) { 2052 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE) 2053 continue; 2054 2055 if (vfio_find_dma(iommu, region->start, region->length)) 2056 return true; 2057 } 2058 2059 return false; 2060 } 2061 2062 /* 2063 * Check iova region overlap with reserved regions and 2064 * exclude them from the iommu iova range 2065 */ 2066 static int vfio_iommu_resv_exclude(struct list_head *iova, 2067 struct list_head *resv_regions) 2068 { 2069 struct iommu_resv_region *resv; 2070 struct vfio_iova *n, *next; 2071 2072 list_for_each_entry(resv, resv_regions, list) { 2073 phys_addr_t start, end; 2074 2075 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) 2076 continue; 2077 2078 start = resv->start; 2079 end = resv->start + resv->length - 1; 2080 2081 list_for_each_entry_safe(n, next, iova, list) { 2082 int ret = 0; 2083 2084 /* No overlap */ 2085 if (start > n->end || end < n->start) 2086 continue; 2087 /* 2088 * Insert a new node if current node overlaps with the 2089 * reserve region to exclude that from valid iova range. 2090 * Note that, new node is inserted before the current 2091 * node and finally the current node is deleted keeping 2092 * the list updated and sorted. 
2093 */ 2094 if (start > n->start) 2095 ret = vfio_iommu_iova_insert(&n->list, n->start, 2096 start - 1); 2097 if (!ret && end < n->end) 2098 ret = vfio_iommu_iova_insert(&n->list, end + 1, 2099 n->end); 2100 if (ret) 2101 return ret; 2102 2103 list_del(&n->list); 2104 kfree(n); 2105 } 2106 } 2107 2108 if (list_empty(iova)) 2109 return -EINVAL; 2110 2111 return 0; 2112 } 2113 2114 static void vfio_iommu_resv_free(struct list_head *resv_regions) 2115 { 2116 struct iommu_resv_region *n, *next; 2117 2118 list_for_each_entry_safe(n, next, resv_regions, list) { 2119 list_del(&n->list); 2120 kfree(n); 2121 } 2122 } 2123 2124 static void vfio_iommu_iova_free(struct list_head *iova) 2125 { 2126 struct vfio_iova *n, *next; 2127 2128 list_for_each_entry_safe(n, next, iova, list) { 2129 list_del(&n->list); 2130 kfree(n); 2131 } 2132 } 2133 2134 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu, 2135 struct list_head *iova_copy) 2136 { 2137 struct list_head *iova = &iommu->iova_list; 2138 struct vfio_iova *n; 2139 int ret; 2140 2141 list_for_each_entry(n, iova, list) { 2142 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end); 2143 if (ret) 2144 goto out_free; 2145 } 2146 2147 return 0; 2148 2149 out_free: 2150 vfio_iommu_iova_free(iova_copy); 2151 return ret; 2152 } 2153 2154 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, 2155 struct list_head *iova_copy) 2156 { 2157 struct list_head *iova = &iommu->iova_list; 2158 2159 vfio_iommu_iova_free(iova); 2160 2161 list_splice_tail(iova_copy, iova); 2162 } 2163 2164 /* Redundantly walks non-present capabilities to simplify caller */ 2165 static int vfio_iommu_device_capable(struct device *dev, void *data) 2166 { 2167 return device_iommu_capable(dev, (enum iommu_cap)data); 2168 } 2169 2170 static int vfio_iommu_domain_alloc(struct device *dev, void *data) 2171 { 2172 struct iommu_domain **domain = data; 2173 2174 *domain = iommu_domain_alloc(dev->bus); 2175 return 1; /* Don't iterate */ 2176 } 2177 2178 static int vfio_iommu_type1_attach_group(void *iommu_data, 2179 struct iommu_group *iommu_group, enum vfio_group_type type) 2180 { 2181 struct vfio_iommu *iommu = iommu_data; 2182 struct vfio_iommu_group *group; 2183 struct vfio_domain *domain, *d; 2184 bool resv_msi, msi_remap; 2185 phys_addr_t resv_msi_base = 0; 2186 struct iommu_domain_geometry *geo; 2187 LIST_HEAD(iova_copy); 2188 LIST_HEAD(group_resv_regions); 2189 int ret = -EINVAL; 2190 2191 mutex_lock(&iommu->lock); 2192 2193 /* Check for duplicates */ 2194 if (vfio_iommu_find_iommu_group(iommu, iommu_group)) 2195 goto out_unlock; 2196 2197 ret = -ENOMEM; 2198 group = kzalloc(sizeof(*group), GFP_KERNEL); 2199 if (!group) 2200 goto out_unlock; 2201 group->iommu_group = iommu_group; 2202 2203 if (type == VFIO_EMULATED_IOMMU) { 2204 list_add(&group->next, &iommu->emulated_iommu_groups); 2205 /* 2206 * An emulated IOMMU group cannot dirty memory directly, it can 2207 * only use interfaces that provide dirty tracking. 2208 * The iommu scope can only be promoted with the addition of a 2209 * dirty tracking group. 2210 */ 2211 group->pinned_page_dirty_scope = true; 2212 ret = 0; 2213 goto out_unlock; 2214 } 2215 2216 ret = -ENOMEM; 2217 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 2218 if (!domain) 2219 goto out_free_group; 2220 2221 /* 2222 * Going via the iommu_group iterator avoids races, and trivially gives 2223 * us a representative device for the IOMMU API call. We don't actually 2224 * want to iterate beyond the first device (if any). 
2225 */ 2226 ret = -EIO; 2227 iommu_group_for_each_dev(iommu_group, &domain->domain, 2228 vfio_iommu_domain_alloc); 2229 if (!domain->domain) 2230 goto out_free_domain; 2231 2232 if (iommu->nesting) { 2233 ret = iommu_enable_nesting(domain->domain); 2234 if (ret) 2235 goto out_domain; 2236 } 2237 2238 ret = iommu_attach_group(domain->domain, group->iommu_group); 2239 if (ret) 2240 goto out_domain; 2241 2242 /* Get aperture info */ 2243 geo = &domain->domain->geometry; 2244 if (vfio_iommu_aper_conflict(iommu, geo->aperture_start, 2245 geo->aperture_end)) { 2246 ret = -EINVAL; 2247 goto out_detach; 2248 } 2249 2250 ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions); 2251 if (ret) 2252 goto out_detach; 2253 2254 if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) { 2255 ret = -EINVAL; 2256 goto out_detach; 2257 } 2258 2259 /* 2260 * We don't want to work on the original iova list as the list 2261 * gets modified and in case of failure we have to retain the 2262 * original list. Get a copy here. 2263 */ 2264 ret = vfio_iommu_iova_get_copy(iommu, &iova_copy); 2265 if (ret) 2266 goto out_detach; 2267 2268 ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start, 2269 geo->aperture_end); 2270 if (ret) 2271 goto out_detach; 2272 2273 ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions); 2274 if (ret) 2275 goto out_detach; 2276 2277 resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base); 2278 2279 INIT_LIST_HEAD(&domain->group_list); 2280 list_add(&group->next, &domain->group_list); 2281 2282 msi_remap = irq_domain_check_msi_remap() || 2283 iommu_group_for_each_dev(iommu_group, (void *)IOMMU_CAP_INTR_REMAP, 2284 vfio_iommu_device_capable); 2285 2286 if (!allow_unsafe_interrupts && !msi_remap) { 2287 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", 2288 __func__); 2289 ret = -EPERM; 2290 goto out_detach; 2291 } 2292 2293 /* 2294 * If the IOMMU can block non-coherent operations (ie PCIe TLPs with 2295 * no-snoop set) then VFIO always turns this feature on because on Intel 2296 * platforms it optimizes KVM to disable wbinvd emulation. 2297 */ 2298 if (domain->domain->ops->enforce_cache_coherency) 2299 domain->enforce_cache_coherency = 2300 domain->domain->ops->enforce_cache_coherency( 2301 domain->domain); 2302 2303 /* 2304 * Try to match an existing compatible domain. We don't want to 2305 * preclude an IOMMU driver supporting multiple bus_types and being 2306 * able to include different bus_types in the same IOMMU domain, so 2307 * we test whether the domains use the same iommu_ops rather than 2308 * testing if they're on the same bus_type. 
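 *
 * For example, a second group sitting behind the same VT-d or AMD-Vi
 * instance will normally share iommu_ops and the same
 * enforce_cache_coherency result, so it is attached to the already
 * existing domain and the newly allocated one is freed.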
2309 */ 2310 list_for_each_entry(d, &iommu->domain_list, next) { 2311 if (d->domain->ops == domain->domain->ops && 2312 d->enforce_cache_coherency == 2313 domain->enforce_cache_coherency) { 2314 iommu_detach_group(domain->domain, group->iommu_group); 2315 if (!iommu_attach_group(d->domain, 2316 group->iommu_group)) { 2317 list_add(&group->next, &d->group_list); 2318 iommu_domain_free(domain->domain); 2319 kfree(domain); 2320 goto done; 2321 } 2322 2323 ret = iommu_attach_group(domain->domain, 2324 group->iommu_group); 2325 if (ret) 2326 goto out_domain; 2327 } 2328 } 2329 2330 vfio_test_domain_fgsp(domain); 2331 2332 /* replay mappings on new domains */ 2333 ret = vfio_iommu_replay(iommu, domain); 2334 if (ret) 2335 goto out_detach; 2336 2337 if (resv_msi) { 2338 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); 2339 if (ret && ret != -ENODEV) 2340 goto out_detach; 2341 } 2342 2343 list_add(&domain->next, &iommu->domain_list); 2344 vfio_update_pgsize_bitmap(iommu); 2345 done: 2346 /* Delete the old one and insert new iova list */ 2347 vfio_iommu_iova_insert_copy(iommu, &iova_copy); 2348 2349 /* 2350 * An iommu backed group can dirty memory directly and therefore 2351 * demotes the iommu scope until it declares itself dirty tracking 2352 * capable via the page pinning interface. 2353 */ 2354 iommu->num_non_pinned_groups++; 2355 mutex_unlock(&iommu->lock); 2356 vfio_iommu_resv_free(&group_resv_regions); 2357 2358 return 0; 2359 2360 out_detach: 2361 iommu_detach_group(domain->domain, group->iommu_group); 2362 out_domain: 2363 iommu_domain_free(domain->domain); 2364 vfio_iommu_iova_free(&iova_copy); 2365 vfio_iommu_resv_free(&group_resv_regions); 2366 out_free_domain: 2367 kfree(domain); 2368 out_free_group: 2369 kfree(group); 2370 out_unlock: 2371 mutex_unlock(&iommu->lock); 2372 return ret; 2373 } 2374 2375 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) 2376 { 2377 struct rb_node *node; 2378 2379 while ((node = rb_first(&iommu->dma_list))) 2380 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); 2381 } 2382 2383 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu) 2384 { 2385 struct rb_node *n, *p; 2386 2387 n = rb_first(&iommu->dma_list); 2388 for (; n; n = rb_next(n)) { 2389 struct vfio_dma *dma; 2390 long locked = 0, unlocked = 0; 2391 2392 dma = rb_entry(n, struct vfio_dma, node); 2393 unlocked += vfio_unmap_unpin(iommu, dma, false); 2394 p = rb_first(&dma->pfn_list); 2395 for (; p; p = rb_next(p)) { 2396 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, 2397 node); 2398 2399 if (!is_invalid_reserved_pfn(vpfn->pfn)) 2400 locked++; 2401 } 2402 vfio_lock_acct(dma, locked - unlocked, true); 2403 } 2404 } 2405 2406 /* 2407 * Called when a domain is removed in detach. It is possible that 2408 * the removed domain decided the iova aperture window. Modify the 2409 * iova aperture with the smallest window among existing domains. 2410 */ 2411 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu, 2412 struct list_head *iova_copy) 2413 { 2414 struct vfio_domain *domain; 2415 struct vfio_iova *node; 2416 dma_addr_t start = 0; 2417 dma_addr_t end = (dma_addr_t)~0; 2418 2419 if (list_empty(iova_copy)) 2420 return; 2421 2422 list_for_each_entry(domain, &iommu->domain_list, next) { 2423 struct iommu_domain_geometry *geo = &domain->domain->geometry; 2424 2425 if (geo->aperture_start > start) 2426 start = geo->aperture_start; 2427 if (geo->aperture_end < end) 2428 end = geo->aperture_end; 2429 } 2430 2431 /* Modify aperture limits. 
The new aper is either same or bigger */ 2432 node = list_first_entry(iova_copy, struct vfio_iova, list); 2433 node->start = start; 2434 node = list_last_entry(iova_copy, struct vfio_iova, list); 2435 node->end = end; 2436 } 2437 2438 /* 2439 * Called when a group is detached. The reserved regions for that 2440 * group can be part of valid iova now. But since reserved regions 2441 * may be duplicated among groups, populate the iova valid regions 2442 * list again. 2443 */ 2444 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, 2445 struct list_head *iova_copy) 2446 { 2447 struct vfio_domain *d; 2448 struct vfio_iommu_group *g; 2449 struct vfio_iova *node; 2450 dma_addr_t start, end; 2451 LIST_HEAD(resv_regions); 2452 int ret; 2453 2454 if (list_empty(iova_copy)) 2455 return -EINVAL; 2456 2457 list_for_each_entry(d, &iommu->domain_list, next) { 2458 list_for_each_entry(g, &d->group_list, next) { 2459 ret = iommu_get_group_resv_regions(g->iommu_group, 2460 &resv_regions); 2461 if (ret) 2462 goto done; 2463 } 2464 } 2465 2466 node = list_first_entry(iova_copy, struct vfio_iova, list); 2467 start = node->start; 2468 node = list_last_entry(iova_copy, struct vfio_iova, list); 2469 end = node->end; 2470 2471 /* purge the iova list and create new one */ 2472 vfio_iommu_iova_free(iova_copy); 2473 2474 ret = vfio_iommu_aper_resize(iova_copy, start, end); 2475 if (ret) 2476 goto done; 2477 2478 /* Exclude current reserved regions from iova ranges */ 2479 ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions); 2480 done: 2481 vfio_iommu_resv_free(&resv_regions); 2482 return ret; 2483 } 2484 2485 static void vfio_iommu_type1_detach_group(void *iommu_data, 2486 struct iommu_group *iommu_group) 2487 { 2488 struct vfio_iommu *iommu = iommu_data; 2489 struct vfio_domain *domain; 2490 struct vfio_iommu_group *group; 2491 bool update_dirty_scope = false; 2492 LIST_HEAD(iova_copy); 2493 2494 mutex_lock(&iommu->lock); 2495 list_for_each_entry(group, &iommu->emulated_iommu_groups, next) { 2496 if (group->iommu_group != iommu_group) 2497 continue; 2498 update_dirty_scope = !group->pinned_page_dirty_scope; 2499 list_del(&group->next); 2500 kfree(group); 2501 2502 if (list_empty(&iommu->emulated_iommu_groups) && 2503 list_empty(&iommu->domain_list)) { 2504 WARN_ON(!list_empty(&iommu->device_list)); 2505 vfio_iommu_unmap_unpin_all(iommu); 2506 } 2507 goto detach_group_done; 2508 } 2509 2510 /* 2511 * Get a copy of iova list. This will be used to update 2512 * and to replace the current one later. Please note that 2513 * we will leave the original list as it is if update fails. 2514 */ 2515 vfio_iommu_iova_get_copy(iommu, &iova_copy); 2516 2517 list_for_each_entry(domain, &iommu->domain_list, next) { 2518 group = find_iommu_group(domain, iommu_group); 2519 if (!group) 2520 continue; 2521 2522 iommu_detach_group(domain->domain, group->iommu_group); 2523 update_dirty_scope = !group->pinned_page_dirty_scope; 2524 list_del(&group->next); 2525 kfree(group); 2526 /* 2527 * Group ownership provides privilege, if the group list is 2528 * empty, the domain goes away. If it's the last domain with 2529 * iommu and external domain doesn't exist, then all the 2530 * mappings go away too. 
If it's the last domain with iommu and 2531 * external domain exist, update accounting 2532 */ 2533 if (list_empty(&domain->group_list)) { 2534 if (list_is_singular(&iommu->domain_list)) { 2535 if (list_empty(&iommu->emulated_iommu_groups)) { 2536 WARN_ON(!list_empty( 2537 &iommu->device_list)); 2538 vfio_iommu_unmap_unpin_all(iommu); 2539 } else { 2540 vfio_iommu_unmap_unpin_reaccount(iommu); 2541 } 2542 } 2543 iommu_domain_free(domain->domain); 2544 list_del(&domain->next); 2545 kfree(domain); 2546 vfio_iommu_aper_expand(iommu, &iova_copy); 2547 vfio_update_pgsize_bitmap(iommu); 2548 } 2549 break; 2550 } 2551 2552 if (!vfio_iommu_resv_refresh(iommu, &iova_copy)) 2553 vfio_iommu_iova_insert_copy(iommu, &iova_copy); 2554 else 2555 vfio_iommu_iova_free(&iova_copy); 2556 2557 detach_group_done: 2558 /* 2559 * Removal of a group without dirty tracking may allow the iommu scope 2560 * to be promoted. 2561 */ 2562 if (update_dirty_scope) { 2563 iommu->num_non_pinned_groups--; 2564 if (iommu->dirty_page_tracking) 2565 vfio_iommu_populate_bitmap_full(iommu); 2566 } 2567 mutex_unlock(&iommu->lock); 2568 } 2569 2570 static void *vfio_iommu_type1_open(unsigned long arg) 2571 { 2572 struct vfio_iommu *iommu; 2573 2574 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); 2575 if (!iommu) 2576 return ERR_PTR(-ENOMEM); 2577 2578 switch (arg) { 2579 case VFIO_TYPE1_IOMMU: 2580 break; 2581 case VFIO_TYPE1_NESTING_IOMMU: 2582 iommu->nesting = true; 2583 fallthrough; 2584 case VFIO_TYPE1v2_IOMMU: 2585 iommu->v2 = true; 2586 break; 2587 default: 2588 kfree(iommu); 2589 return ERR_PTR(-EINVAL); 2590 } 2591 2592 INIT_LIST_HEAD(&iommu->domain_list); 2593 INIT_LIST_HEAD(&iommu->iova_list); 2594 iommu->dma_list = RB_ROOT; 2595 iommu->dma_avail = dma_entry_limit; 2596 iommu->container_open = true; 2597 mutex_init(&iommu->lock); 2598 mutex_init(&iommu->device_list_lock); 2599 INIT_LIST_HEAD(&iommu->device_list); 2600 init_waitqueue_head(&iommu->vaddr_wait); 2601 iommu->pgsize_bitmap = PAGE_MASK; 2602 INIT_LIST_HEAD(&iommu->emulated_iommu_groups); 2603 2604 return iommu; 2605 } 2606 2607 static void vfio_release_domain(struct vfio_domain *domain) 2608 { 2609 struct vfio_iommu_group *group, *group_tmp; 2610 2611 list_for_each_entry_safe(group, group_tmp, 2612 &domain->group_list, next) { 2613 iommu_detach_group(domain->domain, group->iommu_group); 2614 list_del(&group->next); 2615 kfree(group); 2616 } 2617 2618 iommu_domain_free(domain->domain); 2619 } 2620 2621 static void vfio_iommu_type1_release(void *iommu_data) 2622 { 2623 struct vfio_iommu *iommu = iommu_data; 2624 struct vfio_domain *domain, *domain_tmp; 2625 struct vfio_iommu_group *group, *next_group; 2626 2627 list_for_each_entry_safe(group, next_group, 2628 &iommu->emulated_iommu_groups, next) { 2629 list_del(&group->next); 2630 kfree(group); 2631 } 2632 2633 vfio_iommu_unmap_unpin_all(iommu); 2634 2635 list_for_each_entry_safe(domain, domain_tmp, 2636 &iommu->domain_list, next) { 2637 vfio_release_domain(domain); 2638 list_del(&domain->next); 2639 kfree(domain); 2640 } 2641 2642 vfio_iommu_iova_free(&iommu->iova_list); 2643 2644 kfree(iommu); 2645 } 2646 2647 static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu) 2648 { 2649 struct vfio_domain *domain; 2650 int ret = 1; 2651 2652 mutex_lock(&iommu->lock); 2653 list_for_each_entry(domain, &iommu->domain_list, next) { 2654 if (!(domain->enforce_cache_coherency)) { 2655 ret = 0; 2656 break; 2657 } 2658 } 2659 mutex_unlock(&iommu->lock); 2660 2661 return ret; 2662 } 2663 2664 static int 
vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, 2665 unsigned long arg) 2666 { 2667 switch (arg) { 2668 case VFIO_TYPE1_IOMMU: 2669 case VFIO_TYPE1v2_IOMMU: 2670 case VFIO_TYPE1_NESTING_IOMMU: 2671 case VFIO_UNMAP_ALL: 2672 case VFIO_UPDATE_VADDR: 2673 return 1; 2674 case VFIO_DMA_CC_IOMMU: 2675 if (!iommu) 2676 return 0; 2677 return vfio_domains_have_enforce_cache_coherency(iommu); 2678 default: 2679 return 0; 2680 } 2681 } 2682 2683 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps, 2684 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas, 2685 size_t size) 2686 { 2687 struct vfio_info_cap_header *header; 2688 struct vfio_iommu_type1_info_cap_iova_range *iova_cap; 2689 2690 header = vfio_info_cap_add(caps, size, 2691 VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1); 2692 if (IS_ERR(header)) 2693 return PTR_ERR(header); 2694 2695 iova_cap = container_of(header, 2696 struct vfio_iommu_type1_info_cap_iova_range, 2697 header); 2698 iova_cap->nr_iovas = cap_iovas->nr_iovas; 2699 memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges, 2700 cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges)); 2701 return 0; 2702 } 2703 2704 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, 2705 struct vfio_info_cap *caps) 2706 { 2707 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas; 2708 struct vfio_iova *iova; 2709 size_t size; 2710 int iovas = 0, i = 0, ret; 2711 2712 list_for_each_entry(iova, &iommu->iova_list, list) 2713 iovas++; 2714 2715 if (!iovas) { 2716 /* 2717 * Return 0 as a container with a single mdev device 2718 * will have an empty list 2719 */ 2720 return 0; 2721 } 2722 2723 size = struct_size(cap_iovas, iova_ranges, iovas); 2724 2725 cap_iovas = kzalloc(size, GFP_KERNEL); 2726 if (!cap_iovas) 2727 return -ENOMEM; 2728 2729 cap_iovas->nr_iovas = iovas; 2730 2731 list_for_each_entry(iova, &iommu->iova_list, list) { 2732 cap_iovas->iova_ranges[i].start = iova->start; 2733 cap_iovas->iova_ranges[i].end = iova->end; 2734 i++; 2735 } 2736 2737 ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size); 2738 2739 kfree(cap_iovas); 2740 return ret; 2741 } 2742 2743 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, 2744 struct vfio_info_cap *caps) 2745 { 2746 struct vfio_iommu_type1_info_cap_migration cap_mig; 2747 2748 cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; 2749 cap_mig.header.version = 1; 2750 2751 cap_mig.flags = 0; 2752 /* support minimum pgsize */ 2753 cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap); 2754 cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX; 2755 2756 return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); 2757 } 2758 2759 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, 2760 struct vfio_info_cap *caps) 2761 { 2762 struct vfio_iommu_type1_info_dma_avail cap_dma_avail; 2763 2764 cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL; 2765 cap_dma_avail.header.version = 1; 2766 2767 cap_dma_avail.avail = iommu->dma_avail; 2768 2769 return vfio_info_add_capability(caps, &cap_dma_avail.header, 2770 sizeof(cap_dma_avail)); 2771 } 2772 2773 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, 2774 unsigned long arg) 2775 { 2776 struct vfio_iommu_type1_info info; 2777 unsigned long minsz; 2778 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 2779 unsigned long capsz; 2780 int ret; 2781 2782 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); 2783 2784 /* For backward compatibility, cannot require this */ 2785 capsz = 
offsetofend(struct vfio_iommu_type1_info, cap_offset); 2786 2787 if (copy_from_user(&info, (void __user *)arg, minsz)) 2788 return -EFAULT; 2789 2790 if (info.argsz < minsz) 2791 return -EINVAL; 2792 2793 if (info.argsz >= capsz) { 2794 minsz = capsz; 2795 info.cap_offset = 0; /* output, no-recopy necessary */ 2796 } 2797 2798 mutex_lock(&iommu->lock); 2799 info.flags = VFIO_IOMMU_INFO_PGSIZES; 2800 2801 info.iova_pgsizes = iommu->pgsize_bitmap; 2802 2803 ret = vfio_iommu_migration_build_caps(iommu, &caps); 2804 2805 if (!ret) 2806 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); 2807 2808 if (!ret) 2809 ret = vfio_iommu_iova_build_caps(iommu, &caps); 2810 2811 mutex_unlock(&iommu->lock); 2812 2813 if (ret) 2814 return ret; 2815 2816 if (caps.size) { 2817 info.flags |= VFIO_IOMMU_INFO_CAPS; 2818 2819 if (info.argsz < sizeof(info) + caps.size) { 2820 info.argsz = sizeof(info) + caps.size; 2821 } else { 2822 vfio_info_cap_shift(&caps, sizeof(info)); 2823 if (copy_to_user((void __user *)arg + 2824 sizeof(info), caps.buf, 2825 caps.size)) { 2826 kfree(caps.buf); 2827 return -EFAULT; 2828 } 2829 info.cap_offset = sizeof(info); 2830 } 2831 2832 kfree(caps.buf); 2833 } 2834 2835 return copy_to_user((void __user *)arg, &info, minsz) ? 2836 -EFAULT : 0; 2837 } 2838 2839 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu, 2840 unsigned long arg) 2841 { 2842 struct vfio_iommu_type1_dma_map map; 2843 unsigned long minsz; 2844 uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE | 2845 VFIO_DMA_MAP_FLAG_VADDR; 2846 2847 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); 2848 2849 if (copy_from_user(&map, (void __user *)arg, minsz)) 2850 return -EFAULT; 2851 2852 if (map.argsz < minsz || map.flags & ~mask) 2853 return -EINVAL; 2854 2855 return vfio_dma_do_map(iommu, &map); 2856 } 2857 2858 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, 2859 unsigned long arg) 2860 { 2861 struct vfio_iommu_type1_dma_unmap unmap; 2862 struct vfio_bitmap bitmap = { 0 }; 2863 uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP | 2864 VFIO_DMA_UNMAP_FLAG_VADDR | 2865 VFIO_DMA_UNMAP_FLAG_ALL; 2866 unsigned long minsz; 2867 int ret; 2868 2869 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); 2870 2871 if (copy_from_user(&unmap, (void __user *)arg, minsz)) 2872 return -EFAULT; 2873 2874 if (unmap.argsz < minsz || unmap.flags & ~mask) 2875 return -EINVAL; 2876 2877 if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && 2878 (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL | 2879 VFIO_DMA_UNMAP_FLAG_VADDR))) 2880 return -EINVAL; 2881 2882 if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { 2883 unsigned long pgshift; 2884 2885 if (unmap.argsz < (minsz + sizeof(bitmap))) 2886 return -EINVAL; 2887 2888 if (copy_from_user(&bitmap, 2889 (void __user *)(arg + minsz), 2890 sizeof(bitmap))) 2891 return -EFAULT; 2892 2893 if (!access_ok((void __user *)bitmap.data, bitmap.size)) 2894 return -EINVAL; 2895 2896 pgshift = __ffs(bitmap.pgsize); 2897 ret = verify_bitmap_size(unmap.size >> pgshift, 2898 bitmap.size); 2899 if (ret) 2900 return ret; 2901 } 2902 2903 ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); 2904 if (ret) 2905 return ret; 2906 2907 return copy_to_user((void __user *)arg, &unmap, minsz) ? 
2908 -EFAULT : 0; 2909 } 2910 2911 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, 2912 unsigned long arg) 2913 { 2914 struct vfio_iommu_type1_dirty_bitmap dirty; 2915 uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | 2916 VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | 2917 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; 2918 unsigned long minsz; 2919 int ret = 0; 2920 2921 if (!iommu->v2) 2922 return -EACCES; 2923 2924 minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags); 2925 2926 if (copy_from_user(&dirty, (void __user *)arg, minsz)) 2927 return -EFAULT; 2928 2929 if (dirty.argsz < minsz || dirty.flags & ~mask) 2930 return -EINVAL; 2931 2932 /* only one flag should be set at a time */ 2933 if (__ffs(dirty.flags) != __fls(dirty.flags)) 2934 return -EINVAL; 2935 2936 if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { 2937 size_t pgsize; 2938 2939 mutex_lock(&iommu->lock); 2940 pgsize = 1 << __ffs(iommu->pgsize_bitmap); 2941 if (!iommu->dirty_page_tracking) { 2942 ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); 2943 if (!ret) 2944 iommu->dirty_page_tracking = true; 2945 } 2946 mutex_unlock(&iommu->lock); 2947 return ret; 2948 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { 2949 mutex_lock(&iommu->lock); 2950 if (iommu->dirty_page_tracking) { 2951 iommu->dirty_page_tracking = false; 2952 vfio_dma_bitmap_free_all(iommu); 2953 } 2954 mutex_unlock(&iommu->lock); 2955 return 0; 2956 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { 2957 struct vfio_iommu_type1_dirty_bitmap_get range; 2958 unsigned long pgshift; 2959 size_t data_size = dirty.argsz - minsz; 2960 size_t iommu_pgsize; 2961 2962 if (!data_size || data_size < sizeof(range)) 2963 return -EINVAL; 2964 2965 if (copy_from_user(&range, (void __user *)(arg + minsz), 2966 sizeof(range))) 2967 return -EFAULT; 2968 2969 if (range.iova + range.size < range.iova) 2970 return -EINVAL; 2971 if (!access_ok((void __user *)range.bitmap.data, 2972 range.bitmap.size)) 2973 return -EINVAL; 2974 2975 pgshift = __ffs(range.bitmap.pgsize); 2976 ret = verify_bitmap_size(range.size >> pgshift, 2977 range.bitmap.size); 2978 if (ret) 2979 return ret; 2980 2981 mutex_lock(&iommu->lock); 2982 2983 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); 2984 2985 /* allow only smallest supported pgsize */ 2986 if (range.bitmap.pgsize != iommu_pgsize) { 2987 ret = -EINVAL; 2988 goto out_unlock; 2989 } 2990 if (range.iova & (iommu_pgsize - 1)) { 2991 ret = -EINVAL; 2992 goto out_unlock; 2993 } 2994 if (!range.size || range.size & (iommu_pgsize - 1)) { 2995 ret = -EINVAL; 2996 goto out_unlock; 2997 } 2998 2999 if (iommu->dirty_page_tracking) 3000 ret = vfio_iova_dirty_bitmap(range.bitmap.data, 3001 iommu, range.iova, 3002 range.size, 3003 range.bitmap.pgsize); 3004 else 3005 ret = -EINVAL; 3006 out_unlock: 3007 mutex_unlock(&iommu->lock); 3008 3009 return ret; 3010 } 3011 3012 return -EINVAL; 3013 } 3014 3015 static long vfio_iommu_type1_ioctl(void *iommu_data, 3016 unsigned int cmd, unsigned long arg) 3017 { 3018 struct vfio_iommu *iommu = iommu_data; 3019 3020 switch (cmd) { 3021 case VFIO_CHECK_EXTENSION: 3022 return vfio_iommu_type1_check_extension(iommu, arg); 3023 case VFIO_IOMMU_GET_INFO: 3024 return vfio_iommu_type1_get_info(iommu, arg); 3025 case VFIO_IOMMU_MAP_DMA: 3026 return vfio_iommu_type1_map_dma(iommu, arg); 3027 case VFIO_IOMMU_UNMAP_DMA: 3028 return vfio_iommu_type1_unmap_dma(iommu, arg); 3029 case VFIO_IOMMU_DIRTY_PAGES: 3030 return vfio_iommu_type1_dirty_pages(iommu, arg); 3031 default: 3032 return 
-ENOTTY; 3033 } 3034 } 3035 3036 static void vfio_iommu_type1_register_device(void *iommu_data, 3037 struct vfio_device *vdev) 3038 { 3039 struct vfio_iommu *iommu = iommu_data; 3040 3041 if (!vdev->ops->dma_unmap) 3042 return; 3043 3044 /* 3045 * list_empty(&iommu->device_list) is tested under the iommu->lock while 3046 * iteration for dma_unmap must be done under the device_list_lock. 3047 * Holding both locks here allows avoiding the device_list_lock in 3048 * several fast paths. See vfio_notify_dma_unmap() 3049 */ 3050 mutex_lock(&iommu->lock); 3051 mutex_lock(&iommu->device_list_lock); 3052 list_add(&vdev->iommu_entry, &iommu->device_list); 3053 mutex_unlock(&iommu->device_list_lock); 3054 mutex_unlock(&iommu->lock); 3055 } 3056 3057 static void vfio_iommu_type1_unregister_device(void *iommu_data, 3058 struct vfio_device *vdev) 3059 { 3060 struct vfio_iommu *iommu = iommu_data; 3061 3062 if (!vdev->ops->dma_unmap) 3063 return; 3064 3065 mutex_lock(&iommu->lock); 3066 mutex_lock(&iommu->device_list_lock); 3067 list_del(&vdev->iommu_entry); 3068 mutex_unlock(&iommu->device_list_lock); 3069 mutex_unlock(&iommu->lock); 3070 } 3071 3072 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, 3073 dma_addr_t user_iova, void *data, 3074 size_t count, bool write, 3075 size_t *copied) 3076 { 3077 struct mm_struct *mm; 3078 unsigned long vaddr; 3079 struct vfio_dma *dma; 3080 bool kthread = current->mm == NULL; 3081 size_t offset; 3082 int ret; 3083 3084 *copied = 0; 3085 3086 ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma); 3087 if (ret < 0) 3088 return ret; 3089 3090 if ((write && !(dma->prot & IOMMU_WRITE)) || 3091 !(dma->prot & IOMMU_READ)) 3092 return -EPERM; 3093 3094 mm = get_task_mm(dma->task); 3095 3096 if (!mm) 3097 return -EPERM; 3098 3099 if (kthread) 3100 kthread_use_mm(mm); 3101 else if (current->mm != mm) 3102 goto out; 3103 3104 offset = user_iova - dma->iova; 3105 3106 if (count > dma->size - offset) 3107 count = dma->size - offset; 3108 3109 vaddr = dma->vaddr + offset; 3110 3111 if (write) { 3112 *copied = copy_to_user((void __user *)vaddr, data, 3113 count) ? 0 : count; 3114 if (*copied && iommu->dirty_page_tracking) { 3115 unsigned long pgshift = __ffs(iommu->pgsize_bitmap); 3116 /* 3117 * Bitmap populated with the smallest supported page 3118 * size 3119 */ 3120 bitmap_set(dma->bitmap, offset >> pgshift, 3121 ((offset + *copied - 1) >> pgshift) - 3122 (offset >> pgshift) + 1); 3123 } 3124 } else 3125 *copied = copy_from_user(data, (void __user *)vaddr, 3126 count) ? 0 : count; 3127 if (kthread) 3128 kthread_unuse_mm(mm); 3129 out: 3130 mmput(mm); 3131 return *copied ? 
0 : -EFAULT; 3132 } 3133 3134 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, 3135 void *data, size_t count, bool write) 3136 { 3137 struct vfio_iommu *iommu = iommu_data; 3138 int ret = 0; 3139 size_t done; 3140 3141 mutex_lock(&iommu->lock); 3142 while (count > 0) { 3143 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data, 3144 count, write, &done); 3145 if (ret) 3146 break; 3147 3148 count -= done; 3149 data += done; 3150 user_iova += done; 3151 } 3152 3153 mutex_unlock(&iommu->lock); 3154 return ret; 3155 } 3156 3157 static struct iommu_domain * 3158 vfio_iommu_type1_group_iommu_domain(void *iommu_data, 3159 struct iommu_group *iommu_group) 3160 { 3161 struct iommu_domain *domain = ERR_PTR(-ENODEV); 3162 struct vfio_iommu *iommu = iommu_data; 3163 struct vfio_domain *d; 3164 3165 if (!iommu || !iommu_group) 3166 return ERR_PTR(-EINVAL); 3167 3168 mutex_lock(&iommu->lock); 3169 list_for_each_entry(d, &iommu->domain_list, next) { 3170 if (find_iommu_group(d, iommu_group)) { 3171 domain = d->domain; 3172 break; 3173 } 3174 } 3175 mutex_unlock(&iommu->lock); 3176 3177 return domain; 3178 } 3179 3180 static void vfio_iommu_type1_notify(void *iommu_data, 3181 enum vfio_iommu_notify_type event) 3182 { 3183 struct vfio_iommu *iommu = iommu_data; 3184 3185 if (event != VFIO_IOMMU_CONTAINER_CLOSE) 3186 return; 3187 mutex_lock(&iommu->lock); 3188 iommu->container_open = false; 3189 mutex_unlock(&iommu->lock); 3190 wake_up_all(&iommu->vaddr_wait); 3191 } 3192 3193 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { 3194 .name = "vfio-iommu-type1", 3195 .owner = THIS_MODULE, 3196 .open = vfio_iommu_type1_open, 3197 .release = vfio_iommu_type1_release, 3198 .ioctl = vfio_iommu_type1_ioctl, 3199 .attach_group = vfio_iommu_type1_attach_group, 3200 .detach_group = vfio_iommu_type1_detach_group, 3201 .pin_pages = vfio_iommu_type1_pin_pages, 3202 .unpin_pages = vfio_iommu_type1_unpin_pages, 3203 .register_device = vfio_iommu_type1_register_device, 3204 .unregister_device = vfio_iommu_type1_unregister_device, 3205 .dma_rw = vfio_iommu_type1_dma_rw, 3206 .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, 3207 .notify = vfio_iommu_type1_notify, 3208 }; 3209 3210 static int __init vfio_iommu_type1_init(void) 3211 { 3212 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); 3213 } 3214 3215 static void __exit vfio_iommu_type1_cleanup(void) 3216 { 3217 vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); 3218 } 3219 3220 module_init(vfio_iommu_type1_init); 3221 module_exit(vfio_iommu_type1_cleanup); 3222 3223 MODULE_VERSION(DRIVER_VERSION); 3224 MODULE_LICENSE("GPL v2"); 3225 MODULE_AUTHOR(DRIVER_AUTHOR); 3226 MODULE_DESCRIPTION(DRIVER_DESC); 3227
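
/*
 * Illustrative userspace usage sketch (not part of the driver): the sequence
 * below shows how a process would typically exercise the type1 ioctls
 * handled above - set up a container, attach a group, query info and
 * map/unmap a buffer.  It assumes <fcntl.h>, <sys/ioctl.h>, <sys/mman.h>,
 * <stdint.h> and <linux/vfio.h>; the group number (26) and the 1 MB buffer
 * size are placeholders.
 *
 *	int container, group;
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
 *	struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap) };
 *	void *buf;
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
 *	    !ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *		return -1;			// no Type1 v2 support
 *
 *	group = open("/dev/vfio/26", O_RDWR);	// hypothetical group number
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 *	ioctl(container, VFIO_IOMMU_GET_INFO, &info);	// pgsizes, cap chain
 *
 *	buf = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 *	map.vaddr = (__u64)(uintptr_t)buf;
 *	map.iova = 0;
 *	map.size = 1024 * 1024;
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);	// pins pages, maps iova
 *
 *	unmap.iova = 0;
 *	unmap.size = 1024 * 1024;
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);	// unpins and unmaps
 */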