// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include "vfio.h"

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

static unsigned int dma_entry_limit __read_mostly = U16_MAX;
module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
MODULE_PARM_DESC(dma_entry_limit,
		 "Maximum number of user DMA mappings per container (65535).");

struct vfio_iommu {
	struct list_head	domain_list;
	struct list_head	iova_list;
	struct mutex		lock;
	struct rb_root		dma_list;
	struct list_head	device_list;
	struct mutex		device_list_lock;
	unsigned int		dma_avail;
	unsigned int		vaddr_invalid_count;
	uint64_t		pgsize_bitmap;
	uint64_t		num_non_pinned_groups;
	bool			v2;
	bool			nesting;
	bool			dirty_page_tracking;
	struct list_head	emulated_iommu_groups;
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	bool			fgsp : 1;	/* Fine-grained super pages */
	bool			enforce_cache_coherency : 1;
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
	bool			iommu_mapped;
	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
	bool			vaddr_invalid;
	struct task_struct	*task;
	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
	unsigned long		*bitmap;
	struct mm_struct	*mm;
	size_t			locked_vm;
};

struct vfio_batch {
	struct page		**pages;	/* for pin_user_pages_remote */
	struct page		*fallback_page;	/* if pages alloc fails */
	int			capacity;	/* length of pages array */
	int			size;		/* of batch currently */
	int			offset;		/* of next entry in pages */
};

struct vfio_iommu_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
	bool			pinned_page_dirty_scope;
};

struct vfio_iova {
	struct list_head	list;
	dma_addr_t		start;
	dma_addr_t		end;
};

/*
 * Guest RAM pinning working set or DMA target
 */
struct vfio_pfn {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		pfn;		/* Host pfn */
	unsigned int		ref_count;
};

struct vfio_regions {
	struct list_head list;
	dma_addr_t iova;
	phys_addr_t phys;
	size_t len;
};

#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

/*
 * Input argument of number of bits to bitmap_set() is unsigned integer, which
 * further casts to signed integer for unaligned multi-bit operation,
 * __bitmap_set().
 * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
 * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
 * system.
 */
#define DIRTY_BITMAP_PAGES_MAX	((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX	DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
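
/*
 * Worked example of the limits above (illustrative only, not used by the
 * code): tracking dirty pages of a 1 GiB mapping with a 4 KiB IOMMU page
 * size needs 1 GiB / 4 KiB = 262144 bits, i.e. DIRTY_BITMAP_BYTES(262144)
 * = 32 KiB.  The DIRTY_BITMAP_PAGES_MAX cap of 2^31 pages therefore bounds
 * a single bitmap at 256 MiB, which covers an 8 TiB mapping at 4 KiB page
 * granularity.
 */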

static int put_pfn(unsigned long pfn, int prot);

static struct vfio_iommu_group*
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
			    struct iommu_group *iommu_group);

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
						dma_addr_t start, u64 size)
{
	struct rb_node *res = NULL;
	struct rb_node *node = iommu->dma_list.rb_node;
	struct vfio_dma *dma_res = NULL;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start < dma->iova + dma->size) {
			res = node;
			dma_res = dma;
			if (start >= dma->iova)
				break;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	if (res && size && dma_res->iova >= start + size)
		res = NULL;
	return res;
}

static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}


static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
{
	uint64_t npages = dma->size / pgsize;

	if (npages > DIRTY_BITMAP_PAGES_MAX)
		return -EINVAL;

	/*
	 * Allocate extra 64 bits that are used to calculate shift required for
	 * bitmap_shift_left() to manipulate and club unaligned number of pages
	 * in adjacent vfio_dma ranges.
	 */
	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
			       GFP_KERNEL);
	if (!dma->bitmap)
		return -ENOMEM;

	return 0;
}

static void vfio_dma_bitmap_free(struct vfio_dma *dma)
{
	kvfree(dma->bitmap);
	dma->bitmap = NULL;
}

static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
{
	struct rb_node *p;
	unsigned long pgshift = __ffs(pgsize);

	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);

		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
	}
}

static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
{
	struct rb_node *n;
	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
	}
}

static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
		int ret;

		ret = vfio_dma_bitmap_alloc(dma, pgsize);
		if (ret) {
			struct rb_node *p;

			/* Unwind: free the bitmaps allocated so far (use p, not n) */
			for (p = rb_prev(n); p; p = rb_prev(p)) {
				struct vfio_dma *dma = rb_entry(p,
							struct vfio_dma, node);

				vfio_dma_bitmap_free(dma);
			}
			return ret;
		}
		vfio_dma_populate_bitmap(dma, pgsize);
	}
	return 0;
}

static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		vfio_dma_bitmap_free(dma);
	}
}

/*
 * Helper Functions for host iova-pfn list
 */
static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
{
	struct vfio_pfn *vpfn;
	struct rb_node *node = dma->pfn_list.rb_node;

	while (node) {
		vpfn = rb_entry(node, struct vfio_pfn, node);

		if (iova < vpfn->iova)
			node = node->rb_left;
		else if (iova > vpfn->iova)
			node = node->rb_right;
		else
			return vpfn;
	}
	return NULL;
}

static void vfio_link_pfn(struct vfio_dma *dma,
			  struct vfio_pfn *new)
{
	struct rb_node **link, *parent = NULL;
	struct vfio_pfn *vpfn;

	link = &dma->pfn_list.rb_node;
	while (*link) {
		parent = *link;
		vpfn = rb_entry(parent, struct vfio_pfn, node);

		if (new->iova < vpfn->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &dma->pfn_list);
}

static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
{
	rb_erase(&old->node, &dma->pfn_list);
}

static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
				unsigned long pfn)
{
	struct vfio_pfn *vpfn;

	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
	if (!vpfn)
		return -ENOMEM;

	vpfn->iova = iova;
	vpfn->pfn = pfn;
	vpfn->ref_count = 1;
	vfio_link_pfn(dma, vpfn);
	return 0;
}

static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
				      struct vfio_pfn *vpfn)
{
	vfio_unlink_pfn(dma, vpfn);
	kfree(vpfn);
}

static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
					       unsigned long iova)
{
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (vpfn)
		vpfn->ref_count++;
	return vpfn;
}

static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
{
	int ret = 0;

	vpfn->ref_count--;
	if (!vpfn->ref_count) {
		ret = put_pfn(vpfn->pfn, dma->prot);
		vfio_remove_from_pfn_list(dma, vpfn);
	}
	return ret;
}

static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
			bool lock_cap, long npage)
{
	int ret = mmap_write_lock_killable(mm);

	if (ret)
		return ret;

	ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
	mmap_write_unlock(mm);
	return ret;
}

static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
	struct mm_struct *mm;
	int ret;

	if (!npage)
		return 0;

	mm = dma->mm;
	if (async && !mmget_not_zero(mm))
		return -ESRCH; /* process exited */

	ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
	if (!ret)
		dma->locked_vm += npage;

	if (async)
		mmput(mm);

	return ret;
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 * For compound pages, any driver that sets the reserved bit in head
 * page needs to set the reserved bit in all subpages to be safe.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn));

	return true;
}

static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);

		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
		return 1;
	}
	return 0;
}

#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))

static void vfio_batch_init(struct vfio_batch *batch)
{
	batch->size = 0;
	batch->offset = 0;

	if (unlikely(disable_hugepages))
		goto fallback;

	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
	if (!batch->pages)
		goto fallback;

	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
	return;

fallback:
	batch->pages = &batch->fallback_page;
	batch->capacity = 1;
}

static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
{
	while (batch->size) {
		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);

		put_pfn(pfn, dma->prot);
		batch->offset++;
		batch->size--;
	}
}

static void vfio_batch_fini(struct vfio_batch *batch)
{
	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
		free_page((unsigned long)batch->pages);
}
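
/*
 * Illustrative sketch (not called anywhere): the batch above is meant to be
 * used in a simple init/refill/fini cycle, mirroring vfio_pin_map_dma()
 * further down in this file:
 *
 *	struct vfio_batch batch;
 *
 *	vfio_batch_init(&batch);
 *	while (<pages left to pin>)
 *		vfio_pin_pages_remote(dma, vaddr, npage, &pfn, limit, &batch);
 *	vfio_batch_fini(&batch);
 *
 * On an error after pages were pinned into the batch, vfio_batch_unpin()
 * drops the remaining page references before vfio_batch_fini() releases the
 * backing page array.
 */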

static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
			    unsigned long vaddr, unsigned long *pfn,
			    bool write_fault)
{
	pte_t *ptep;
	pte_t pte;
	spinlock_t *ptl;
	int ret;

	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
	if (ret) {
		bool unlocked = false;

		ret = fixup_user_fault(mm, vaddr,
				       FAULT_FLAG_REMOTE |
				       (write_fault ? FAULT_FLAG_WRITE : 0),
				       &unlocked);
		if (unlocked)
			return -EAGAIN;

		if (ret)
			return ret;

		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
		if (ret)
			return ret;
	}

	pte = ptep_get(ptep);

	if (write_fault && !pte_write(pte))
		ret = -EFAULT;
	else
		*pfn = pte_pfn(pte);

	pte_unmap_unlock(ptep, ptl);
	return ret;
}

/*
 * Returns the positive number of pfns successfully obtained or a negative
 * error code.
 */
static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
			  long npages, int prot, unsigned long *pfn,
			  struct page **pages)
{
	struct vm_area_struct *vma;
	unsigned int flags = 0;
	int ret;

	if (prot & IOMMU_WRITE)
		flags |= FOLL_WRITE;

	mmap_read_lock(mm);
	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
				    pages, NULL);
	if (ret > 0) {
		int i;

		/*
		 * The zero page is always resident, we don't need to pin it
		 * and it falls into our invalid/reserved test so we don't
		 * unpin in put_pfn().  Unpin all zero pages in the batch here.
		 */
		for (i = 0 ; i < ret; i++) {
			if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
				unpin_user_page(pages[i]);
		}

		*pfn = page_to_pfn(pages[0]);
		goto done;
	}

	vaddr = untagged_addr_remote(mm, vaddr);

retry:
	vma = vma_lookup(mm, vaddr);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
		if (ret == -EAGAIN)
			goto retry;

		if (!ret) {
			if (is_invalid_reserved_pfn(*pfn))
				ret = 1;
			else
				ret = -EFAULT;
		}
	}
done:
	mmap_read_unlock(mm);
	return ret;
}

/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
				  long npage, unsigned long *pfn_base,
				  unsigned long limit, struct vfio_batch *batch)
{
	unsigned long pfn;
	struct mm_struct *mm = current->mm;
	long ret, pinned = 0, lock_acct = 0;
	bool rsvd;
	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

	/* This code path is only user initiated */
	if (!mm)
		return -ENODEV;

	if (batch->size) {
		/* Leftover pages in batch from an earlier call. */
		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
		pfn = *pfn_base;
		rsvd = is_invalid_reserved_pfn(*pfn_base);
	} else {
		*pfn_base = 0;
	}

	while (npage) {
		if (!batch->size) {
			/* Empty batch, so refill it. */
			long req_pages = min_t(long, npage, batch->capacity);

			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
					     &pfn, batch->pages);
			if (ret < 0)
				goto unpin_out;

			batch->size = ret;
			batch->offset = 0;

			if (!*pfn_base) {
				*pfn_base = pfn;
				rsvd = is_invalid_reserved_pfn(*pfn_base);
			}
		}

		/*
		 * pfn is preset for the first iteration of this inner loop and
		 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
		 * batch->pages isn't valid (there's no struct page), so allow
		 * batch->pages to be touched only when there's more than one
		 * pfn to check, which guarantees the pfns are from a
		 * !VM_PFNMAP vma.
		 */
		while (true) {
			if (pfn != *pfn_base + pinned ||
			    rsvd != is_invalid_reserved_pfn(pfn))
				goto out;

			/*
			 * Reserved pages aren't counted against the user,
			 * externally pinned pages are already counted against
			 * the user.
			 */
			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
				if (!dma->lock_cap &&
				    mm->locked_vm + lock_acct + 1 > limit) {
					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
						__func__, limit << PAGE_SHIFT);
					ret = -ENOMEM;
					goto unpin_out;
				}
				lock_acct++;
			}

			pinned++;
			npage--;
			vaddr += PAGE_SIZE;
			iova += PAGE_SIZE;
			batch->offset++;
			batch->size--;

			if (!batch->size)
				break;

			pfn = page_to_pfn(batch->pages[batch->offset]);
		}

		if (unlikely(disable_hugepages))
			break;
	}

out:
	ret = vfio_lock_acct(dma, lock_acct, false);

unpin_out:
	if (batch->size == 1 && !batch->offset) {
		/* May be a VM_PFNMAP pfn, which the batch can't remember. */
		put_pfn(pfn, dma->prot);
		batch->size = 0;
	}

	if (ret < 0) {
		if (pinned && !rsvd) {
			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
				put_pfn(pfn, dma->prot);
		}
		vfio_batch_unpin(batch, dma);

		return ret;
	}

	return pinned;
}

static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
				    unsigned long pfn, long npage,
				    bool do_accounting)
{
	long unlocked = 0, locked = 0;
	long i;

	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
		if (put_pfn(pfn++, dma->prot)) {
			unlocked++;
			if (vfio_find_vpfn(dma, iova))
				locked++;
		}
	}

	if (do_accounting)
		vfio_lock_acct(dma, locked - unlocked, true);

	return unlocked;
}

static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
				  unsigned long *pfn_base, bool do_accounting)
{
	struct page *pages[1];
	struct mm_struct *mm;
	int ret;

	mm = dma->mm;
	if (!mmget_not_zero(mm))
		return -ENODEV;

	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
	if (ret != 1)
		goto out;

	ret = 0;

	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
		ret = vfio_lock_acct(dma, 1, false);
		if (ret) {
			put_pfn(*pfn_base, dma->prot);
			if (ret == -ENOMEM)
				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
					"(%ld) exceeded\n", __func__,
					dma->task->comm, task_pid_nr(dma->task),
					task_rlimit(dma->task, RLIMIT_MEMLOCK));
		}
	}

out:
	mmput(mm);
	return ret;
}

static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
				    bool do_accounting)
{
	int unlocked;
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (!vpfn)
		return 0;

	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);

	if (do_accounting)
		vfio_lock_acct(dma, -unlocked, true);

	return unlocked;
}

static int vfio_iommu_type1_pin_pages(void *iommu_data,
				      struct iommu_group *iommu_group,
				      dma_addr_t user_iova,
				      int npage, int prot,
				      struct page **pages)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_iommu_group *group;
	int i, j, ret;
	unsigned long remote_vaddr;
	struct vfio_dma *dma;
	bool do_accounting;

	if (!iommu || !pages)
		return -EINVAL;

	/* Supported for v2 version only */
	if (!iommu->v2)
		return -EACCES;

	mutex_lock(&iommu->lock);

	if (WARN_ONCE(iommu->vaddr_invalid_count,
		      "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
		ret = -EBUSY;
		goto pin_done;
	}

	/* Fail if no dma_unmap handler is registered */
	if (list_empty(&iommu->device_list)) {
		ret = -EINVAL;
		goto pin_done;
	}

	/*
	 * If an iommu capable domain exists in the container then all pages are
	 * already pinned and accounted.  Accounting should be done if there is no
	 * iommu capable domain in the container.
	 */
	do_accounting = list_empty(&iommu->domain_list);

	for (i = 0; i < npage; i++) {
		unsigned long phys_pfn;
		dma_addr_t iova;
		struct vfio_pfn *vpfn;

		iova = user_iova + PAGE_SIZE * i;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma) {
			ret = -EINVAL;
			goto pin_unwind;
		}

		if ((dma->prot & prot) != prot) {
			ret = -EPERM;
			goto pin_unwind;
		}

		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
		if (vpfn) {
			pages[i] = pfn_to_page(vpfn->pfn);
			continue;
		}

		remote_vaddr = dma->vaddr + (iova - dma->iova);
		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
					     do_accounting);
		if (ret)
			goto pin_unwind;

		if (!pfn_valid(phys_pfn)) {
			ret = -EINVAL;
			goto pin_unwind;
		}

		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
		if (ret) {
			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
				vfio_lock_acct(dma, -1, true);
			goto pin_unwind;
		}

		pages[i] = pfn_to_page(phys_pfn);

		if (iommu->dirty_page_tracking) {
			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

			/*
			 * Bitmap populated with the smallest supported page
			 * size
			 */
			bitmap_set(dma->bitmap,
				   (iova - dma->iova) >> pgshift, 1);
		}
	}
	ret = i;

	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
	if (!group->pinned_page_dirty_scope) {
		group->pinned_page_dirty_scope = true;
		iommu->num_non_pinned_groups--;
	}

	goto pin_done;

pin_unwind:
	pages[i] = NULL;
	for (j = 0; j < i; j++) {
		dma_addr_t iova;

		iova = user_iova + PAGE_SIZE * j;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		vfio_unpin_page_external(dma, iova, do_accounting);
		pages[j] = NULL;
	}
pin_done:
	mutex_unlock(&iommu->lock);
	return ret;
}

static void vfio_iommu_type1_unpin_pages(void *iommu_data,
					 dma_addr_t user_iova, int npage)
{
	struct vfio_iommu *iommu = iommu_data;
	bool do_accounting;
	int i;

	/* Supported for v2 version only */
	if (WARN_ON(!iommu->v2))
		return;

	mutex_lock(&iommu->lock);

	do_accounting = list_empty(&iommu->domain_list);
	for (i = 0; i < npage; i++) {
		dma_addr_t iova = user_iova + PAGE_SIZE * i;
		struct vfio_dma *dma;

		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma)
			break;

		vfio_unpin_page_external(dma, iova, do_accounting);
	}

	mutex_unlock(&iommu->lock);

	WARN_ON(i != npage);
}
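
/*
 * Illustrative sketch (assumed usage, not part of this file): the two
 * callbacks above back the vfio_pin_pages()/vfio_unpin_pages() kernel API
 * used by emulated IOMMU (e.g. mdev) drivers.  A driver translating a
 * guest DMA target would do roughly:
 *
 *	struct page *page;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&vdev->vdev, iova, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	... access the page ...
 *	vfio_unpin_pages(&vdev->vdev, iova, 1);
 *
 * where vdev is the driver's vfio_device wrapper (hypothetical name).
 */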

static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
			    struct list_head *regions,
			    struct iommu_iotlb_gather *iotlb_gather)
{
	long unlocked = 0;
	struct vfio_regions *entry, *next;

	iommu_iotlb_sync(domain->domain, iotlb_gather);

	list_for_each_entry_safe(entry, next, regions, list) {
		unlocked += vfio_unpin_pages_remote(dma,
						    entry->iova,
						    entry->phys >> PAGE_SHIFT,
						    entry->len >> PAGE_SHIFT,
						    false);
		list_del(&entry->list);
		kfree(entry);
	}

	cond_resched();

	return unlocked;
}

/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
 * track of these regions (currently using a list).
 *
 * This value specifies maximum number of regions for each IOTLB flush sync.
 */
#define VFIO_IOMMU_TLB_SYNC_MAX		512

static size_t unmap_unpin_fast(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys, long *unlocked,
			       struct list_head *unmapped_list,
			       int *unmapped_cnt,
			       struct iommu_iotlb_gather *iotlb_gather)
{
	size_t unmapped = 0;
	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

	if (entry) {
		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
					    iotlb_gather);

		if (!unmapped) {
			kfree(entry);
		} else {
			entry->iova = *iova;
			entry->phys = phys;
			entry->len = unmapped;
			list_add_tail(&entry->list, unmapped_list);

			*iova += unmapped;
			(*unmapped_cnt)++;
		}
	}

	/*
	 * Sync if the number of fast-unmap regions hits the limit
	 * or in case of errors.
	 */
	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
					     iotlb_gather);
		*unmapped_cnt = 0;
	}

	return unmapped;
}

static size_t unmap_unpin_slow(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys,
			       long *unlocked)
{
	size_t unmapped = iommu_unmap(domain->domain, *iova, len);

	if (unmapped) {
		*unlocked += vfio_unpin_pages_remote(dma, *iova,
						     phys >> PAGE_SHIFT,
						     unmapped >> PAGE_SHIFT,
						     false);
		*iova += unmapped;
		cond_resched();
	}
	return unmapped;
}

static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
			     bool do_accounting)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	LIST_HEAD(unmapped_region_list);
	struct iommu_iotlb_gather iotlb_gather;
	int unmapped_region_cnt = 0;
	long unlocked = 0;

	if (!dma->size)
		return 0;

	if (list_empty(&iommu->domain_list))
		return 0;

	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system.  Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin.  The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, dma->iova, dma->size);
		cond_resched();
	}

	iommu_iotlb_gather_init(&iotlb_gather);
	while (iova < end) {
		size_t unmapped, len;
		phys_addr_t phys, next;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		/*
		 * To optimize for fewer iommu_unmap() calls, each of which
		 * may require hardware cache flushing, try to find the
		 * largest contiguous physical memory chunk to unmap.
		 */
		for (len = PAGE_SIZE;
		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
			next = iommu_iova_to_phys(domain->domain, iova + len);
			if (next != phys + len)
				break;
		}

		/*
		 * First, try to use fast unmap/unpin. In case of failure,
		 * switch to slow unmap/unpin path.
		 */
		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
					    &unlocked, &unmapped_region_list,
					    &unmapped_region_cnt,
					    &iotlb_gather);
		if (!unmapped) {
			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
						    phys, &unlocked);
			if (WARN_ON(!unmapped))
				break;
		}
	}

	dma->iommu_mapped = false;

	if (unmapped_region_cnt) {
		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
					    &iotlb_gather);
	}

	if (do_accounting) {
		vfio_lock_acct(dma, -unlocked, true);
		return 0;
	}
	return unlocked;
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
	vfio_unmap_unpin(iommu, dma, true);
	vfio_unlink_dma(iommu, dma);
	put_task_struct(dma->task);
	mmdrop(dma->mm);
	vfio_dma_bitmap_free(dma);
	if (dma->vaddr_invalid)
		iommu->vaddr_invalid_count--;
	kfree(dma);
	iommu->dma_avail++;
}

static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;

	iommu->pgsize_bitmap = ULONG_MAX;

	list_for_each_entry(domain, &iommu->domain_list, next)
		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;

	/*
	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
	 * That way the user will be able to map/unmap buffers whose size/
	 * start address is aligned with PAGE_SIZE. Pinning code uses that
	 * granularity while iommu driver can use the sub-PAGE_SIZE size
	 * to map the buffer.
	 */
	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
		iommu->pgsize_bitmap &= PAGE_MASK;
		iommu->pgsize_bitmap |= PAGE_SIZE;
	}
}

static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
			      struct vfio_dma *dma, dma_addr_t base_iova,
			      size_t pgsize)
{
	unsigned long pgshift = __ffs(pgsize);
	unsigned long nbits = dma->size >> pgshift;
	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
	unsigned long shift = bit_offset % BITS_PER_LONG;
	unsigned long leftover;

	/*
	 * mark all pages dirty if any IOMMU capable device is not able
	 * to report dirty pages and all pages are pinned and mapped.
	 */
	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
		bitmap_set(dma->bitmap, 0, nbits);

	if (shift) {
		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
				  nbits + shift);

		if (copy_from_user(&leftover,
				   (void __user *)(bitmap + copy_offset),
				   sizeof(leftover)))
			return -EFAULT;

		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
	}

	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
			 DIRTY_BITMAP_BYTES(nbits + shift)))
		return -EFAULT;

	return 0;
}

static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
				  dma_addr_t iova, size_t size, size_t pgsize)
{
	struct vfio_dma *dma;
	struct rb_node *n;
	unsigned long pgshift = __ffs(pgsize);
	int ret;

	/*
	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
	 * vfio_dma mappings may be clubbed by specifying large ranges, but
	 * there must not be any previous mappings bisected by the range.
	 * An error will be returned if these conditions are not met.
	 */
	dma = vfio_find_dma(iommu, iova, 1);
	if (dma && dma->iova != iova)
		return -EINVAL;

	dma = vfio_find_dma(iommu, iova + size - 1, 0);
	if (dma && dma->iova + dma->size != iova + size)
		return -EINVAL;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		if (dma->iova < iova)
			continue;

		if (dma->iova > iova + size - 1)
			break;

		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
		if (ret)
			return ret;

		/*
		 * Re-populate bitmap to include all pinned pages which are
		 * considered as dirty but exclude pages which are unpinned and
		 * pages which are marked dirty by vfio_dma_rw()
		 */
		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
		vfio_dma_populate_bitmap(dma, pgsize);
	}
	return 0;
}
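
/*
 * Illustrative userspace sketch (assumed flow, not taken from this file):
 * the bitmap computed above is reported through the VFIO_IOMMU_DIRTY_PAGES
 * ioctl on the container fd.  Tracking is first started with
 * VFIO_IOMMU_DIRTY_PAGES_FLAG_START, then a range is queried roughly as:
 *
 *	struct vfio_iommu_type1_dirty_bitmap *dbitmap;
 *	struct vfio_iommu_type1_dirty_bitmap_get *range;
 *
 *	dbitmap = malloc(sizeof(*dbitmap) + sizeof(*range));
 *	range = (struct vfio_iommu_type1_dirty_bitmap_get *)dbitmap->data;
 *	dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
 *	dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
 *	range->iova = iova;			   // must cover whole mappings
 *	range->size = size;
 *	range->bitmap.pgsize = pgsize;		   // minimum supported page size
 *	range->bitmap.size = bitmap_bytes;	   // one bit per pgsize page
 *	range->bitmap.data = (__u64 *)user_buffer; // hypothetical buffer
 *	ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
 *
 * The iova/size pair must exactly cover existing vfio_dma ranges, as
 * enforced by vfio_iova_dirty_bitmap() above; container_fd, user_buffer
 * and bitmap_bytes are hypothetical names.
 */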

static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
{
	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
		return -EINVAL;

	return 0;
}

/*
 * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
 * and unmap iovas within the range we're about to unmap.  Drivers MUST unpin
 * pages in response to an invalidation.
 */
static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
				  struct vfio_dma *dma)
{
	struct vfio_device *device;

	if (list_empty(&iommu->device_list))
		return;

	/*
	 * The device is expected to call vfio_unpin_pages() for any IOVA it has
	 * pinned within the range.  Since vfio_unpin_pages() will eventually
	 * call back down to this code and try to obtain the iommu->lock we must
	 * drop it.
	 */
	mutex_lock(&iommu->device_list_lock);
	mutex_unlock(&iommu->lock);

	list_for_each_entry(device, &iommu->device_list, iommu_entry)
		device->ops->dma_unmap(device, dma->iova, dma->size);

	mutex_unlock(&iommu->device_list_lock);
	mutex_lock(&iommu->lock);
}

static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap,
			     struct vfio_bitmap *bitmap)
{
	struct vfio_dma *dma, *dma_last = NULL;
	size_t unmapped = 0, pgsize;
	int ret = -EINVAL, retries = 0;
	unsigned long pgshift;
	dma_addr_t iova = unmap->iova;
	u64 size = unmap->size;
	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
	struct rb_node *n, *first_n;

	mutex_lock(&iommu->lock);

	/* Cannot update vaddr if mdev is present. */
	if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
		ret = -EBUSY;
		goto unlock;
	}

	pgshift = __ffs(iommu->pgsize_bitmap);
	pgsize = (size_t)1 << pgshift;

	if (iova & (pgsize - 1))
		goto unlock;

	if (unmap_all) {
		if (iova || size)
			goto unlock;
		size = U64_MAX;
	} else if (!size || size & (pgsize - 1) ||
		   iova + size - 1 < iova || size > SIZE_MAX) {
		goto unlock;
	}

	/* When dirty tracking is enabled, allow only min supported pgsize */
	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
		goto unlock;
	}

	WARN_ON((pgsize - 1) & PAGE_MASK);
again:
	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings.  This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range.  Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked.  We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings.  This
	 * resulted in a couple unusual behaviors.  First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap.  Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap.  And an unmap request covering
	 * the first iova of mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings.  Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range.  An error
	 * will be returned if these conditions are not met.  The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
	if (iommu->v2 && !unmap_all) {
		dma = vfio_find_dma(iommu, iova, 1);
		if (dma && dma->iova != iova)
			goto unlock;

		dma = vfio_find_dma(iommu, iova + size - 1, 0);
		if (dma && dma->iova + dma->size != iova + size)
			goto unlock;
	}

	ret = 0;
	n = first_n = vfio_find_dma_first_node(iommu, iova, size);

	while (n) {
		dma = rb_entry(n, struct vfio_dma, node);
		if (dma->iova >= iova + size)
			break;

		if (!iommu->v2 && iova > dma->iova)
			break;

		if (invalidate_vaddr) {
			if (dma->vaddr_invalid) {
				struct rb_node *last_n = n;

				for (n = first_n; n != last_n; n = rb_next(n)) {
					dma = rb_entry(n,
						       struct vfio_dma, node);
					dma->vaddr_invalid = false;
					iommu->vaddr_invalid_count--;
				}
				ret = -EINVAL;
				unmapped = 0;
				break;
			}
			dma->vaddr_invalid = true;
			iommu->vaddr_invalid_count++;
			unmapped += dma->size;
			n = rb_next(n);
			continue;
		}

		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
			if (dma_last == dma) {
				BUG_ON(++retries > 10);
			} else {
				dma_last = dma;
				retries = 0;
			}

			vfio_notify_dma_unmap(iommu, dma);
			goto again;
		}

		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
			ret = update_user_bitmap(bitmap->data, iommu, dma,
						 iova, pgsize);
			if (ret)
				break;
		}

		unmapped += dma->size;
		n = rb_next(n);
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}
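
/*
 * Illustrative userspace sketch (assumed usage, not taken from this file):
 * VFIO_IOMMU_UNMAP_DMA takes a struct vfio_iommu_type1_dma_unmap; with
 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP set, a struct vfio_bitmap follows in
 * the payload and receives the dirty state of the range being unmapped:
 *
 *	struct vfio_iommu_type1_dma_unmap *unmap;
 *	struct vfio_bitmap *bitmap;
 *
 *	unmap = malloc(sizeof(*unmap) + sizeof(*bitmap));
 *	bitmap = (struct vfio_bitmap *)&unmap->data;
 *	unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
 *	unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
 *	unmap->iova = iova;		// must match a mapping start
 *	unmap->size = size;		// must cover whole mappings
 *	bitmap->pgsize = pgsize;	// minimum supported page size
 *	bitmap->size = bitmap_bytes;	// one bit per pgsize page
 *	bitmap->data = user_buffer;	// hypothetical buffer
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
 *
 * On success, unmap->size is updated to the number of bytes actually
 * unmapped, per the v2 semantics described above; container_fd, user_buffer
 * and bitmap_bytes are hypothetical names.
 */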

static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | IOMMU_CACHE,
				GFP_KERNEL);
		if (ret)
			goto unwind;

		cond_resched();
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
		cond_resched();
	}

	return ret;
}

static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
			    size_t map_size)
{
	dma_addr_t iova = dma->iova;
	unsigned long vaddr = dma->vaddr;
	struct vfio_batch batch;
	size_t size = map_size;
	long npage;
	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret = 0;

	vfio_batch_init(&batch);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
					      size >> PAGE_SHIFT, &pfn, limit,
					      &batch);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
				     dma->prot);
		if (ret) {
			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
						npage, true);
			vfio_batch_unpin(&batch, dma);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	vfio_batch_fini(&batch);
	dma->iommu_mapped = true;

	if (ret)
		vfio_remove_dma(iommu, dma);

	return ret;
}

/*
 * Check dma map request is within a valid iova range
 */
static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
				      dma_addr_t start, dma_addr_t end)
{
	struct list_head *iova = &iommu->iova_list;
	struct vfio_iova *node;

	list_for_each_entry(node, iova, list) {
		if (start >= node->start && end <= node->end)
			return true;
	}

	/*
	 * Check for list_empty() as well since a container with
	 * a single mdev device will have an empty list.
	 */
	return list_empty(iova);
}

static int vfio_change_dma_owner(struct vfio_dma *dma)
{
	struct task_struct *task = current->group_leader;
	struct mm_struct *mm = current->mm;
	long npage = dma->locked_vm;
	bool lock_cap;
	int ret;

	if (mm == dma->mm)
		return 0;

	lock_cap = capable(CAP_IPC_LOCK);
	ret = mm_lock_acct(task, mm, lock_cap, npage);
	if (ret)
		return ret;

	if (mmget_not_zero(dma->mm)) {
		mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
		mmput(dma->mm);
	}

	if (dma->task != task) {
		put_task_struct(dma->task);
		dma->task = get_task_struct(task);
	}
	mmdrop(dma->mm);
	dma->mm = mm;
	mmgrab(dma->mm);
	dma->lock_cap = lock_cap;
	return 0;
}

static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	int ret = 0, prot = 0;
	size_t pgsize;
	struct vfio_dma *dma;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if ((prot && set_vaddr) || (!prot && !set_vaddr))
		return -EINVAL;

	mutex_lock(&iommu->lock);

	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

	WARN_ON((pgsize - 1) & PAGE_MASK);

	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
		ret = -EINVAL;
		goto out_unlock;
	}

	dma = vfio_find_dma(iommu, iova, size);
	if (set_vaddr) {
		if (!dma) {
			ret = -ENOENT;
		} else if (!dma->vaddr_invalid || dma->iova != iova ||
			   dma->size != size) {
			ret = -EINVAL;
		} else {
			ret = vfio_change_dma_owner(dma);
			if (ret)
				goto out_unlock;
			dma->vaddr = vaddr;
			dma->vaddr_invalid = false;
			iommu->vaddr_invalid_count--;
		}
		goto out_unlock;
	} else if (dma) {
		ret = -EEXIST;
		goto out_unlock;
	}

	if (!iommu->dma_avail) {
		ret = -ENOSPC;
		goto out_unlock;
	}

	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	iommu->dma_avail--;
	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/*
	 * We need to be able to both add to a task's locked memory and test
	 * against the locked memory limit and we need to be able to do both
	 * outside of this call path as pinning can be asynchronous via the
	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
	 * task_struct.  Save the group_leader so that all DMA tracking uses
	 * the same task, to make debugging easier.  VM locked pages requires
	 * an mm_struct, so grab the mm in case the task dies.
	 */
	get_task_struct(current->group_leader);
	dma->task = current->group_leader;
	dma->lock_cap = capable(CAP_IPC_LOCK);
	dma->mm = current->mm;
	mmgrab(dma->mm);

	dma->pfn_list = RB_ROOT;

	/* Insert zero-sized and grow as we map chunks of it */
	vfio_link_dma(iommu, dma);

	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
	if (list_empty(&iommu->domain_list))
		dma->size = size;
	else
		ret = vfio_pin_map_dma(iommu, dma, size);

	if (!ret && iommu->dirty_page_tracking) {
		ret = vfio_dma_bitmap_alloc(dma, pgsize);
		if (ret)
			vfio_remove_dma(iommu, dma);
	}

out_unlock:
	mutex_unlock(&iommu->lock);
	return ret;
}
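
/*
 * Illustrative userspace sketch (assumed usage, not taken from this file):
 * a mapping handled by vfio_dma_do_map() above is requested with the
 * VFIO_IOMMU_MAP_DMA ioctl on the container fd, e.g.:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,	// page-aligned user buffer
 *		.iova = 0,			// device address to use
 *		.size = len,			// multiple of the IOMMU page size
 *	};
 *
 *	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
 *		...;
 *
 * container_fd, buf and len are hypothetical names; the iova/size pair must
 * fall within a valid IOVA range and must not overlap an existing mapping.
 */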

static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_batch batch;
	struct vfio_domain *d = NULL;
	struct rb_node *n;
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	if (!list_empty(&iommu->domain_list))
		d = list_first_entry(&iommu->domain_list,
				     struct vfio_domain, next);

	vfio_batch_init(&batch);

	n = rb_first(&iommu->dma_list);

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys;
			size_t size;

			if (dma->iommu_mapped) {
				phys_addr_t p;
				dma_addr_t i;

				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
					ret = -EINVAL;
					goto unwind;
				}

				phys = iommu_iova_to_phys(d->domain, iova);

				if (WARN_ON(!phys)) {
					iova += PAGE_SIZE;
					continue;
				}

				size = PAGE_SIZE;
				p = phys + size;
				i = iova + size;
				while (i < dma->iova + dma->size &&
				       p == iommu_iova_to_phys(d->domain, i)) {
					size += PAGE_SIZE;
					p += PAGE_SIZE;
					i += PAGE_SIZE;
				}
			} else {
				unsigned long pfn;
				unsigned long vaddr = dma->vaddr +
						      (iova - dma->iova);
				size_t n = dma->iova + dma->size - iova;
				long npage;

				npage = vfio_pin_pages_remote(dma, vaddr,
							      n >> PAGE_SHIFT,
							      &pfn, limit,
							      &batch);
				if (npage <= 0) {
					WARN_ON(!npage);
					ret = (int)npage;
					goto unwind;
				}

				phys = pfn << PAGE_SHIFT;
				size = npage << PAGE_SHIFT;
			}

			ret = iommu_map(domain->domain, iova, phys, size,
					dma->prot | IOMMU_CACHE, GFP_KERNEL);
			if (ret) {
				if (!dma->iommu_mapped) {
					vfio_unpin_pages_remote(dma, iova,
							phys >> PAGE_SHIFT,
							size >> PAGE_SHIFT,
							true);
					vfio_batch_unpin(&batch, dma);
				}
				goto unwind;
			}

			iova += size;
		}
	}

	/* All dmas are now mapped, defer to second tree walk for unwind */
	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		dma->iommu_mapped = true;
	}

	vfio_batch_fini(&batch);
	return 0;

unwind:
	for (; n; n = rb_prev(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
		dma_addr_t iova;

		if (dma->iommu_mapped) {
			iommu_unmap(domain->domain, dma->iova, dma->size);
			continue;
		}

		iova = dma->iova;
		while (iova < dma->iova + dma->size) {
			phys_addr_t phys, p;
			size_t size;
			dma_addr_t i;

			phys = iommu_iova_to_phys(domain->domain, iova);
			if (!phys) {
				iova += PAGE_SIZE;
				continue;
			}

			size = PAGE_SIZE;
			p = phys + size;
			i = iova + size;
			while (i < dma->iova + dma->size &&
			       p == iommu_iova_to_phys(domain->domain, i)) {
				size += PAGE_SIZE;
				p += PAGE_SIZE;
				i += PAGE_SIZE;
			}

			iommu_unmap(domain->domain, iova, size);
			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
						size >> PAGE_SHIFT, true);
		}
	}

	vfio_batch_fini(&batch);
	return ret;
}

/*
 * We change our unmap behavior slightly depending on whether the IOMMU
 * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
 * for practically any contiguous power-of-two mapping we give it.  This means
 * we don't need to look for contiguous chunks ourselves to make unmapping
 * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
 * hugetlbfs is in use.
 */
static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
{
	int ret, order = get_order(PAGE_SIZE * 2);
	struct vfio_iova *region;
	struct page *pages;
	dma_addr_t start;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
	if (!pages)
		return;

	list_for_each_entry(region, regions, list) {
		start = ALIGN(region->start, PAGE_SIZE * 2);
		if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
			continue;

		ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
				IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, GFP_KERNEL);
		if (!ret) {
			size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);

			if (unmapped == PAGE_SIZE)
				iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
			else
				domain->fgsp = true;
		}
		break;
	}

	__free_pages(pages, order);
}

static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
						 struct iommu_group *iommu_group)
{
	struct vfio_iommu_group *g;

	list_for_each_entry(g, &domain->group_list, next) {
		if (g->iommu_group == iommu_group)
			return g;
	}

	return NULL;
}

static struct vfio_iommu_group*
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
			    struct iommu_group *iommu_group)
{
	struct vfio_iommu_group *group;
	struct vfio_domain *domain;

	list_for_each_entry(domain, &iommu->domain_list, next) {
		group = find_iommu_group(domain, iommu_group);
		if (group)
			return group;
	}

	list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
		if (group->iommu_group == iommu_group)
			return group;
	return NULL;
}

static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
				  phys_addr_t *base)
{
	struct iommu_resv_region *region;
	bool ret = false;

	list_for_each_entry(region, group_resv_regions, list) {
		/*
		 * The presence of any 'real' MSI regions should take
		 * precedence over the software-managed one if the
		 * IOMMU driver happens to advertise both types.
		 */
		if (region->type == IOMMU_RESV_MSI) {
			ret = false;
			break;
		}

		if (region->type == IOMMU_RESV_SW_MSI) {
			*base = region->start;
			ret = true;
		}
	}

	return ret;
}

/*
 * This is a helper function to insert an address range to iova list.
 * The list is initially created with a single entry corresponding to
 * the IOMMU domain geometry to which the device group is attached.
 * The list aperture gets modified when a new domain is added to the
 * container if the new aperture doesn't conflict with the current one
 * or with any existing dma mappings.  The list is also modified to
 * exclude any reserved regions associated with the device group.
 */
static int vfio_iommu_iova_insert(struct list_head *head,
				  dma_addr_t start, dma_addr_t end)
{
	struct vfio_iova *region;

	region = kmalloc(sizeof(*region), GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	INIT_LIST_HEAD(&region->list);
	region->start = start;
	region->end = end;

	list_add_tail(&region->list, head);
	return 0;
}
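
/*
 * Worked example (illustrative only): a domain with geometry
 * [0x0 - 0xffffffff] initially yields a single-entry list
 * { 0x0 .. 0xffffffff }.  Excluding a reserved MSI range such as
 * 0xfee00000 - 0xfeefffff (via vfio_iommu_resv_exclude() below) splits it
 * into { 0x0 .. 0xfedfffff } and { 0xfef00000 .. 0xffffffff }, and
 * VFIO_IOMMU_MAP_DMA requests must then fall entirely within one of the
 * remaining entries.
 */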

/*
 * Check the new iommu aperture conflicts with existing aper or with any
 * existing dma mappings.
 */
static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
				     dma_addr_t start, dma_addr_t end)
{
	struct vfio_iova *first, *last;
	struct list_head *iova = &iommu->iova_list;

	if (list_empty(iova))
		return false;

	/* Disjoint sets, return conflict */
	first = list_first_entry(iova, struct vfio_iova, list);
	last = list_last_entry(iova, struct vfio_iova, list);
	if (start > last->end || end < first->start)
		return true;

	/* Check for any existing dma mappings below the new start */
	if (start > first->start) {
		if (vfio_find_dma(iommu, first->start, start - first->start))
			return true;
	}

	/* Check for any existing dma mappings beyond the new end */
	if (end < last->end) {
		if (vfio_find_dma(iommu, end + 1, last->end - end))
			return true;
	}

	return false;
}

/*
 * Resize iommu iova aperture window. This is called only if the new
 * aperture has no conflict with existing aperture and dma mappings.
 */
static int vfio_iommu_aper_resize(struct list_head *iova,
				  dma_addr_t start, dma_addr_t end)
{
	struct vfio_iova *node, *next;

	if (list_empty(iova))
		return vfio_iommu_iova_insert(iova, start, end);

	/* Adjust iova list start */
	list_for_each_entry_safe(node, next, iova, list) {
		if (start < node->start)
			break;
		if (start >= node->start && start < node->end) {
			node->start = start;
			break;
		}
		/* Delete nodes before new start */
		list_del(&node->list);
		kfree(node);
	}

	/* Adjust iova list end */
	list_for_each_entry_safe(node, next, iova, list) {
		if (end > node->end)
			continue;
		if (end > node->start && end <= node->end) {
			node->end = end;
			continue;
		}
		/* Delete nodes after new end */
		list_del(&node->list);
		kfree(node);
	}

	return 0;
}

/*
 * Check reserved region conflicts with existing dma mappings
 */
static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
				     struct list_head *resv_regions)
{
	struct iommu_resv_region *region;

	/* Check for conflict with existing dma mappings */
	list_for_each_entry(region, resv_regions, list) {
		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (vfio_find_dma(iommu, region->start, region->length))
			return true;
	}

	return false;
}

/*
 * Check iova region overlap with reserved regions and
 * exclude them from the iommu iova range
 */
static int vfio_iommu_resv_exclude(struct list_head *iova,
				   struct list_head *resv_regions)
{
	struct iommu_resv_region *resv;
	struct vfio_iova *n, *next;

	list_for_each_entry(resv, resv_regions, list) {
		phys_addr_t start, end;

		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		start = resv->start;
		end = resv->start + resv->length - 1;

		list_for_each_entry_safe(n, next, iova, list) {
			int ret = 0;

			/* No overlap */
			if (start > n->end || end < n->start)
				continue;
			/*
			 * Insert a new node if current node overlaps with the
			 * reserve region to exclude that from valid iova range.
			 * Note that, new node is inserted before the current
			 * node and finally the current node is deleted keeping
			 * the list updated and sorted.
			 */
			if (start > n->start)
				ret = vfio_iommu_iova_insert(&n->list, n->start,
							     start - 1);
			if (!ret && end < n->end)
				ret = vfio_iommu_iova_insert(&n->list, end + 1,
							     n->end);
			if (ret)
				return ret;

			list_del(&n->list);
			kfree(n);
		}
	}

	if (list_empty(iova))
		return -EINVAL;

	return 0;
}

static void vfio_iommu_resv_free(struct list_head *resv_regions)
{
	struct iommu_resv_region *n, *next;

	list_for_each_entry_safe(n, next, resv_regions, list) {
		list_del(&n->list);
		kfree(n);
	}
}

static void vfio_iommu_iova_free(struct list_head *iova)
{
	struct vfio_iova *n, *next;

	list_for_each_entry_safe(n, next, iova, list) {
		list_del(&n->list);
		kfree(n);
	}
}

static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
				    struct list_head *iova_copy)
{
	struct list_head *iova = &iommu->iova_list;
	struct vfio_iova *n;
	int ret;

	list_for_each_entry(n, iova, list) {
		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
		if (ret)
			goto out_free;
	}

	return 0;

out_free:
	vfio_iommu_iova_free(iova_copy);
	return ret;
}

static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
					struct list_head *iova_copy)
{
	struct list_head *iova = &iommu->iova_list;

	vfio_iommu_iova_free(iova);

	list_splice_tail(iova_copy, iova);
}

static int vfio_iommu_domain_alloc(struct device *dev, void *data)
{
	struct iommu_domain **domain = data;

	*domain = iommu_domain_alloc(dev->bus);
	return 1; /* Don't iterate */
}

static int vfio_iommu_type1_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_iommu_group *group;
	struct vfio_domain *domain, *d;
	bool resv_msi;
	phys_addr_t resv_msi_base = 0;
	struct iommu_domain_geometry *geo;
	LIST_HEAD(iova_copy);
	LIST_HEAD(group_resv_regions);
	int ret = -EBUSY;

	mutex_lock(&iommu->lock);

	/* Attach could require pinning, so disallow while vaddr is invalid. */
	if (iommu->vaddr_invalid_count)
		goto out_unlock;

	/* Check for duplicates */
	ret = -EINVAL;
	if (vfio_iommu_find_iommu_group(iommu, iommu_group))
		goto out_unlock;

	ret = -ENOMEM;
	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		goto out_unlock;
	group->iommu_group = iommu_group;

	if (type == VFIO_EMULATED_IOMMU) {
		list_add(&group->next, &iommu->emulated_iommu_groups);
		/*
		 * An emulated IOMMU group cannot dirty memory directly, it can
		 * only use interfaces that provide dirty tracking.
		 * The iommu scope can only be promoted with the addition of a
		 * dirty tracking group.
		 */
		group->pinned_page_dirty_scope = true;
		ret = 0;
		goto out_unlock;
	}

	ret = -ENOMEM;
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		goto out_free_group;

	/*
	 * Going via the iommu_group iterator avoids races, and trivially gives
	 * us a representative device for the IOMMU API call.  We don't actually
	 * want to iterate beyond the first device (if any).
2204 */ 2205 ret = -EIO; 2206 iommu_group_for_each_dev(iommu_group, &domain->domain, 2207 vfio_iommu_domain_alloc); 2208 if (!domain->domain) 2209 goto out_free_domain; 2210 2211 if (iommu->nesting) { 2212 ret = iommu_enable_nesting(domain->domain); 2213 if (ret) 2214 goto out_domain; 2215 } 2216 2217 ret = iommu_attach_group(domain->domain, group->iommu_group); 2218 if (ret) 2219 goto out_domain; 2220 2221 /* Get aperture info */ 2222 geo = &domain->domain->geometry; 2223 if (vfio_iommu_aper_conflict(iommu, geo->aperture_start, 2224 geo->aperture_end)) { 2225 ret = -EINVAL; 2226 goto out_detach; 2227 } 2228 2229 ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions); 2230 if (ret) 2231 goto out_detach; 2232 2233 if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) { 2234 ret = -EINVAL; 2235 goto out_detach; 2236 } 2237 2238 /* 2239 * We don't want to work on the original iova list as the list 2240 * gets modified and in case of failure we have to retain the 2241 * original list. Get a copy here. 2242 */ 2243 ret = vfio_iommu_iova_get_copy(iommu, &iova_copy); 2244 if (ret) 2245 goto out_detach; 2246 2247 ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start, 2248 geo->aperture_end); 2249 if (ret) 2250 goto out_detach; 2251 2252 ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions); 2253 if (ret) 2254 goto out_detach; 2255 2256 resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base); 2257 2258 INIT_LIST_HEAD(&domain->group_list); 2259 list_add(&group->next, &domain->group_list); 2260 2261 if (!allow_unsafe_interrupts && 2262 !iommu_group_has_isolated_msi(iommu_group)) { 2263 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", 2264 __func__); 2265 ret = -EPERM; 2266 goto out_detach; 2267 } 2268 2269 /* 2270 * If the IOMMU can block non-coherent operations (ie PCIe TLPs with 2271 * no-snoop set) then VFIO always turns this feature on because on Intel 2272 * platforms it optimizes KVM to disable wbinvd emulation. 2273 */ 2274 if (domain->domain->ops->enforce_cache_coherency) 2275 domain->enforce_cache_coherency = 2276 domain->domain->ops->enforce_cache_coherency( 2277 domain->domain); 2278 2279 /* 2280 * Try to match an existing compatible domain. We don't want to 2281 * preclude an IOMMU driver supporting multiple bus_types and being 2282 * able to include different bus_types in the same IOMMU domain, so 2283 * we test whether the domains use the same iommu_ops rather than 2284 * testing if they're on the same bus_type. 
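 * The enforce_cache_coherency setting must match as well, so a group whose
 * IOMMU cannot enforce coherency is never folded into a domain that already
 * advertises it, and vice versa.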
2285 */ 2286 list_for_each_entry(d, &iommu->domain_list, next) { 2287 if (d->domain->ops == domain->domain->ops && 2288 d->enforce_cache_coherency == 2289 domain->enforce_cache_coherency) { 2290 iommu_detach_group(domain->domain, group->iommu_group); 2291 if (!iommu_attach_group(d->domain, 2292 group->iommu_group)) { 2293 list_add(&group->next, &d->group_list); 2294 iommu_domain_free(domain->domain); 2295 kfree(domain); 2296 goto done; 2297 } 2298 2299 ret = iommu_attach_group(domain->domain, 2300 group->iommu_group); 2301 if (ret) 2302 goto out_domain; 2303 } 2304 } 2305 2306 vfio_test_domain_fgsp(domain, &iova_copy); 2307 2308 /* replay mappings on new domains */ 2309 ret = vfio_iommu_replay(iommu, domain); 2310 if (ret) 2311 goto out_detach; 2312 2313 if (resv_msi) { 2314 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); 2315 if (ret && ret != -ENODEV) 2316 goto out_detach; 2317 } 2318 2319 list_add(&domain->next, &iommu->domain_list); 2320 vfio_update_pgsize_bitmap(iommu); 2321 done: 2322 /* Delete the old one and insert new iova list */ 2323 vfio_iommu_iova_insert_copy(iommu, &iova_copy); 2324 2325 /* 2326 * An iommu backed group can dirty memory directly and therefore 2327 * demotes the iommu scope until it declares itself dirty tracking 2328 * capable via the page pinning interface. 2329 */ 2330 iommu->num_non_pinned_groups++; 2331 mutex_unlock(&iommu->lock); 2332 vfio_iommu_resv_free(&group_resv_regions); 2333 2334 return 0; 2335 2336 out_detach: 2337 iommu_detach_group(domain->domain, group->iommu_group); 2338 out_domain: 2339 iommu_domain_free(domain->domain); 2340 vfio_iommu_iova_free(&iova_copy); 2341 vfio_iommu_resv_free(&group_resv_regions); 2342 out_free_domain: 2343 kfree(domain); 2344 out_free_group: 2345 kfree(group); 2346 out_unlock: 2347 mutex_unlock(&iommu->lock); 2348 return ret; 2349 } 2350 2351 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) 2352 { 2353 struct rb_node *node; 2354 2355 while ((node = rb_first(&iommu->dma_list))) 2356 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); 2357 } 2358 2359 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu) 2360 { 2361 struct rb_node *n, *p; 2362 2363 n = rb_first(&iommu->dma_list); 2364 for (; n; n = rb_next(n)) { 2365 struct vfio_dma *dma; 2366 long locked = 0, unlocked = 0; 2367 2368 dma = rb_entry(n, struct vfio_dma, node); 2369 unlocked += vfio_unmap_unpin(iommu, dma, false); 2370 p = rb_first(&dma->pfn_list); 2371 for (; p; p = rb_next(p)) { 2372 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, 2373 node); 2374 2375 if (!is_invalid_reserved_pfn(vpfn->pfn)) 2376 locked++; 2377 } 2378 vfio_lock_acct(dma, locked - unlocked, true); 2379 } 2380 } 2381 2382 /* 2383 * Called when a domain is removed in detach. It is possible that 2384 * the removed domain decided the iova aperture window. Modify the 2385 * iova aperture with the smallest window among existing domains. 
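 * Only the first and last nodes of the copied list need adjusting; the
 * recomputed aperture can only match or widen the previous one, never
 * shrink it.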
2386 */ 2387 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu, 2388 struct list_head *iova_copy) 2389 { 2390 struct vfio_domain *domain; 2391 struct vfio_iova *node; 2392 dma_addr_t start = 0; 2393 dma_addr_t end = (dma_addr_t)~0; 2394 2395 if (list_empty(iova_copy)) 2396 return; 2397 2398 list_for_each_entry(domain, &iommu->domain_list, next) { 2399 struct iommu_domain_geometry *geo = &domain->domain->geometry; 2400 2401 if (geo->aperture_start > start) 2402 start = geo->aperture_start; 2403 if (geo->aperture_end < end) 2404 end = geo->aperture_end; 2405 } 2406 2407 /* Modify aperture limits. The new aper is either same or bigger */ 2408 node = list_first_entry(iova_copy, struct vfio_iova, list); 2409 node->start = start; 2410 node = list_last_entry(iova_copy, struct vfio_iova, list); 2411 node->end = end; 2412 } 2413 2414 /* 2415 * Called when a group is detached. The reserved regions for that 2416 * group can be part of valid iova now. But since reserved regions 2417 * may be duplicated among groups, populate the iova valid regions 2418 * list again. 2419 */ 2420 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, 2421 struct list_head *iova_copy) 2422 { 2423 struct vfio_domain *d; 2424 struct vfio_iommu_group *g; 2425 struct vfio_iova *node; 2426 dma_addr_t start, end; 2427 LIST_HEAD(resv_regions); 2428 int ret; 2429 2430 if (list_empty(iova_copy)) 2431 return -EINVAL; 2432 2433 list_for_each_entry(d, &iommu->domain_list, next) { 2434 list_for_each_entry(g, &d->group_list, next) { 2435 ret = iommu_get_group_resv_regions(g->iommu_group, 2436 &resv_regions); 2437 if (ret) 2438 goto done; 2439 } 2440 } 2441 2442 node = list_first_entry(iova_copy, struct vfio_iova, list); 2443 start = node->start; 2444 node = list_last_entry(iova_copy, struct vfio_iova, list); 2445 end = node->end; 2446 2447 /* purge the iova list and create new one */ 2448 vfio_iommu_iova_free(iova_copy); 2449 2450 ret = vfio_iommu_aper_resize(iova_copy, start, end); 2451 if (ret) 2452 goto done; 2453 2454 /* Exclude current reserved regions from iova ranges */ 2455 ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions); 2456 done: 2457 vfio_iommu_resv_free(&resv_regions); 2458 return ret; 2459 } 2460 2461 static void vfio_iommu_type1_detach_group(void *iommu_data, 2462 struct iommu_group *iommu_group) 2463 { 2464 struct vfio_iommu *iommu = iommu_data; 2465 struct vfio_domain *domain; 2466 struct vfio_iommu_group *group; 2467 bool update_dirty_scope = false; 2468 LIST_HEAD(iova_copy); 2469 2470 mutex_lock(&iommu->lock); 2471 list_for_each_entry(group, &iommu->emulated_iommu_groups, next) { 2472 if (group->iommu_group != iommu_group) 2473 continue; 2474 update_dirty_scope = !group->pinned_page_dirty_scope; 2475 list_del(&group->next); 2476 kfree(group); 2477 2478 if (list_empty(&iommu->emulated_iommu_groups) && 2479 list_empty(&iommu->domain_list)) { 2480 WARN_ON(!list_empty(&iommu->device_list)); 2481 vfio_iommu_unmap_unpin_all(iommu); 2482 } 2483 goto detach_group_done; 2484 } 2485 2486 /* 2487 * Get a copy of iova list. This will be used to update 2488 * and to replace the current one later. Please note that 2489 * we will leave the original list as it is if update fails. 
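 * The copy is widened to the remaining domains' aperture and re-punched
 * against the remaining groups' reserved regions before being swapped in.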
2490 */ 2491 vfio_iommu_iova_get_copy(iommu, &iova_copy); 2492 2493 list_for_each_entry(domain, &iommu->domain_list, next) { 2494 group = find_iommu_group(domain, iommu_group); 2495 if (!group) 2496 continue; 2497 2498 iommu_detach_group(domain->domain, group->iommu_group); 2499 update_dirty_scope = !group->pinned_page_dirty_scope; 2500 list_del(&group->next); 2501 kfree(group); 2502 /* 2503 * Group ownership provides privilege, if the group list is 2504 * empty, the domain goes away. If it's the last domain with 2505 * iommu and external domain doesn't exist, then all the 2506 * mappings go away too. If it's the last domain with iommu and 2507 * external domain exist, update accounting 2508 */ 2509 if (list_empty(&domain->group_list)) { 2510 if (list_is_singular(&iommu->domain_list)) { 2511 if (list_empty(&iommu->emulated_iommu_groups)) { 2512 WARN_ON(!list_empty( 2513 &iommu->device_list)); 2514 vfio_iommu_unmap_unpin_all(iommu); 2515 } else { 2516 vfio_iommu_unmap_unpin_reaccount(iommu); 2517 } 2518 } 2519 iommu_domain_free(domain->domain); 2520 list_del(&domain->next); 2521 kfree(domain); 2522 vfio_iommu_aper_expand(iommu, &iova_copy); 2523 vfio_update_pgsize_bitmap(iommu); 2524 } 2525 break; 2526 } 2527 2528 if (!vfio_iommu_resv_refresh(iommu, &iova_copy)) 2529 vfio_iommu_iova_insert_copy(iommu, &iova_copy); 2530 else 2531 vfio_iommu_iova_free(&iova_copy); 2532 2533 detach_group_done: 2534 /* 2535 * Removal of a group without dirty tracking may allow the iommu scope 2536 * to be promoted. 2537 */ 2538 if (update_dirty_scope) { 2539 iommu->num_non_pinned_groups--; 2540 if (iommu->dirty_page_tracking) 2541 vfio_iommu_populate_bitmap_full(iommu); 2542 } 2543 mutex_unlock(&iommu->lock); 2544 } 2545 2546 static void *vfio_iommu_type1_open(unsigned long arg) 2547 { 2548 struct vfio_iommu *iommu; 2549 2550 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); 2551 if (!iommu) 2552 return ERR_PTR(-ENOMEM); 2553 2554 switch (arg) { 2555 case VFIO_TYPE1_IOMMU: 2556 break; 2557 case VFIO_TYPE1_NESTING_IOMMU: 2558 iommu->nesting = true; 2559 fallthrough; 2560 case VFIO_TYPE1v2_IOMMU: 2561 iommu->v2 = true; 2562 break; 2563 default: 2564 kfree(iommu); 2565 return ERR_PTR(-EINVAL); 2566 } 2567 2568 INIT_LIST_HEAD(&iommu->domain_list); 2569 INIT_LIST_HEAD(&iommu->iova_list); 2570 iommu->dma_list = RB_ROOT; 2571 iommu->dma_avail = dma_entry_limit; 2572 mutex_init(&iommu->lock); 2573 mutex_init(&iommu->device_list_lock); 2574 INIT_LIST_HEAD(&iommu->device_list); 2575 iommu->pgsize_bitmap = PAGE_MASK; 2576 INIT_LIST_HEAD(&iommu->emulated_iommu_groups); 2577 2578 return iommu; 2579 } 2580 2581 static void vfio_release_domain(struct vfio_domain *domain) 2582 { 2583 struct vfio_iommu_group *group, *group_tmp; 2584 2585 list_for_each_entry_safe(group, group_tmp, 2586 &domain->group_list, next) { 2587 iommu_detach_group(domain->domain, group->iommu_group); 2588 list_del(&group->next); 2589 kfree(group); 2590 } 2591 2592 iommu_domain_free(domain->domain); 2593 } 2594 2595 static void vfio_iommu_type1_release(void *iommu_data) 2596 { 2597 struct vfio_iommu *iommu = iommu_data; 2598 struct vfio_domain *domain, *domain_tmp; 2599 struct vfio_iommu_group *group, *next_group; 2600 2601 list_for_each_entry_safe(group, next_group, 2602 &iommu->emulated_iommu_groups, next) { 2603 list_del(&group->next); 2604 kfree(group); 2605 } 2606 2607 vfio_iommu_unmap_unpin_all(iommu); 2608 2609 list_for_each_entry_safe(domain, domain_tmp, 2610 &iommu->domain_list, next) { 2611 vfio_release_domain(domain); 2612 list_del(&domain->next); 
2613 kfree(domain); 2614 } 2615 2616 vfio_iommu_iova_free(&iommu->iova_list); 2617 2618 kfree(iommu); 2619 } 2620 2621 static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu) 2622 { 2623 struct vfio_domain *domain; 2624 int ret = 1; 2625 2626 mutex_lock(&iommu->lock); 2627 list_for_each_entry(domain, &iommu->domain_list, next) { 2628 if (!(domain->enforce_cache_coherency)) { 2629 ret = 0; 2630 break; 2631 } 2632 } 2633 mutex_unlock(&iommu->lock); 2634 2635 return ret; 2636 } 2637 2638 static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu) 2639 { 2640 bool ret; 2641 2642 mutex_lock(&iommu->lock); 2643 ret = !list_empty(&iommu->emulated_iommu_groups); 2644 mutex_unlock(&iommu->lock); 2645 return ret; 2646 } 2647 2648 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, 2649 unsigned long arg) 2650 { 2651 switch (arg) { 2652 case VFIO_TYPE1_IOMMU: 2653 case VFIO_TYPE1v2_IOMMU: 2654 case VFIO_TYPE1_NESTING_IOMMU: 2655 case VFIO_UNMAP_ALL: 2656 return 1; 2657 case VFIO_UPDATE_VADDR: 2658 /* 2659 * Disable this feature if mdevs are present. They cannot 2660 * safely pin/unpin/rw while vaddrs are being updated. 2661 */ 2662 return iommu && !vfio_iommu_has_emulated(iommu); 2663 case VFIO_DMA_CC_IOMMU: 2664 if (!iommu) 2665 return 0; 2666 return vfio_domains_have_enforce_cache_coherency(iommu); 2667 default: 2668 return 0; 2669 } 2670 } 2671 2672 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps, 2673 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas, 2674 size_t size) 2675 { 2676 struct vfio_info_cap_header *header; 2677 struct vfio_iommu_type1_info_cap_iova_range *iova_cap; 2678 2679 header = vfio_info_cap_add(caps, size, 2680 VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1); 2681 if (IS_ERR(header)) 2682 return PTR_ERR(header); 2683 2684 iova_cap = container_of(header, 2685 struct vfio_iommu_type1_info_cap_iova_range, 2686 header); 2687 iova_cap->nr_iovas = cap_iovas->nr_iovas; 2688 memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges, 2689 cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges)); 2690 return 0; 2691 } 2692 2693 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, 2694 struct vfio_info_cap *caps) 2695 { 2696 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas; 2697 struct vfio_iova *iova; 2698 size_t size; 2699 int iovas = 0, i = 0, ret; 2700 2701 list_for_each_entry(iova, &iommu->iova_list, list) 2702 iovas++; 2703 2704 if (!iovas) { 2705 /* 2706 * Return 0 as a container with a single mdev device 2707 * will have an empty list 2708 */ 2709 return 0; 2710 } 2711 2712 size = struct_size(cap_iovas, iova_ranges, iovas); 2713 2714 cap_iovas = kzalloc(size, GFP_KERNEL); 2715 if (!cap_iovas) 2716 return -ENOMEM; 2717 2718 cap_iovas->nr_iovas = iovas; 2719 2720 list_for_each_entry(iova, &iommu->iova_list, list) { 2721 cap_iovas->iova_ranges[i].start = iova->start; 2722 cap_iovas->iova_ranges[i].end = iova->end; 2723 i++; 2724 } 2725 2726 ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size); 2727 2728 kfree(cap_iovas); 2729 return ret; 2730 } 2731 2732 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, 2733 struct vfio_info_cap *caps) 2734 { 2735 struct vfio_iommu_type1_info_cap_migration cap_mig = {}; 2736 2737 cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; 2738 cap_mig.header.version = 1; 2739 2740 cap_mig.flags = 0; 2741 /* support minimum pgsize */ 2742 cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap); 2743 cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX; 2744 2745 
return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); 2746 } 2747 2748 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, 2749 struct vfio_info_cap *caps) 2750 { 2751 struct vfio_iommu_type1_info_dma_avail cap_dma_avail; 2752 2753 cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL; 2754 cap_dma_avail.header.version = 1; 2755 2756 cap_dma_avail.avail = iommu->dma_avail; 2757 2758 return vfio_info_add_capability(caps, &cap_dma_avail.header, 2759 sizeof(cap_dma_avail)); 2760 } 2761 2762 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, 2763 unsigned long arg) 2764 { 2765 struct vfio_iommu_type1_info info = {}; 2766 unsigned long minsz; 2767 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 2768 int ret; 2769 2770 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); 2771 2772 if (copy_from_user(&info, (void __user *)arg, minsz)) 2773 return -EFAULT; 2774 2775 if (info.argsz < minsz) 2776 return -EINVAL; 2777 2778 minsz = min_t(size_t, info.argsz, sizeof(info)); 2779 2780 mutex_lock(&iommu->lock); 2781 info.flags = VFIO_IOMMU_INFO_PGSIZES; 2782 2783 info.iova_pgsizes = iommu->pgsize_bitmap; 2784 2785 ret = vfio_iommu_migration_build_caps(iommu, &caps); 2786 2787 if (!ret) 2788 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); 2789 2790 if (!ret) 2791 ret = vfio_iommu_iova_build_caps(iommu, &caps); 2792 2793 mutex_unlock(&iommu->lock); 2794 2795 if (ret) 2796 return ret; 2797 2798 if (caps.size) { 2799 info.flags |= VFIO_IOMMU_INFO_CAPS; 2800 2801 if (info.argsz < sizeof(info) + caps.size) { 2802 info.argsz = sizeof(info) + caps.size; 2803 } else { 2804 vfio_info_cap_shift(&caps, sizeof(info)); 2805 if (copy_to_user((void __user *)arg + 2806 sizeof(info), caps.buf, 2807 caps.size)) { 2808 kfree(caps.buf); 2809 return -EFAULT; 2810 } 2811 info.cap_offset = sizeof(info); 2812 } 2813 2814 kfree(caps.buf); 2815 } 2816 2817 return copy_to_user((void __user *)arg, &info, minsz) ? 
2818 -EFAULT : 0; 2819 } 2820 2821 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu, 2822 unsigned long arg) 2823 { 2824 struct vfio_iommu_type1_dma_map map; 2825 unsigned long minsz; 2826 uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE | 2827 VFIO_DMA_MAP_FLAG_VADDR; 2828 2829 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); 2830 2831 if (copy_from_user(&map, (void __user *)arg, minsz)) 2832 return -EFAULT; 2833 2834 if (map.argsz < minsz || map.flags & ~mask) 2835 return -EINVAL; 2836 2837 return vfio_dma_do_map(iommu, &map); 2838 } 2839 2840 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, 2841 unsigned long arg) 2842 { 2843 struct vfio_iommu_type1_dma_unmap unmap; 2844 struct vfio_bitmap bitmap = { 0 }; 2845 uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP | 2846 VFIO_DMA_UNMAP_FLAG_VADDR | 2847 VFIO_DMA_UNMAP_FLAG_ALL; 2848 unsigned long minsz; 2849 int ret; 2850 2851 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); 2852 2853 if (copy_from_user(&unmap, (void __user *)arg, minsz)) 2854 return -EFAULT; 2855 2856 if (unmap.argsz < minsz || unmap.flags & ~mask) 2857 return -EINVAL; 2858 2859 if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && 2860 (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL | 2861 VFIO_DMA_UNMAP_FLAG_VADDR))) 2862 return -EINVAL; 2863 2864 if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { 2865 unsigned long pgshift; 2866 2867 if (unmap.argsz < (minsz + sizeof(bitmap))) 2868 return -EINVAL; 2869 2870 if (copy_from_user(&bitmap, 2871 (void __user *)(arg + minsz), 2872 sizeof(bitmap))) 2873 return -EFAULT; 2874 2875 if (!access_ok((void __user *)bitmap.data, bitmap.size)) 2876 return -EINVAL; 2877 2878 pgshift = __ffs(bitmap.pgsize); 2879 ret = verify_bitmap_size(unmap.size >> pgshift, 2880 bitmap.size); 2881 if (ret) 2882 return ret; 2883 } 2884 2885 ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); 2886 if (ret) 2887 return ret; 2888 2889 return copy_to_user((void __user *)arg, &unmap, minsz) ? 
2890 -EFAULT : 0; 2891 } 2892 2893 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, 2894 unsigned long arg) 2895 { 2896 struct vfio_iommu_type1_dirty_bitmap dirty; 2897 uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | 2898 VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | 2899 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; 2900 unsigned long minsz; 2901 int ret = 0; 2902 2903 if (!iommu->v2) 2904 return -EACCES; 2905 2906 minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags); 2907 2908 if (copy_from_user(&dirty, (void __user *)arg, minsz)) 2909 return -EFAULT; 2910 2911 if (dirty.argsz < minsz || dirty.flags & ~mask) 2912 return -EINVAL; 2913 2914 /* only one flag should be set at a time */ 2915 if (__ffs(dirty.flags) != __fls(dirty.flags)) 2916 return -EINVAL; 2917 2918 if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { 2919 size_t pgsize; 2920 2921 mutex_lock(&iommu->lock); 2922 pgsize = 1 << __ffs(iommu->pgsize_bitmap); 2923 if (!iommu->dirty_page_tracking) { 2924 ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); 2925 if (!ret) 2926 iommu->dirty_page_tracking = true; 2927 } 2928 mutex_unlock(&iommu->lock); 2929 return ret; 2930 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { 2931 mutex_lock(&iommu->lock); 2932 if (iommu->dirty_page_tracking) { 2933 iommu->dirty_page_tracking = false; 2934 vfio_dma_bitmap_free_all(iommu); 2935 } 2936 mutex_unlock(&iommu->lock); 2937 return 0; 2938 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { 2939 struct vfio_iommu_type1_dirty_bitmap_get range; 2940 unsigned long pgshift; 2941 size_t data_size = dirty.argsz - minsz; 2942 size_t iommu_pgsize; 2943 2944 if (!data_size || data_size < sizeof(range)) 2945 return -EINVAL; 2946 2947 if (copy_from_user(&range, (void __user *)(arg + minsz), 2948 sizeof(range))) 2949 return -EFAULT; 2950 2951 if (range.iova + range.size < range.iova) 2952 return -EINVAL; 2953 if (!access_ok((void __user *)range.bitmap.data, 2954 range.bitmap.size)) 2955 return -EINVAL; 2956 2957 pgshift = __ffs(range.bitmap.pgsize); 2958 ret = verify_bitmap_size(range.size >> pgshift, 2959 range.bitmap.size); 2960 if (ret) 2961 return ret; 2962 2963 mutex_lock(&iommu->lock); 2964 2965 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); 2966 2967 /* allow only smallest supported pgsize */ 2968 if (range.bitmap.pgsize != iommu_pgsize) { 2969 ret = -EINVAL; 2970 goto out_unlock; 2971 } 2972 if (range.iova & (iommu_pgsize - 1)) { 2973 ret = -EINVAL; 2974 goto out_unlock; 2975 } 2976 if (!range.size || range.size & (iommu_pgsize - 1)) { 2977 ret = -EINVAL; 2978 goto out_unlock; 2979 } 2980 2981 if (iommu->dirty_page_tracking) 2982 ret = vfio_iova_dirty_bitmap(range.bitmap.data, 2983 iommu, range.iova, 2984 range.size, 2985 range.bitmap.pgsize); 2986 else 2987 ret = -EINVAL; 2988 out_unlock: 2989 mutex_unlock(&iommu->lock); 2990 2991 return ret; 2992 } 2993 2994 return -EINVAL; 2995 } 2996 2997 static long vfio_iommu_type1_ioctl(void *iommu_data, 2998 unsigned int cmd, unsigned long arg) 2999 { 3000 struct vfio_iommu *iommu = iommu_data; 3001 3002 switch (cmd) { 3003 case VFIO_CHECK_EXTENSION: 3004 return vfio_iommu_type1_check_extension(iommu, arg); 3005 case VFIO_IOMMU_GET_INFO: 3006 return vfio_iommu_type1_get_info(iommu, arg); 3007 case VFIO_IOMMU_MAP_DMA: 3008 return vfio_iommu_type1_map_dma(iommu, arg); 3009 case VFIO_IOMMU_UNMAP_DMA: 3010 return vfio_iommu_type1_unmap_dma(iommu, arg); 3011 case VFIO_IOMMU_DIRTY_PAGES: 3012 return vfio_iommu_type1_dirty_pages(iommu, arg); 3013 default: 3014 return 
-ENOTTY; 3015 } 3016 } 3017 3018 static void vfio_iommu_type1_register_device(void *iommu_data, 3019 struct vfio_device *vdev) 3020 { 3021 struct vfio_iommu *iommu = iommu_data; 3022 3023 if (!vdev->ops->dma_unmap) 3024 return; 3025 3026 /* 3027 * list_empty(&iommu->device_list) is tested under the iommu->lock while 3028 * iteration for dma_unmap must be done under the device_list_lock. 3029 * Holding both locks here allows avoiding the device_list_lock in 3030 * several fast paths. See vfio_notify_dma_unmap() 3031 */ 3032 mutex_lock(&iommu->lock); 3033 mutex_lock(&iommu->device_list_lock); 3034 list_add(&vdev->iommu_entry, &iommu->device_list); 3035 mutex_unlock(&iommu->device_list_lock); 3036 mutex_unlock(&iommu->lock); 3037 } 3038 3039 static void vfio_iommu_type1_unregister_device(void *iommu_data, 3040 struct vfio_device *vdev) 3041 { 3042 struct vfio_iommu *iommu = iommu_data; 3043 3044 if (!vdev->ops->dma_unmap) 3045 return; 3046 3047 mutex_lock(&iommu->lock); 3048 mutex_lock(&iommu->device_list_lock); 3049 list_del(&vdev->iommu_entry); 3050 mutex_unlock(&iommu->device_list_lock); 3051 mutex_unlock(&iommu->lock); 3052 } 3053 3054 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, 3055 dma_addr_t user_iova, void *data, 3056 size_t count, bool write, 3057 size_t *copied) 3058 { 3059 struct mm_struct *mm; 3060 unsigned long vaddr; 3061 struct vfio_dma *dma; 3062 bool kthread = current->mm == NULL; 3063 size_t offset; 3064 3065 *copied = 0; 3066 3067 dma = vfio_find_dma(iommu, user_iova, 1); 3068 if (!dma) 3069 return -EINVAL; 3070 3071 if ((write && !(dma->prot & IOMMU_WRITE)) || 3072 !(dma->prot & IOMMU_READ)) 3073 return -EPERM; 3074 3075 mm = dma->mm; 3076 if (!mmget_not_zero(mm)) 3077 return -EPERM; 3078 3079 if (kthread) 3080 kthread_use_mm(mm); 3081 else if (current->mm != mm) 3082 goto out; 3083 3084 offset = user_iova - dma->iova; 3085 3086 if (count > dma->size - offset) 3087 count = dma->size - offset; 3088 3089 vaddr = dma->vaddr + offset; 3090 3091 if (write) { 3092 *copied = copy_to_user((void __user *)vaddr, data, 3093 count) ? 0 : count; 3094 if (*copied && iommu->dirty_page_tracking) { 3095 unsigned long pgshift = __ffs(iommu->pgsize_bitmap); 3096 /* 3097 * Bitmap populated with the smallest supported page 3098 * size 3099 */ 3100 bitmap_set(dma->bitmap, offset >> pgshift, 3101 ((offset + *copied - 1) >> pgshift) - 3102 (offset >> pgshift) + 1); 3103 } 3104 } else 3105 *copied = copy_from_user(data, (void __user *)vaddr, 3106 count) ? 0 : count; 3107 if (kthread) 3108 kthread_unuse_mm(mm); 3109 out: 3110 mmput(mm); 3111 return *copied ? 
0 : -EFAULT; 3112 } 3113 3114 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, 3115 void *data, size_t count, bool write) 3116 { 3117 struct vfio_iommu *iommu = iommu_data; 3118 int ret = 0; 3119 size_t done; 3120 3121 mutex_lock(&iommu->lock); 3122 3123 if (WARN_ONCE(iommu->vaddr_invalid_count, 3124 "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) { 3125 ret = -EBUSY; 3126 goto out; 3127 } 3128 3129 while (count > 0) { 3130 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data, 3131 count, write, &done); 3132 if (ret) 3133 break; 3134 3135 count -= done; 3136 data += done; 3137 user_iova += done; 3138 } 3139 3140 out: 3141 mutex_unlock(&iommu->lock); 3142 return ret; 3143 } 3144 3145 static struct iommu_domain * 3146 vfio_iommu_type1_group_iommu_domain(void *iommu_data, 3147 struct iommu_group *iommu_group) 3148 { 3149 struct iommu_domain *domain = ERR_PTR(-ENODEV); 3150 struct vfio_iommu *iommu = iommu_data; 3151 struct vfio_domain *d; 3152 3153 if (!iommu || !iommu_group) 3154 return ERR_PTR(-EINVAL); 3155 3156 mutex_lock(&iommu->lock); 3157 list_for_each_entry(d, &iommu->domain_list, next) { 3158 if (find_iommu_group(d, iommu_group)) { 3159 domain = d->domain; 3160 break; 3161 } 3162 } 3163 mutex_unlock(&iommu->lock); 3164 3165 return domain; 3166 } 3167 3168 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { 3169 .name = "vfio-iommu-type1", 3170 .owner = THIS_MODULE, 3171 .open = vfio_iommu_type1_open, 3172 .release = vfio_iommu_type1_release, 3173 .ioctl = vfio_iommu_type1_ioctl, 3174 .attach_group = vfio_iommu_type1_attach_group, 3175 .detach_group = vfio_iommu_type1_detach_group, 3176 .pin_pages = vfio_iommu_type1_pin_pages, 3177 .unpin_pages = vfio_iommu_type1_unpin_pages, 3178 .register_device = vfio_iommu_type1_register_device, 3179 .unregister_device = vfio_iommu_type1_unregister_device, 3180 .dma_rw = vfio_iommu_type1_dma_rw, 3181 .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, 3182 }; 3183 3184 static int __init vfio_iommu_type1_init(void) 3185 { 3186 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); 3187 } 3188 3189 static void __exit vfio_iommu_type1_cleanup(void) 3190 { 3191 vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); 3192 } 3193 3194 module_init(vfio_iommu_type1_init); 3195 module_exit(vfio_iommu_type1_cleanup); 3196 3197 MODULE_VERSION(DRIVER_VERSION); 3198 MODULE_LICENSE("GPL v2"); 3199 MODULE_AUTHOR(DRIVER_AUTHOR); 3200 MODULE_DESCRIPTION(DRIVER_DESC); 3201
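/*
 * Editorial note: a minimal, illustrative userspace sketch of exercising
 * this Type1 backend through the container ioctls declared in
 * <linux/vfio.h>, loosely following Documentation/driver-api/vfio.rst.
 * The group number, IOVA and buffer size are placeholder assumptions and
 * all error handling is omitted.
 *
 *	int container, group;
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
 *	void *buf;
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	group = open("/dev/vfio/26", O_RDWR);   (hypothetical group number)
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	(status.flags should report VFIO_GROUP_FLAGS_VIABLE)
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 *	(query supported page sizes and, via the capability chain, the
 *	 valid iova ranges and the remaining dma_avail entries)
 *	ioctl(container, VFIO_IOMMU_GET_INFO, &info);
 *
 *	(map 1 MB of anonymous memory at IOVA 0 for device DMA)
 *	buf = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 *	map.vaddr = (__u64)(uintptr_t)buf;
 *	map.iova = 0;
 *	map.size = 1024 * 1024;
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */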