/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

struct vfio_iommu {
	struct list_head	domain_list;
	struct mutex		lock;
	struct rb_root		dma_list;
	bool			v2;
	bool			nesting;
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	int			prot;		/* IOMMU_CACHE */
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
};

struct vfio_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
};

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

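/*
 * Insert a new vfio_dma into the rb-tree, ordered by IOVA.  Callers are
 * expected to have checked for an overlapping range with vfio_find_dma()
 * first.
 */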
static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}

struct vwork {
	struct mm_struct	*mm;
	long			npage;
	struct work_struct	work;
};

/* delayed decrement/increment for locked_vm */
static void vfio_lock_acct_bg(struct work_struct *work)
{
	struct vwork *vwork = container_of(work, struct vwork, work);
	struct mm_struct *mm;

	mm = vwork->mm;
	down_write(&mm->mmap_sem);
	mm->locked_vm += vwork->npage;
	up_write(&mm->mmap_sem);
	mmput(mm);
	kfree(vwork);
}

static void vfio_lock_acct(long npage)
{
	struct vwork *vwork;
	struct mm_struct *mm;

	if (!current->mm || !npage)
		return; /* process exited or nothing to do */

	if (down_write_trylock(&current->mm->mmap_sem)) {
		current->mm->locked_vm += npage;
		up_write(&current->mm->mmap_sem);
		return;
	}

	/*
	 * Couldn't get mmap_sem lock, so must setup to update
	 * mm->locked_vm later.  If locked_vm were atomic, we
	 * wouldn't need this silliness
	 */
	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
	if (!vwork)
		return;
	mm = get_task_mm(current);
	if (!mm) {
		kfree(vwork);
		return;
	}
	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
	vwork->mm = mm;
	vwork->npage = npage;
	schedule_work(&vwork->work);
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn)) {
		bool reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_head(tail);
		reserved = !!(PageReserved(head));
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (prot & IOMMU_WRITE)
			SetPageDirty(page);
		put_page(page);
		return 1;
	}
	return 0;
}

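/*
 * Resolve a single user virtual address to a host pfn, either through
 * get_user_pages_fast() for page-backed memory or by walking the VMA
 * for PFNMAP (e.g. MMIO) mappings.
 */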
static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
	struct page *page[1];
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	down_read(&current->mm->mmap_sem);

	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		if (is_invalid_reserved_pfn(*pfn))
			ret = 0;
	}

	up_read(&current->mm->mmap_sem);

	return ret;
}

/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages(unsigned long vaddr, long npage,
			   int prot, unsigned long *pfn_base)
{
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	bool lock_cap = capable(CAP_IPC_LOCK);
	long ret, i;

	if (!current->mm)
		return -ENODEV;

	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
	if (ret)
		return ret;

	if (is_invalid_reserved_pfn(*pfn_base))
		return 1;

	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
		put_pfn(*pfn_base, prot);
		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
			limit << PAGE_SHIFT);
		return -ENOMEM;
	}

	if (unlikely(disable_hugepages)) {
		vfio_lock_acct(1);
		return 1;
	}

	/* Lock all the consecutive pages from pfn_base */
	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
		unsigned long pfn = 0;

		ret = vaddr_get_pfn(vaddr, prot, &pfn);
		if (ret)
			break;

		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
			put_pfn(pfn, prot);
			break;
		}

		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
			put_pfn(pfn, prot);
			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
				__func__, limit << PAGE_SHIFT);
			break;
		}
	}

	vfio_lock_acct(i);

	return i;
}

static long vfio_unpin_pages(unsigned long pfn, long npage,
			     int prot, bool do_accounting)
{
	unsigned long unlocked = 0;
	long i;

	for (i = 0; i < npage; i++)
		unlocked += put_pfn(pfn++, prot);

	if (do_accounting)
		vfio_lock_acct(-unlocked);

	return unlocked;
}

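/*
 * Unmap a vfio_dma range from every domain and unpin the pages backing
 * it, crediting the unpinned pages back to the task's locked_vm.
 */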
static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	long unlocked = 0;

	if (!dma->size)
		return;
	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system.  Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin.  The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, dma->iova, dma->size);

	while (iova < end) {
		size_t unmapped;
		phys_addr_t phys;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
		if (WARN_ON(!unmapped))
			break;

		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
					     unmapped >> PAGE_SHIFT,
					     dma->prot, false);
		iova += unmapped;
	}

	vfio_lock_acct(-unlocked);
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	vfio_unmap_unpin(iommu, dma);
	vfio_unlink_dma(iommu, dma);
	kfree(dma);
}

static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	unsigned long bitmap = PAGE_MASK;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next)
		bitmap &= domain->domain->ops->pgsize_bitmap;
	mutex_unlock(&iommu->lock);

	return bitmap;
}

static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap)
{
	uint64_t mask;
	struct vfio_dma *dma;
	size_t unmapped = 0;
	int ret = 0;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	if (unmap->iova & mask)
		return -EINVAL;
	if (!unmap->size || unmap->size & mask)
		return -EINVAL;

	WARN_ON(mask & PAGE_MASK);

	mutex_lock(&iommu->lock);

	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings.  This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range.  Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked.  We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings.  This
	 * resulted in a couple unusual behaviors.  First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap.  Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap.  And an unmap request covering
	 * the first iova of mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings.  Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range.  An error
	 * will be returned if these conditions are not met.  The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
	if (iommu->v2) {
		dma = vfio_find_dma(iommu, unmap->iova, 0);
		if (dma && dma->iova != unmap->iova) {
			ret = -EINVAL;
			goto unlock;
		}
		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
			ret = -EINVAL;
			goto unlock;
		}
	}

	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
		if (!iommu->v2 && unmap->iova > dma->iova)
			break;
		unmapped += dma->size;
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}

/*
 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 * to a region that previously mapped smaller pages.  This should be fixed
 * soon, so this is just a temporary workaround to break mappings down into
 * PAGE_SIZE.  Better to map smaller pages than nothing.
 */
static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	long i;
	int ret;

	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
		ret = iommu_map(domain->domain, iova,
				(phys_addr_t)pfn << PAGE_SHIFT,
				PAGE_SIZE, prot | domain->prot);
		if (ret)
			break;
	}

	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
		iommu_unmap(domain->domain, iova, PAGE_SIZE);

	return ret;
}

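/*
 * Map a physically contiguous run of pinned pages into every domain,
 * unwinding the domains already mapped on failure.
 */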
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret) {
			if (ret != -EBUSY ||
			    map_try_harder(d, iova, pfn, npage, prot))
				goto unwind;
		}
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);

	return ret;
}

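/*
 * Handle VFIO_IOMMU_MAP_DMA: validate the request, then pin the user
 * buffer in physically contiguous chunks and map each chunk into all
 * domains, growing the vfio_dma as we go.
 */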
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	long npage;
	int ret = 0, prot = 0;
	uint64_t mask;
	struct vfio_dma *dma;
	unsigned long pfn;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	WARN_ON(mask & PAGE_MASK);

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if (!prot || !size || (size | iova | vaddr) & mask)
		return -EINVAL;

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
		return -EINVAL;

	mutex_lock(&iommu->lock);

	if (vfio_find_dma(iommu, iova, size)) {
		mutex_unlock(&iommu->lock);
		return -EEXIST;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		mutex_unlock(&iommu->lock);
		return -ENOMEM;
	}

	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/* Insert zero-sized and grow as we map chunks of it */
	vfio_link_dma(iommu, dma);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages(vaddr + dma->size,
				       size >> PAGE_SHIFT, prot, &pfn);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
		if (ret) {
			vfio_unpin_pages(pfn, npage, prot, true);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	if (ret)
		vfio_remove_dma(iommu, dma);

	mutex_unlock(&iommu->lock);
	return ret;
}

static int vfio_bus_type(struct device *dev, void *data)
{
	struct bus_type **bus = data;

	if (*bus && *bus != dev->bus)
		return -EINVAL;

	*bus = dev->bus;

	return 0;
}

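/*
 * Replay existing mappings into a newly allocated domain, using the
 * first domain in the list as the source of iova-to-phys translations.
 */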
static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_domain *d;
	struct rb_node *n;
	int ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
	n = rb_first(&iommu->dma_list);

	/* If there's not a domain, there better not be any mappings */
	if (WARN_ON(n && !d))
		return -EINVAL;

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
			size_t size;

			if (WARN_ON(!phys)) {
				iova += PAGE_SIZE;
				continue;
			}

			size = PAGE_SIZE;

			while (iova + size < dma->iova + dma->size &&
			       phys + size == iommu_iova_to_phys(d->domain,
								 iova + size))
				size += PAGE_SIZE;

			ret = iommu_map(domain->domain, iova, phys,
					size, dma->prot | domain->prot);
			if (ret)
				return ret;

			iova += size;
		}
	}

	return 0;
}

static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group, *g;
	struct vfio_domain *domain, *d;
	struct bus_type *bus = NULL;
	int ret;

	mutex_lock(&iommu->lock);

	list_for_each_entry(d, &iommu->domain_list, next) {
		list_for_each_entry(g, &d->group_list, next) {
			if (g->iommu_group != iommu_group)
				continue;

			mutex_unlock(&iommu->lock);
			return -EINVAL;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!group || !domain) {
		ret = -ENOMEM;
		goto out_free;
	}

	group->iommu_group = iommu_group;

	/* Determine bus_type in order to allocate a domain */
	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
	if (ret)
		goto out_free;

	domain->domain = iommu_domain_alloc(bus);
	if (!domain->domain) {
		ret = -EIO;
		goto out_free;
	}

	if (iommu->nesting) {
		int attr = 1;

		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
					    &attr);
		if (ret)
			goto out_domain;
	}

	ret = iommu_attach_group(domain->domain, iommu_group);
	if (ret)
		goto out_domain;

	INIT_LIST_HEAD(&domain->group_list);
	list_add(&group->next, &domain->group_list);

	if (!allow_unsafe_interrupts &&
	    !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) {
		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
			__func__);
		ret = -EPERM;
		goto out_detach;
	}

	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
		domain->prot |= IOMMU_CACHE;

	/*
	 * Try to match an existing compatible domain.  We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->domain->ops == domain->domain->ops &&
		    d->prot == domain->prot) {
			iommu_detach_group(domain->domain, iommu_group);
			if (!iommu_attach_group(d->domain, iommu_group)) {
				list_add(&group->next, &d->group_list);
				iommu_domain_free(domain->domain);
				kfree(domain);
				mutex_unlock(&iommu->lock);
				return 0;
			}

			ret = iommu_attach_group(domain->domain, iommu_group);
			if (ret)
				goto out_domain;
		}
	}

	/* replay mappings on new domains */
	ret = vfio_iommu_replay(iommu, domain);
	if (ret)
		goto out_detach;

	list_add(&domain->next, &iommu->domain_list);

	mutex_unlock(&iommu->lock);

	return 0;

out_detach:
	iommu_detach_group(domain->domain, iommu_group);
out_domain:
	iommu_domain_free(domain->domain);
out_free:
	kfree(domain);
	kfree(group);
	mutex_unlock(&iommu->lock);
	return ret;
}

static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
{
	struct rb_node *node;

	while ((node = rb_first(&iommu->dma_list)))
		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}

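/*
 * Detach a group from its domain; the domain is freed when its last
 * group is removed, and all mappings are torn down with the last domain.
 */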
static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain;
	struct vfio_group *group;

	mutex_lock(&iommu->lock);

	list_for_each_entry(domain, &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
			if (group->iommu_group != iommu_group)
				continue;

			iommu_detach_group(domain->domain, iommu_group);
			list_del(&group->next);
			kfree(group);
			/*
			 * Group ownership provides privilege, if the group
			 * list is empty, the domain goes away.  If it's the
			 * last domain, then all the mappings go away too.
			 */
			if (list_empty(&domain->group_list)) {
				if (list_is_singular(&iommu->domain_list))
					vfio_iommu_unmap_unpin_all(iommu);
				iommu_domain_free(domain->domain);
				list_del(&domain->next);
				kfree(domain);
			}
			goto done;
		}
	}

done:
	mutex_unlock(&iommu->lock);
}

static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		iommu->nesting = true;
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	iommu->dma_list = RB_ROOT;
	mutex_init(&iommu->lock);

	return iommu;
}

static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain, *domain_tmp;
	struct vfio_group *group, *group_tmp;

	vfio_iommu_unmap_unpin_all(iommu);

	list_for_each_entry_safe(domain, domain_tmp,
				 &iommu->domain_list, next) {
		list_for_each_entry_safe(group, group_tmp,
					 &domain->group_list, next) {
			iommu_detach_group(domain->domain, group->iommu_group);
			list_del(&group->next);
			kfree(group);
		}
		iommu_domain_free(domain->domain);
		list_del(&domain->next);
		kfree(domain);
	}

	kfree(iommu);
}

static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	int ret = 1;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (!(domain->prot & IOMMU_CACHE)) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return ret;
}

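/*
 * ioctl dispatch for VFIO_CHECK_EXTENSION, VFIO_IOMMU_GET_INFO,
 * VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA.  The typical userspace
 * flow (see Documentation/vfio.txt) looks roughly like:
 *
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0,
 *		.size  = len,
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */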
static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION) {
		switch (arg) {
		case VFIO_TYPE1_IOMMU:
		case VFIO_TYPE1v2_IOMMU:
		case VFIO_TYPE1_NESTING_IOMMU:
			return 1;
		case VFIO_DMA_CC_IOMMU:
			if (!iommu)
				return 0;
			return vfio_domains_have_iommu_cache(iommu);
		default:
			return 0;
		}
	} else if (cmd == VFIO_IOMMU_GET_INFO) {
		struct vfio_iommu_type1_info info;

		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = 0;

		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);

		return copy_to_user((void __user *)arg, &info, minsz);

	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);

	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
		struct vfio_iommu_type1_dma_unmap unmap;
		long ret;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

		if (copy_from_user(&unmap, (void __user *)arg, minsz))
			return -EFAULT;

		if (unmap.argsz < minsz || unmap.flags)
			return -EINVAL;

		ret = vfio_dma_do_unmap(iommu, &unmap);
		if (ret)
			return ret;

		return copy_to_user((void __user *)arg, &unmap, minsz);
	}

	return -ENOTTY;
}

static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name		= "vfio-iommu-type1",
	.owner		= THIS_MODULE,
	.open		= vfio_iommu_type1_open,
	.release	= vfio_iommu_type1_release,
	.ioctl		= vfio_iommu_type1_ioctl,
	.attach_group	= vfio_iommu_type1_attach_group,
	.detach_group	= vfio_iommu_type1_detach_group,
};

static int __init vfio_iommu_type1_init(void)
{
	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);