// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long ret = 0, locked, lock_limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm || !npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

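/*
 * Binds the container to the mm of the first process that uses it: any later
 * caller with a different mm gets -EPERM. The mm_count reference taken here
 * is dropped with mmdrop() in tce_iommu_release().
 */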
static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;
	long ret;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		ret = -ENOENT;
	else
		ret = tce_iommu_prereg_free(container, tcemem);

	mm_iommu_put(container->mm, mem);

	return ret;
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem) {
				ret = -EBUSY;
				goto put_exit;
			}
		}
	} else {
		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
		if (ret)
			return ret;
	}

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		ret = -ENOMEM;
		goto put_exit;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;

put_exit:
	mm_iommu_put(container->mm, mem);
	return ret;
}

static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int page_shift)
{
	struct page *page;
	unsigned long size = 0;

	if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
		return size == (1UL << page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

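/*
 * A worked example of the worst-case accounting done in tce_iommu_enable()
 * (assuming a 2GB default 32-bit window and 64K kernel pages): locked_pages =
 * tce32_size >> PAGE_SHIFT = 0x80000000 >> 16 = 32768 pages, charged against
 * RLIMIT_MEMLOCK whether or not the guest ever maps that much.
 */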
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult to impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the max that the guest
	 * can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
	 * each with 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(container->mm, locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	decrement_locked_vm(container->mm, container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	struct tce_iommu_prereg *tcemem, *tmtmp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
		WARN_ON(tce_iommu_prereg_free(container, tcemem));

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long shift,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * addresses cache is a mirror of the real TCE table
			 * and if it is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size which is power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
				&direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

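/*
 * Translation helpers for the two container flavours: the v1 path below pins
 * user pages with get_user_pages_fast() at map time, while the v2 path
 * (tce_iommu_build_v2()) resolves userspace addresses through memory that
 * was preregistered via VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
 */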
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
			&page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
				&dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
				&dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
		struct iommu_table_group *table_group,
		int num,
		__u32 page_shift,
		__u64 window_size,
		__u32 levels,
		struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	iommu_tce_table_put(tbl);
	decrement_locked_vm(container->mm, pages);
}

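/*
 * Dynamic DMA window (DDW) creation backing VFIO_IOMMU_SPAPR_TCE_CREATE:
 * the table is allocated through the platform ops (with its backing charged
 * against RLIMIT_MEMLOCK in tce_iommu_create_table()) and then programmed
 * into every attached group.
 */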
static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}

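/*
 * A rough sketch of the userspace sequence this ioctl handler expects for a
 * v2 container (names are the standard VFIO ioctls, shown only for
 * orientation):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *
 * A v1 (VFIO_SPAPR_TCE_IOMMU) container skips the registration/window ioctls
 * and uses VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE instead.
 */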
static long tce_iommu_ioctl(void *iommu_data,
		unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
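	/*
	 * The two ioctls below implement memory preregistration for v2
	 * containers: the registered chunks are pinned once up front (in the
	 * mm_iommu_* helpers) and are what tce_iommu_build_v2() translates
	 * against at map time.
	 */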
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

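/*
 * The helpers below implement the two ownership models used at group
 * attach/detach time: platforms without take_ownership/release_ownership
 * ops expose fixed tables which the container takes over directly, while
 * DDW-capable platforms hand over the whole table group and have their
 * windows (re)programmed through set_window()/unset_window().
 */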
static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		if (container->tables[i])
			table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);