// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU.
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered regions it has
 * referenced so it can do proper cleanup at userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;
	long ret;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		ret = -ENOENT;
	else
		ret = tce_iommu_prereg_free(container, tcemem);

	mm_iommu_put(container->mm, mem);

	return ret;
}

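/*
 * Memory preregistration (v2 only): userspace registers its DMA memory once
 * so the pages are pinned and locked-memory accounted up front rather than on
 * every map/unmap. A minimal usage sketch from userspace (illustrative only;
 * "container_fd", "buf" and "size" are placeholders and error handling is
 * omitted):
 *
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.flags = 0,
 *		.vaddr = (__u64)(unsigned long)buf,	// page aligned
 *		.size = size,				// page aligned
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *	...
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
 */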
static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem) {
				ret = -EBUSY;
				goto put_exit;
			}
		}
	} else {
		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
		if (ret)
			return ret;
	}

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		ret = -ENOMEM;
		goto put_exit;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;

put_exit:
	mm_iommu_put(container->mm, mem);
	return ret;
}

static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int it_page_shift)
{
	struct page *page;
	unsigned long size = 0;

	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
		return size == (1UL << it_page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return page_shift(compound_head(page)) >= it_page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult, if not
	 * impossible, to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. For example, for a 4GB guest and 4 IOMMU
	 * groups, each with a 2GB DMA window, 8GB will be counted here. The
	 * reason for this is that we cannot tell here the amount of RAM used
	 * by the guest as this information is only available from KVM and
	 * VFIO is KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = account_locked_vm(container->mm, locked, true);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	account_locked_vm(container->mm, container->locked_pages, false);
}

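/*
 * Container lifecycle: userspace opens a VFIO container, attaches an IOMMU
 * group to it and selects one of the two sPAPR IOMMU types. A rough sketch of
 * the setup sequence (illustrative only; "container_fd" and "group_fd" are
 * placeholders and error handling is omitted):
 *
 *	container_fd = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU);
 *	ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd);
 *	ioctl(container_fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 */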
static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	struct tce_iommu_prereg *tcemem, *tmtmp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed of
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
		WARN_ON(tce_iommu_prereg_free(container, tcemem));

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long shift,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages, firstentry = entry;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * addresses cache is a mirror of the real TCE table
			 * and if it is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size which is power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
				&direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	iommu_tce_kill(tbl, firstentry, pages);

	return 0;
}

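/*
 * Pinning helpers: in the v1 path below, every mapped page is pinned with
 * get_user_pages_fast() at map time and released via tce_iommu_unuse_page().
 * In the v2 path the pages are expected to have been preregistered (and
 * pinned) already, so tce_iommu_build_v2() only looks the address up in the
 * preregistered region and bumps its mapped counter.
 */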
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
			&page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}

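/*
 * Dynamic DMA windows (v2 only): besides the default 32-bit window, userspace
 * may ask the platform to create additional TCE tables via
 * VFIO_IOMMU_SPAPR_TCE_CREATE and remove them via VFIO_IOMMU_SPAPR_TCE_REMOVE.
 * A minimal sketch of the create call (illustrative only; the values are
 * placeholders and must match the pgsizes/levels advertised by GET_INFO;
 * error handling omitted):
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,		// 64K IOMMU pages
 *		.window_size = 1ULL << 32,	// 4GB window
 *		.levels = 1,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *	// create.start_addr now holds the bus address of the new window
 */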
static long tce_iommu_create_table(struct tce_container *container,
		struct iommu_table_group *table_group,
		int num,
		__u32 page_shift,
		__u64 window_size,
		__u32 levels,
		struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	iommu_tce_table_put(tbl);
	account_locked_vm(container->mm, pages, false);
}

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}

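/*
 * The main ioctl handler. For a v2 container, a typical mapping sequence from
 * userspace is GET_INFO, REGISTER_MEMORY, then MAP_DMA (a v1 container uses
 * VFIO_IOMMU_ENABLE instead of memory preregistration). For example
 * (illustrative only; placeholders used and error handling omitted):
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(unsigned long)buf,	// preregistered for v2
 *		.iova = dma32_window_start,		// from GET_INFO
 *		.size = size,				// IOMMU page aligned
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */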
static long tce_iommu_ioctl(void *iommu_data,
		unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace process from manipulating
	 * another userspace process's mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

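/*
 * Ownership handling: for groups without dynamic DMA window support (v1),
 * VFIO takes over the platform-provided tables directly via
 * iommu_take_ownership(). For DDW-capable groups (v2), ownership is taken
 * through the iommu_table_group callbacks and the container's windows are
 * programmed into every attached group.
 */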
static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		if (container->tables[i])
			table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

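/*
 * Group attach/detach: all groups attached to a container must share the same
 * iommu_table_group ops (checked below via create_table). A group without
 * take/release_ownership callbacks can only be attached to a v1 container,
 * and a DDW-capable group only to a v2 container.
 */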
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret = 0;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

free_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

unlock_exit:
	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);