// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 *	Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *	Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>
#include "vfio.h"

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	mmgrab(container->mm);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;
	long ret;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		ret = -ENOENT;
	else
		ret = tce_iommu_prereg_free(container, tcemem);

	mm_iommu_put(container->mm, mem);

	return ret;
}
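/*
 * Preregisters a chunk of userspace memory [vaddr, vaddr + size) for a v2
 * container: the pages are pinned once up front via mm_iommu_new() and the
 * region is remembered on container->prereg_list, so later TCE updates can
 * be resolved through the userspace-address cache instead of pinning pages
 * on every map. Registering an already referenced region returns -EBUSY.
 */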
static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem) {
				ret = -EBUSY;
				goto put_exit;
			}
		}
	} else {
		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
		if (ret)
			return ret;
	}

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		ret = -ENOMEM;
		goto put_exit;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;

put_exit:
	mm_iommu_put(container->mm, mem);
	return ret;
}

static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int it_page_shift)
{
	struct page *page;
	unsigned long size = 0;

	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
		return size == (1UL << it_page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return page_shift(compound_head(page)) >= it_page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult, if not
	 * impossible, to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = account_locked_vm(container->mm, locked, true);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	account_locked_vm(container->mm, container->locked_pages, false);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);
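/*
 * Container teardown: detach any remaining groups, dispose of tables that
 * were created via VFIO (and therefore were not freed by
 * tce_iommu_detach_group()), drop the preregistered memory regions and the
 * locked_vm accounting, then release the mm reference taken in
 * tce_iommu_mm_set().
 */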
static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	struct tce_iommu_prereg *tcemem, *tmtmp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed of
	 * by tce_iommu_detach_group(), so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
		WARN_ON(tce_iommu_prereg_free(container, tcemem));

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	unpin_user_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long shift,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages, firstentry = entry;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * addresses cache is a mirror of the real TCE table
			 * and if it is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size which is power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
				&direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(oldhpa);
	}

	iommu_tce_kill(tbl, firstentry, pages);

	return 0;
}
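/*
 * Translates a single userspace address to a host physical address for the
 * non-preregistered (v1) path: the page is pinned with
 * pin_user_pages_fast() here and released again in tce_iommu_unuse_page()
 * once the TCE pointing at it is cleared or replaced.
 */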
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
			&page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}
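/*
 * Window (TCE table) allocation helper: the table's backing memory is
 * charged against the owner's locked_vm via account_locked_vm() before the
 * platform is asked to actually create the table; tce_iommu_free_table()
 * undoes both steps.
 */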
static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	iommu_tce_table_put(tbl);
	account_locked_vm(container->mm, pages, false);
}

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}
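/*
 * For v2 containers the default 32-bit DMA window is not created at group
 * attach time; instead def_window_pending is set and the window is created
 * lazily on the first VFIO_IOMMU_MAP_DMA, VFIO_IOMMU_UNMAP_DMA or
 * VFIO_IOMMU_SPAPR_TCE_CREATE. This lets userspace drop the default window
 * first (VFIO_IOMMU_SPAPR_TCE_REMOVE with start_addr == 0) without ever
 * paying for its allocation.
 */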
static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}
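/*
 * Main ioctl dispatcher for the container fd.
 *
 * A rough sketch of the expected userspace sequence (illustrative only;
 * see Documentation/driver-api/vfio.rst for the authoritative flow):
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_ENABLE);		(v1 only)
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *	...
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *	ioctl(container, VFIO_IOMMU_DISABLE);		(v1 only)
 *
 * v2 containers use VFIO_IOMMU_SPAPR_REGISTER_MEMORY instead of
 * VFIO_IOMMU_ENABLE and may manage additional windows with
 * VFIO_IOMMU_SPAPR_TCE_CREATE/REMOVE.
 */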
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
						cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}
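/*
 * Ownership handling comes in two flavours: groups without dynamic DMA
 * window (DDW) support hand over their preexisting default tables via
 * iommu_take_ownership()/iommu_release_ownership(), while DDW-capable
 * groups (the *_ddw variants below) let the platform take ownership and
 * then have the container's windows programmed into them with
 * ops->set_window()/unset_window().
 */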
static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		if (container->tables[i])
			table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}
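/*
 * Groups attached to the same container must share the same
 * iommu_table_group_ops so that every window can be programmed into every
 * group. v1 containers only accept non-DDW groups (and take ownership of
 * their prebuilt tables); v2 containers only accept DDW-capable groups and
 * defer creation of the default 32-bit window until it is first needed.
 */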
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	int ret = 0;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	if (type == VFIO_EMULATED_IOMMU)
		return -EINVAL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/*
	 * Check if new group has the same iommu_table_group_ops
	 * (i.e. compatible)
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

free_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

unlock_exit:
	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);