// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>
#include "vfio.h"

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION "0.1"
#define DRIVER_AUTHOR "aik@ozlabs.ru"
#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

/* Bind the container to the caller's mm on first use; other mms get -EPERM. */
static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	mmgrab(container->mm);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;
	long ret;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		ret = -ENOENT;
	else
		ret = tce_iommu_prereg_free(container, tcemem);

	mm_iommu_put(container->mm, mem);

	return ret;
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem) {
				ret = -EBUSY;
				goto put_exit;
			}
		}
	} else {
		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
		if (ret)
			return ret;
	}

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		ret = -ENOMEM;
		goto put_exit;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;

put_exit:
	mm_iommu_put(container->mm, mem);
	return ret;
}

static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int it_page_shift)
{
	struct page *page;
	unsigned long size = 0;

	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
		return size == (1UL << it_page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return page_shift(compound_head(page)) >= it_page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

/* Find the table whose DMA window covers @ioba; returns its index or -1. */
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult or impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the max that the guest
	 * can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = account_locked_vm(container->mm, locked, true);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	account_locked_vm(container->mm, container->locked_pages, false);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	struct tce_iommu_prereg *tcemem, *tmtmp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed of
	 * by tce_iommu_detach_group(), so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
		WARN_ON(tce_iommu_prereg_free(container, tcemem));

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	unpin_user_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long shift,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages, firstentry = entry;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * address cache is a mirror of the real TCE table
			 * and if it is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size which is power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
				&direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	iommu_tce_kill(tbl, firstentry, pages);

	return 0;
}

/* Pin the userspace page backing @tce and return its host physical address. */
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
			&page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
				&hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);
	else
		iommu_tce_kill(tbl, entry, pages);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
		struct iommu_table_group *table_group,
		int num,
		__u32 page_shift,
		__u64 window_size,
		__u32 levels,
		struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	iommu_tce_table_put(tbl);
	account_locked_vm(container->mm, pages, false);
}

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}

static long tce_iommu_ioctl(void *iommu_data,
		unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;


	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		if (container->tables[i])
			table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	int ret = 0;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	if (type == VFIO_EMULATED_IOMMU)
		return -EINVAL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto free_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

free_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

unlock_exit:
	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
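
/*
 * Illustrative userspace flow for a v2 SPAPR TCE container (a sketch only,
 * not part of the driver): it shows the ioctl sequence this file serves.
 * Error handling and device fd handling are omitted; the group path, buf
 * and size are placeholders and must be page-aligned.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/<group>", O_RDWR);
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *
 *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *
 *	// Preregister the memory to be mapped; tce_iommu_register_pages()
 *	// pins it and enables the container (v1 uses VFIO_IOMMU_ENABLE).
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.vaddr = (__u64)(unsigned long)buf,
 *		.size = size,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *
 *	// Map the preregistered buffer into the default 32-bit DMA window.
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(unsigned long)buf,
 *		.iova = info.dma32_window_start,
 *		.size = size,
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */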