1 /* 2 * VFIO: IOMMU DMA mapping support for TCE on POWER 3 * 4 * Copyright (C) 2013 IBM Corp. All rights reserved. 5 * Author: Alexey Kardashevskiy <aik@ozlabs.ru> 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License version 2 as 9 * published by the Free Software Foundation. 10 * 11 * Derived from original vfio_iommu_type1.c: 12 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 13 * Author: Alex Williamson <alex.williamson@redhat.com> 14 */ 15 16 #include <linux/module.h> 17 #include <linux/pci.h> 18 #include <linux/slab.h> 19 #include <linux/uaccess.h> 20 #include <linux/err.h> 21 #include <linux/vfio.h> 22 #include <linux/vmalloc.h> 23 #include <linux/sched/mm.h> 24 #include <linux/sched/signal.h> 25 26 #include <asm/iommu.h> 27 #include <asm/tce.h> 28 #include <asm/mmu_context.h> 29 30 #define DRIVER_VERSION "0.1" 31 #define DRIVER_AUTHOR "aik@ozlabs.ru" 32 #define DRIVER_DESC "VFIO IOMMU SPAPR TCE" 33 34 static void tce_iommu_detach_group(void *iommu_data, 35 struct iommu_group *iommu_group); 36 37 static long try_increment_locked_vm(struct mm_struct *mm, long npages) 38 { 39 long ret = 0, locked, lock_limit; 40 41 if (WARN_ON_ONCE(!mm)) 42 return -EPERM; 43 44 if (!npages) 45 return 0; 46 47 down_write(&mm->mmap_sem); 48 locked = mm->locked_vm + npages; 49 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 50 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 51 ret = -ENOMEM; 52 else 53 mm->locked_vm += npages; 54 55 pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, 56 npages << PAGE_SHIFT, 57 mm->locked_vm << PAGE_SHIFT, 58 rlimit(RLIMIT_MEMLOCK), 59 ret ? " - exceeded" : ""); 60 61 up_write(&mm->mmap_sem); 62 63 return ret; 64 } 65 66 static void decrement_locked_vm(struct mm_struct *mm, long npages) 67 { 68 if (!mm || !npages) 69 return; 70 71 down_write(&mm->mmap_sem); 72 if (WARN_ON_ONCE(npages > mm->locked_vm)) 73 npages = mm->locked_vm; 74 mm->locked_vm -= npages; 75 pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, 76 npages << PAGE_SHIFT, 77 mm->locked_vm << PAGE_SHIFT, 78 rlimit(RLIMIT_MEMLOCK)); 79 up_write(&mm->mmap_sem); 80 } 81 82 /* 83 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation 84 * 85 * This code handles mapping and unmapping of user data buffers 86 * into DMA'ble space using the IOMMU 87 */ 88 89 struct tce_iommu_group { 90 struct list_head next; 91 struct iommu_group *grp; 92 }; 93 94 /* 95 * A container needs to remember which preregistered region it has 96 * referenced to do proper cleanup at the userspace process exit. 97 */ 98 struct tce_iommu_prereg { 99 struct list_head next; 100 struct mm_iommu_table_group_mem_t *mem; 101 }; 102 103 /* 104 * The container descriptor supports only a single group per container. 105 * Required by the API as the container is not supplied with the IOMMU group 106 * at the moment of initialization. 107 */ 108 struct tce_container { 109 struct mutex lock; 110 bool enabled; 111 bool v2; 112 bool def_window_pending; 113 unsigned long locked_pages; 114 struct mm_struct *mm; 115 struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; 116 struct list_head group_list; 117 struct list_head prereg_list; 118 }; 119 120 static long tce_iommu_mm_set(struct tce_container *container) 121 { 122 if (container->mm) { 123 if (container->mm == current->mm) 124 return 0; 125 return -EPERM; 126 } 127 BUG_ON(!current->mm); 128 container->mm = current->mm; 129 atomic_inc(&container->mm->mm_count); 130 131 return 0; 132 } 133 134 static long tce_iommu_prereg_free(struct tce_container *container, 135 struct tce_iommu_prereg *tcemem) 136 { 137 long ret; 138 139 ret = mm_iommu_put(container->mm, tcemem->mem); 140 if (ret) 141 return ret; 142 143 list_del(&tcemem->next); 144 kfree(tcemem); 145 146 return 0; 147 } 148 149 static long tce_iommu_unregister_pages(struct tce_container *container, 150 __u64 vaddr, __u64 size) 151 { 152 struct mm_iommu_table_group_mem_t *mem; 153 struct tce_iommu_prereg *tcemem; 154 bool found = false; 155 long ret; 156 157 if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) 158 return -EINVAL; 159 160 mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT); 161 if (!mem) 162 return -ENOENT; 163 164 list_for_each_entry(tcemem, &container->prereg_list, next) { 165 if (tcemem->mem == mem) { 166 found = true; 167 break; 168 } 169 } 170 171 if (!found) 172 ret = -ENOENT; 173 else 174 ret = tce_iommu_prereg_free(container, tcemem); 175 176 mm_iommu_put(container->mm, mem); 177 178 return ret; 179 } 180 181 static long tce_iommu_register_pages(struct tce_container *container, 182 __u64 vaddr, __u64 size) 183 { 184 long ret = 0; 185 struct mm_iommu_table_group_mem_t *mem = NULL; 186 struct tce_iommu_prereg *tcemem; 187 unsigned long entries = size >> PAGE_SHIFT; 188 189 if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || 190 ((vaddr + size) < vaddr)) 191 return -EINVAL; 192 193 mem = mm_iommu_get(container->mm, vaddr, entries); 194 if (mem) { 195 list_for_each_entry(tcemem, &container->prereg_list, next) { 196 if (tcemem->mem == mem) { 197 ret = -EBUSY; 198 goto put_exit; 199 } 200 } 201 } else { 202 ret = mm_iommu_new(container->mm, vaddr, entries, &mem); 203 if (ret) 204 return ret; 205 } 206 207 tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL); 208 if (!tcemem) { 209 ret = -ENOMEM; 210 goto put_exit; 211 } 212 213 tcemem->mem = mem; 214 list_add(&tcemem->next, &container->prereg_list); 215 216 container->enabled = true; 217 218 return 0; 219 220 put_exit: 221 mm_iommu_put(container->mm, mem); 222 return ret; 223 } 224 225 static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, 226 unsigned int page_shift) 227 { 228 struct page *page; 229 unsigned long size = 0; 230 231 if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) 232 return size == (1UL << page_shift); 233 234 page = pfn_to_page(hpa >> PAGE_SHIFT); 235 /* 236 * Check that the TCE table granularity is not bigger than the size of 237 * a page we just found. Otherwise the hardware can get access to 238 * a bigger memory chunk that it should. 239 */ 240 return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; 241 } 242 243 static inline bool tce_groups_attached(struct tce_container *container) 244 { 245 return !list_empty(&container->group_list); 246 } 247 248 static long tce_iommu_find_table(struct tce_container *container, 249 phys_addr_t ioba, struct iommu_table **ptbl) 250 { 251 long i; 252 253 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 254 struct iommu_table *tbl = container->tables[i]; 255 256 if (tbl) { 257 unsigned long entry = ioba >> tbl->it_page_shift; 258 unsigned long start = tbl->it_offset; 259 unsigned long end = start + tbl->it_size; 260 261 if ((start <= entry) && (entry < end)) { 262 *ptbl = tbl; 263 return i; 264 } 265 } 266 } 267 268 return -1; 269 } 270 271 static int tce_iommu_find_free_table(struct tce_container *container) 272 { 273 int i; 274 275 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 276 if (!container->tables[i]) 277 return i; 278 } 279 280 return -ENOSPC; 281 } 282 283 static int tce_iommu_enable(struct tce_container *container) 284 { 285 int ret = 0; 286 unsigned long locked; 287 struct iommu_table_group *table_group; 288 struct tce_iommu_group *tcegrp; 289 290 if (container->enabled) 291 return -EBUSY; 292 293 /* 294 * When userspace pages are mapped into the IOMMU, they are effectively 295 * locked memory, so, theoretically, we need to update the accounting 296 * of locked pages on each map and unmap. For powerpc, the map unmap 297 * paths can be very hot, though, and the accounting would kill 298 * performance, especially since it would be difficult to impossible 299 * to handle the accounting in real mode only. 300 * 301 * To address that, rather than precisely accounting every page, we 302 * instead account for a worst case on locked memory when the iommu is 303 * enabled and disabled. The worst case upper bound on locked memory 304 * is the size of the whole iommu window, which is usually relatively 305 * small (compared to total memory sizes) on POWER hardware. 306 * 307 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, 308 * that would effectively kill the guest at random points, much better 309 * enforcing the limit based on the max that the guest can map. 310 * 311 * Unfortunately at the moment it counts whole tables, no matter how 312 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups 313 * each with 2GB DMA window, 8GB will be counted here. The reason for 314 * this is that we cannot tell here the amount of RAM used by the guest 315 * as this information is only available from KVM and VFIO is 316 * KVM agnostic. 317 * 318 * So we do not allow enabling a container without a group attached 319 * as there is no way to know how much we should increment 320 * the locked_vm counter. 321 */ 322 if (!tce_groups_attached(container)) 323 return -ENODEV; 324 325 tcegrp = list_first_entry(&container->group_list, 326 struct tce_iommu_group, next); 327 table_group = iommu_group_get_iommudata(tcegrp->grp); 328 if (!table_group) 329 return -ENODEV; 330 331 if (!table_group->tce32_size) 332 return -EPERM; 333 334 ret = tce_iommu_mm_set(container); 335 if (ret) 336 return ret; 337 338 locked = table_group->tce32_size >> PAGE_SHIFT; 339 ret = try_increment_locked_vm(container->mm, locked); 340 if (ret) 341 return ret; 342 343 container->locked_pages = locked; 344 345 container->enabled = true; 346 347 return ret; 348 } 349 350 static void tce_iommu_disable(struct tce_container *container) 351 { 352 if (!container->enabled) 353 return; 354 355 container->enabled = false; 356 357 BUG_ON(!container->mm); 358 decrement_locked_vm(container->mm, container->locked_pages); 359 } 360 361 static void *tce_iommu_open(unsigned long arg) 362 { 363 struct tce_container *container; 364 365 if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) { 366 pr_err("tce_vfio: Wrong IOMMU type\n"); 367 return ERR_PTR(-EINVAL); 368 } 369 370 container = kzalloc(sizeof(*container), GFP_KERNEL); 371 if (!container) 372 return ERR_PTR(-ENOMEM); 373 374 mutex_init(&container->lock); 375 INIT_LIST_HEAD_RCU(&container->group_list); 376 INIT_LIST_HEAD_RCU(&container->prereg_list); 377 378 container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU; 379 380 return container; 381 } 382 383 static int tce_iommu_clear(struct tce_container *container, 384 struct iommu_table *tbl, 385 unsigned long entry, unsigned long pages); 386 static void tce_iommu_free_table(struct tce_container *container, 387 struct iommu_table *tbl); 388 389 static void tce_iommu_release(void *iommu_data) 390 { 391 struct tce_container *container = iommu_data; 392 struct tce_iommu_group *tcegrp; 393 struct tce_iommu_prereg *tcemem, *tmtmp; 394 long i; 395 396 while (tce_groups_attached(container)) { 397 tcegrp = list_first_entry(&container->group_list, 398 struct tce_iommu_group, next); 399 tce_iommu_detach_group(iommu_data, tcegrp->grp); 400 } 401 402 /* 403 * If VFIO created a table, it was not disposed 404 * by tce_iommu_detach_group() so do it now. 405 */ 406 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 407 struct iommu_table *tbl = container->tables[i]; 408 409 if (!tbl) 410 continue; 411 412 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); 413 tce_iommu_free_table(container, tbl); 414 } 415 416 list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next) 417 WARN_ON(tce_iommu_prereg_free(container, tcemem)); 418 419 tce_iommu_disable(container); 420 if (container->mm) 421 mmdrop(container->mm); 422 mutex_destroy(&container->lock); 423 424 kfree(container); 425 } 426 427 static void tce_iommu_unuse_page(struct tce_container *container, 428 unsigned long hpa) 429 { 430 struct page *page; 431 432 page = pfn_to_page(hpa >> PAGE_SHIFT); 433 put_page(page); 434 } 435 436 static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container, 437 unsigned long tce, unsigned long shift, 438 unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem) 439 { 440 long ret = 0; 441 struct mm_iommu_table_group_mem_t *mem; 442 443 mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift); 444 if (!mem) 445 return -EINVAL; 446 447 ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa); 448 if (ret) 449 return -EINVAL; 450 451 *pmem = mem; 452 453 return 0; 454 } 455 456 static void tce_iommu_unuse_page_v2(struct tce_container *container, 457 struct iommu_table *tbl, unsigned long entry) 458 { 459 struct mm_iommu_table_group_mem_t *mem = NULL; 460 int ret; 461 unsigned long hpa = 0; 462 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); 463 464 if (!pua) 465 return; 466 467 ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua), 468 tbl->it_page_shift, &hpa, &mem); 469 if (ret) 470 pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n", 471 __func__, be64_to_cpu(*pua), entry, ret); 472 if (mem) 473 mm_iommu_mapped_dec(mem); 474 475 *pua = cpu_to_be64(0); 476 } 477 478 static int tce_iommu_clear(struct tce_container *container, 479 struct iommu_table *tbl, 480 unsigned long entry, unsigned long pages) 481 { 482 unsigned long oldhpa; 483 long ret; 484 enum dma_data_direction direction; 485 unsigned long lastentry = entry + pages; 486 487 for ( ; entry < lastentry; ++entry) { 488 if (tbl->it_indirect_levels && tbl->it_userspace) { 489 /* 490 * For multilevel tables, we can take a shortcut here 491 * and skip some TCEs as we know that the userspace 492 * addresses cache is a mirror of the real TCE table 493 * and if it is missing some indirect levels, then 494 * the hardware table does not have them allocated 495 * either and therefore does not require updating. 496 */ 497 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, 498 entry); 499 if (!pua) { 500 /* align to level_size which is power of two */ 501 entry |= tbl->it_level_size - 1; 502 continue; 503 } 504 } 505 506 cond_resched(); 507 508 direction = DMA_NONE; 509 oldhpa = 0; 510 ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa, 511 &direction); 512 if (ret) 513 continue; 514 515 if (direction == DMA_NONE) 516 continue; 517 518 if (container->v2) { 519 tce_iommu_unuse_page_v2(container, tbl, entry); 520 continue; 521 } 522 523 tce_iommu_unuse_page(container, oldhpa); 524 } 525 526 return 0; 527 } 528 529 static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) 530 { 531 struct page *page = NULL; 532 enum dma_data_direction direction = iommu_tce_direction(tce); 533 534 if (get_user_pages_fast(tce & PAGE_MASK, 1, 535 direction != DMA_TO_DEVICE ? FOLL_WRITE : 0, 536 &page) != 1) 537 return -EFAULT; 538 539 *hpa = __pa((unsigned long) page_address(page)); 540 541 return 0; 542 } 543 544 static long tce_iommu_build(struct tce_container *container, 545 struct iommu_table *tbl, 546 unsigned long entry, unsigned long tce, unsigned long pages, 547 enum dma_data_direction direction) 548 { 549 long i, ret = 0; 550 unsigned long hpa; 551 enum dma_data_direction dirtmp; 552 553 for (i = 0; i < pages; ++i) { 554 unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; 555 556 ret = tce_iommu_use_page(tce, &hpa); 557 if (ret) 558 break; 559 560 if (!tce_page_is_contained(container->mm, hpa, 561 tbl->it_page_shift)) { 562 ret = -EPERM; 563 break; 564 } 565 566 hpa |= offset; 567 dirtmp = direction; 568 ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, 569 &dirtmp); 570 if (ret) { 571 tce_iommu_unuse_page(container, hpa); 572 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", 573 __func__, entry << tbl->it_page_shift, 574 tce, ret); 575 break; 576 } 577 578 if (dirtmp != DMA_NONE) 579 tce_iommu_unuse_page(container, hpa); 580 581 tce += IOMMU_PAGE_SIZE(tbl); 582 } 583 584 if (ret) 585 tce_iommu_clear(container, tbl, entry, i); 586 587 return ret; 588 } 589 590 static long tce_iommu_build_v2(struct tce_container *container, 591 struct iommu_table *tbl, 592 unsigned long entry, unsigned long tce, unsigned long pages, 593 enum dma_data_direction direction) 594 { 595 long i, ret = 0; 596 unsigned long hpa; 597 enum dma_data_direction dirtmp; 598 599 for (i = 0; i < pages; ++i) { 600 struct mm_iommu_table_group_mem_t *mem = NULL; 601 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i); 602 603 ret = tce_iommu_prereg_ua_to_hpa(container, 604 tce, tbl->it_page_shift, &hpa, &mem); 605 if (ret) 606 break; 607 608 if (!tce_page_is_contained(container->mm, hpa, 609 tbl->it_page_shift)) { 610 ret = -EPERM; 611 break; 612 } 613 614 /* Preserve offset within IOMMU page */ 615 hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; 616 dirtmp = direction; 617 618 /* The registered region is being unregistered */ 619 if (mm_iommu_mapped_inc(mem)) 620 break; 621 622 ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, 623 &dirtmp); 624 if (ret) { 625 /* dirtmp cannot be DMA_NONE here */ 626 tce_iommu_unuse_page_v2(container, tbl, entry + i); 627 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", 628 __func__, entry << tbl->it_page_shift, 629 tce, ret); 630 break; 631 } 632 633 if (dirtmp != DMA_NONE) 634 tce_iommu_unuse_page_v2(container, tbl, entry + i); 635 636 *pua = cpu_to_be64(tce); 637 638 tce += IOMMU_PAGE_SIZE(tbl); 639 } 640 641 if (ret) 642 tce_iommu_clear(container, tbl, entry, i); 643 644 return ret; 645 } 646 647 static long tce_iommu_create_table(struct tce_container *container, 648 struct iommu_table_group *table_group, 649 int num, 650 __u32 page_shift, 651 __u64 window_size, 652 __u32 levels, 653 struct iommu_table **ptbl) 654 { 655 long ret, table_size; 656 657 table_size = table_group->ops->get_table_size(page_shift, window_size, 658 levels); 659 if (!table_size) 660 return -EINVAL; 661 662 ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT); 663 if (ret) 664 return ret; 665 666 ret = table_group->ops->create_table(table_group, num, 667 page_shift, window_size, levels, ptbl); 668 669 WARN_ON(!ret && !(*ptbl)->it_ops->free); 670 WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size)); 671 672 return ret; 673 } 674 675 static void tce_iommu_free_table(struct tce_container *container, 676 struct iommu_table *tbl) 677 { 678 unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; 679 680 iommu_tce_table_put(tbl); 681 decrement_locked_vm(container->mm, pages); 682 } 683 684 static long tce_iommu_create_window(struct tce_container *container, 685 __u32 page_shift, __u64 window_size, __u32 levels, 686 __u64 *start_addr) 687 { 688 struct tce_iommu_group *tcegrp; 689 struct iommu_table_group *table_group; 690 struct iommu_table *tbl = NULL; 691 long ret, num; 692 693 num = tce_iommu_find_free_table(container); 694 if (num < 0) 695 return num; 696 697 /* Get the first group for ops::create_table */ 698 tcegrp = list_first_entry(&container->group_list, 699 struct tce_iommu_group, next); 700 table_group = iommu_group_get_iommudata(tcegrp->grp); 701 if (!table_group) 702 return -EFAULT; 703 704 if (!(table_group->pgsizes & (1ULL << page_shift))) 705 return -EINVAL; 706 707 if (!table_group->ops->set_window || !table_group->ops->unset_window || 708 !table_group->ops->get_table_size || 709 !table_group->ops->create_table) 710 return -EPERM; 711 712 /* Create TCE table */ 713 ret = tce_iommu_create_table(container, table_group, num, 714 page_shift, window_size, levels, &tbl); 715 if (ret) 716 return ret; 717 718 BUG_ON(!tbl->it_ops->free); 719 720 /* 721 * Program the table to every group. 722 * Groups have been tested for compatibility at the attach time. 723 */ 724 list_for_each_entry(tcegrp, &container->group_list, next) { 725 table_group = iommu_group_get_iommudata(tcegrp->grp); 726 727 ret = table_group->ops->set_window(table_group, num, tbl); 728 if (ret) 729 goto unset_exit; 730 } 731 732 container->tables[num] = tbl; 733 734 /* Return start address assigned by platform in create_table() */ 735 *start_addr = tbl->it_offset << tbl->it_page_shift; 736 737 return 0; 738 739 unset_exit: 740 list_for_each_entry(tcegrp, &container->group_list, next) { 741 table_group = iommu_group_get_iommudata(tcegrp->grp); 742 table_group->ops->unset_window(table_group, num); 743 } 744 tce_iommu_free_table(container, tbl); 745 746 return ret; 747 } 748 749 static long tce_iommu_remove_window(struct tce_container *container, 750 __u64 start_addr) 751 { 752 struct iommu_table_group *table_group = NULL; 753 struct iommu_table *tbl; 754 struct tce_iommu_group *tcegrp; 755 int num; 756 757 num = tce_iommu_find_table(container, start_addr, &tbl); 758 if (num < 0) 759 return -EINVAL; 760 761 BUG_ON(!tbl->it_size); 762 763 /* Detach groups from IOMMUs */ 764 list_for_each_entry(tcegrp, &container->group_list, next) { 765 table_group = iommu_group_get_iommudata(tcegrp->grp); 766 767 /* 768 * SPAPR TCE IOMMU exposes the default DMA window to 769 * the guest via dma32_window_start/size of 770 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow 771 * the userspace to remove this window, some do not so 772 * here we check for the platform capability. 773 */ 774 if (!table_group->ops || !table_group->ops->unset_window) 775 return -EPERM; 776 777 table_group->ops->unset_window(table_group, num); 778 } 779 780 /* Free table */ 781 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); 782 tce_iommu_free_table(container, tbl); 783 container->tables[num] = NULL; 784 785 return 0; 786 } 787 788 static long tce_iommu_create_default_window(struct tce_container *container) 789 { 790 long ret; 791 __u64 start_addr = 0; 792 struct tce_iommu_group *tcegrp; 793 struct iommu_table_group *table_group; 794 795 if (!container->def_window_pending) 796 return 0; 797 798 if (!tce_groups_attached(container)) 799 return -ENODEV; 800 801 tcegrp = list_first_entry(&container->group_list, 802 struct tce_iommu_group, next); 803 table_group = iommu_group_get_iommudata(tcegrp->grp); 804 if (!table_group) 805 return -ENODEV; 806 807 ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K, 808 table_group->tce32_size, 1, &start_addr); 809 WARN_ON_ONCE(!ret && start_addr); 810 811 if (!ret) 812 container->def_window_pending = false; 813 814 return ret; 815 } 816 817 static long tce_iommu_ioctl(void *iommu_data, 818 unsigned int cmd, unsigned long arg) 819 { 820 struct tce_container *container = iommu_data; 821 unsigned long minsz, ddwsz; 822 long ret; 823 824 switch (cmd) { 825 case VFIO_CHECK_EXTENSION: 826 switch (arg) { 827 case VFIO_SPAPR_TCE_IOMMU: 828 case VFIO_SPAPR_TCE_v2_IOMMU: 829 ret = 1; 830 break; 831 default: 832 ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); 833 break; 834 } 835 836 return (ret < 0) ? 0 : ret; 837 } 838 839 /* 840 * Sanity check to prevent one userspace from manipulating 841 * another userspace mm. 842 */ 843 BUG_ON(!container); 844 if (container->mm && container->mm != current->mm) 845 return -EPERM; 846 847 switch (cmd) { 848 case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { 849 struct vfio_iommu_spapr_tce_info info; 850 struct tce_iommu_group *tcegrp; 851 struct iommu_table_group *table_group; 852 853 if (!tce_groups_attached(container)) 854 return -ENXIO; 855 856 tcegrp = list_first_entry(&container->group_list, 857 struct tce_iommu_group, next); 858 table_group = iommu_group_get_iommudata(tcegrp->grp); 859 860 if (!table_group) 861 return -ENXIO; 862 863 minsz = offsetofend(struct vfio_iommu_spapr_tce_info, 864 dma32_window_size); 865 866 if (copy_from_user(&info, (void __user *)arg, minsz)) 867 return -EFAULT; 868 869 if (info.argsz < minsz) 870 return -EINVAL; 871 872 info.dma32_window_start = table_group->tce32_start; 873 info.dma32_window_size = table_group->tce32_size; 874 info.flags = 0; 875 memset(&info.ddw, 0, sizeof(info.ddw)); 876 877 if (table_group->max_dynamic_windows_supported && 878 container->v2) { 879 info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW; 880 info.ddw.pgsizes = table_group->pgsizes; 881 info.ddw.max_dynamic_windows_supported = 882 table_group->max_dynamic_windows_supported; 883 info.ddw.levels = table_group->max_levels; 884 } 885 886 ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw); 887 888 if (info.argsz >= ddwsz) 889 minsz = ddwsz; 890 891 if (copy_to_user((void __user *)arg, &info, minsz)) 892 return -EFAULT; 893 894 return 0; 895 } 896 case VFIO_IOMMU_MAP_DMA: { 897 struct vfio_iommu_type1_dma_map param; 898 struct iommu_table *tbl = NULL; 899 long num; 900 enum dma_data_direction direction; 901 902 if (!container->enabled) 903 return -EPERM; 904 905 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); 906 907 if (copy_from_user(¶m, (void __user *)arg, minsz)) 908 return -EFAULT; 909 910 if (param.argsz < minsz) 911 return -EINVAL; 912 913 if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ | 914 VFIO_DMA_MAP_FLAG_WRITE)) 915 return -EINVAL; 916 917 ret = tce_iommu_create_default_window(container); 918 if (ret) 919 return ret; 920 921 num = tce_iommu_find_table(container, param.iova, &tbl); 922 if (num < 0) 923 return -ENXIO; 924 925 if ((param.size & ~IOMMU_PAGE_MASK(tbl)) || 926 (param.vaddr & ~IOMMU_PAGE_MASK(tbl))) 927 return -EINVAL; 928 929 /* iova is checked by the IOMMU API */ 930 if (param.flags & VFIO_DMA_MAP_FLAG_READ) { 931 if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) 932 direction = DMA_BIDIRECTIONAL; 933 else 934 direction = DMA_TO_DEVICE; 935 } else { 936 if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) 937 direction = DMA_FROM_DEVICE; 938 else 939 return -EINVAL; 940 } 941 942 ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr); 943 if (ret) 944 return ret; 945 946 if (container->v2) 947 ret = tce_iommu_build_v2(container, tbl, 948 param.iova >> tbl->it_page_shift, 949 param.vaddr, 950 param.size >> tbl->it_page_shift, 951 direction); 952 else 953 ret = tce_iommu_build(container, tbl, 954 param.iova >> tbl->it_page_shift, 955 param.vaddr, 956 param.size >> tbl->it_page_shift, 957 direction); 958 959 iommu_flush_tce(tbl); 960 961 return ret; 962 } 963 case VFIO_IOMMU_UNMAP_DMA: { 964 struct vfio_iommu_type1_dma_unmap param; 965 struct iommu_table *tbl = NULL; 966 long num; 967 968 if (!container->enabled) 969 return -EPERM; 970 971 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, 972 size); 973 974 if (copy_from_user(¶m, (void __user *)arg, minsz)) 975 return -EFAULT; 976 977 if (param.argsz < minsz) 978 return -EINVAL; 979 980 /* No flag is supported now */ 981 if (param.flags) 982 return -EINVAL; 983 984 ret = tce_iommu_create_default_window(container); 985 if (ret) 986 return ret; 987 988 num = tce_iommu_find_table(container, param.iova, &tbl); 989 if (num < 0) 990 return -ENXIO; 991 992 if (param.size & ~IOMMU_PAGE_MASK(tbl)) 993 return -EINVAL; 994 995 ret = iommu_tce_clear_param_check(tbl, param.iova, 0, 996 param.size >> tbl->it_page_shift); 997 if (ret) 998 return ret; 999 1000 ret = tce_iommu_clear(container, tbl, 1001 param.iova >> tbl->it_page_shift, 1002 param.size >> tbl->it_page_shift); 1003 iommu_flush_tce(tbl); 1004 1005 return ret; 1006 } 1007 case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: { 1008 struct vfio_iommu_spapr_register_memory param; 1009 1010 if (!container->v2) 1011 break; 1012 1013 minsz = offsetofend(struct vfio_iommu_spapr_register_memory, 1014 size); 1015 1016 ret = tce_iommu_mm_set(container); 1017 if (ret) 1018 return ret; 1019 1020 if (copy_from_user(¶m, (void __user *)arg, minsz)) 1021 return -EFAULT; 1022 1023 if (param.argsz < minsz) 1024 return -EINVAL; 1025 1026 /* No flag is supported now */ 1027 if (param.flags) 1028 return -EINVAL; 1029 1030 mutex_lock(&container->lock); 1031 ret = tce_iommu_register_pages(container, param.vaddr, 1032 param.size); 1033 mutex_unlock(&container->lock); 1034 1035 return ret; 1036 } 1037 case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: { 1038 struct vfio_iommu_spapr_register_memory param; 1039 1040 if (!container->v2) 1041 break; 1042 1043 if (!container->mm) 1044 return -EPERM; 1045 1046 minsz = offsetofend(struct vfio_iommu_spapr_register_memory, 1047 size); 1048 1049 if (copy_from_user(¶m, (void __user *)arg, minsz)) 1050 return -EFAULT; 1051 1052 if (param.argsz < minsz) 1053 return -EINVAL; 1054 1055 /* No flag is supported now */ 1056 if (param.flags) 1057 return -EINVAL; 1058 1059 mutex_lock(&container->lock); 1060 ret = tce_iommu_unregister_pages(container, param.vaddr, 1061 param.size); 1062 mutex_unlock(&container->lock); 1063 1064 return ret; 1065 } 1066 case VFIO_IOMMU_ENABLE: 1067 if (container->v2) 1068 break; 1069 1070 mutex_lock(&container->lock); 1071 ret = tce_iommu_enable(container); 1072 mutex_unlock(&container->lock); 1073 return ret; 1074 1075 1076 case VFIO_IOMMU_DISABLE: 1077 if (container->v2) 1078 break; 1079 1080 mutex_lock(&container->lock); 1081 tce_iommu_disable(container); 1082 mutex_unlock(&container->lock); 1083 return 0; 1084 1085 case VFIO_EEH_PE_OP: { 1086 struct tce_iommu_group *tcegrp; 1087 1088 ret = 0; 1089 list_for_each_entry(tcegrp, &container->group_list, next) { 1090 ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, 1091 cmd, arg); 1092 if (ret) 1093 return ret; 1094 } 1095 return ret; 1096 } 1097 1098 case VFIO_IOMMU_SPAPR_TCE_CREATE: { 1099 struct vfio_iommu_spapr_tce_create create; 1100 1101 if (!container->v2) 1102 break; 1103 1104 ret = tce_iommu_mm_set(container); 1105 if (ret) 1106 return ret; 1107 1108 if (!tce_groups_attached(container)) 1109 return -ENXIO; 1110 1111 minsz = offsetofend(struct vfio_iommu_spapr_tce_create, 1112 start_addr); 1113 1114 if (copy_from_user(&create, (void __user *)arg, minsz)) 1115 return -EFAULT; 1116 1117 if (create.argsz < minsz) 1118 return -EINVAL; 1119 1120 if (create.flags) 1121 return -EINVAL; 1122 1123 mutex_lock(&container->lock); 1124 1125 ret = tce_iommu_create_default_window(container); 1126 if (!ret) 1127 ret = tce_iommu_create_window(container, 1128 create.page_shift, 1129 create.window_size, create.levels, 1130 &create.start_addr); 1131 1132 mutex_unlock(&container->lock); 1133 1134 if (!ret && copy_to_user((void __user *)arg, &create, minsz)) 1135 ret = -EFAULT; 1136 1137 return ret; 1138 } 1139 case VFIO_IOMMU_SPAPR_TCE_REMOVE: { 1140 struct vfio_iommu_spapr_tce_remove remove; 1141 1142 if (!container->v2) 1143 break; 1144 1145 ret = tce_iommu_mm_set(container); 1146 if (ret) 1147 return ret; 1148 1149 if (!tce_groups_attached(container)) 1150 return -ENXIO; 1151 1152 minsz = offsetofend(struct vfio_iommu_spapr_tce_remove, 1153 start_addr); 1154 1155 if (copy_from_user(&remove, (void __user *)arg, minsz)) 1156 return -EFAULT; 1157 1158 if (remove.argsz < minsz) 1159 return -EINVAL; 1160 1161 if (remove.flags) 1162 return -EINVAL; 1163 1164 if (container->def_window_pending && !remove.start_addr) { 1165 container->def_window_pending = false; 1166 return 0; 1167 } 1168 1169 mutex_lock(&container->lock); 1170 1171 ret = tce_iommu_remove_window(container, remove.start_addr); 1172 1173 mutex_unlock(&container->lock); 1174 1175 return ret; 1176 } 1177 } 1178 1179 return -ENOTTY; 1180 } 1181 1182 static void tce_iommu_release_ownership(struct tce_container *container, 1183 struct iommu_table_group *table_group) 1184 { 1185 int i; 1186 1187 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 1188 struct iommu_table *tbl = container->tables[i]; 1189 1190 if (!tbl) 1191 continue; 1192 1193 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); 1194 if (tbl->it_map) 1195 iommu_release_ownership(tbl); 1196 1197 container->tables[i] = NULL; 1198 } 1199 } 1200 1201 static int tce_iommu_take_ownership(struct tce_container *container, 1202 struct iommu_table_group *table_group) 1203 { 1204 int i, j, rc = 0; 1205 1206 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 1207 struct iommu_table *tbl = table_group->tables[i]; 1208 1209 if (!tbl || !tbl->it_map) 1210 continue; 1211 1212 rc = iommu_take_ownership(tbl); 1213 if (rc) { 1214 for (j = 0; j < i; ++j) 1215 iommu_release_ownership( 1216 table_group->tables[j]); 1217 1218 return rc; 1219 } 1220 } 1221 1222 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1223 container->tables[i] = table_group->tables[i]; 1224 1225 return 0; 1226 } 1227 1228 static void tce_iommu_release_ownership_ddw(struct tce_container *container, 1229 struct iommu_table_group *table_group) 1230 { 1231 long i; 1232 1233 if (!table_group->ops->unset_window) { 1234 WARN_ON_ONCE(1); 1235 return; 1236 } 1237 1238 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1239 if (container->tables[i]) 1240 table_group->ops->unset_window(table_group, i); 1241 1242 table_group->ops->release_ownership(table_group); 1243 } 1244 1245 static long tce_iommu_take_ownership_ddw(struct tce_container *container, 1246 struct iommu_table_group *table_group) 1247 { 1248 long i, ret = 0; 1249 1250 if (!table_group->ops->create_table || !table_group->ops->set_window || 1251 !table_group->ops->release_ownership) { 1252 WARN_ON_ONCE(1); 1253 return -EFAULT; 1254 } 1255 1256 table_group->ops->take_ownership(table_group); 1257 1258 /* Set all windows to the new group */ 1259 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 1260 struct iommu_table *tbl = container->tables[i]; 1261 1262 if (!tbl) 1263 continue; 1264 1265 ret = table_group->ops->set_window(table_group, i, tbl); 1266 if (ret) 1267 goto release_exit; 1268 } 1269 1270 return 0; 1271 1272 release_exit: 1273 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1274 table_group->ops->unset_window(table_group, i); 1275 1276 table_group->ops->release_ownership(table_group); 1277 1278 return ret; 1279 } 1280 1281 static int tce_iommu_attach_group(void *iommu_data, 1282 struct iommu_group *iommu_group) 1283 { 1284 int ret; 1285 struct tce_container *container = iommu_data; 1286 struct iommu_table_group *table_group; 1287 struct tce_iommu_group *tcegrp = NULL; 1288 1289 mutex_lock(&container->lock); 1290 1291 /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", 1292 iommu_group_id(iommu_group), iommu_group); */ 1293 table_group = iommu_group_get_iommudata(iommu_group); 1294 if (!table_group) { 1295 ret = -ENODEV; 1296 goto unlock_exit; 1297 } 1298 1299 if (tce_groups_attached(container) && (!table_group->ops || 1300 !table_group->ops->take_ownership || 1301 !table_group->ops->release_ownership)) { 1302 ret = -EBUSY; 1303 goto unlock_exit; 1304 } 1305 1306 /* Check if new group has the same iommu_ops (i.e. compatible) */ 1307 list_for_each_entry(tcegrp, &container->group_list, next) { 1308 struct iommu_table_group *table_group_tmp; 1309 1310 if (tcegrp->grp == iommu_group) { 1311 pr_warn("tce_vfio: Group %d is already attached\n", 1312 iommu_group_id(iommu_group)); 1313 ret = -EBUSY; 1314 goto unlock_exit; 1315 } 1316 table_group_tmp = iommu_group_get_iommudata(tcegrp->grp); 1317 if (table_group_tmp->ops->create_table != 1318 table_group->ops->create_table) { 1319 pr_warn("tce_vfio: Group %d is incompatible with group %d\n", 1320 iommu_group_id(iommu_group), 1321 iommu_group_id(tcegrp->grp)); 1322 ret = -EPERM; 1323 goto unlock_exit; 1324 } 1325 } 1326 1327 tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL); 1328 if (!tcegrp) { 1329 ret = -ENOMEM; 1330 goto unlock_exit; 1331 } 1332 1333 if (!table_group->ops || !table_group->ops->take_ownership || 1334 !table_group->ops->release_ownership) { 1335 if (container->v2) { 1336 ret = -EPERM; 1337 goto unlock_exit; 1338 } 1339 ret = tce_iommu_take_ownership(container, table_group); 1340 } else { 1341 if (!container->v2) { 1342 ret = -EPERM; 1343 goto unlock_exit; 1344 } 1345 ret = tce_iommu_take_ownership_ddw(container, table_group); 1346 if (!tce_groups_attached(container) && !container->tables[0]) 1347 container->def_window_pending = true; 1348 } 1349 1350 if (!ret) { 1351 tcegrp->grp = iommu_group; 1352 list_add(&tcegrp->next, &container->group_list); 1353 } 1354 1355 unlock_exit: 1356 if (ret && tcegrp) 1357 kfree(tcegrp); 1358 1359 mutex_unlock(&container->lock); 1360 1361 return ret; 1362 } 1363 1364 static void tce_iommu_detach_group(void *iommu_data, 1365 struct iommu_group *iommu_group) 1366 { 1367 struct tce_container *container = iommu_data; 1368 struct iommu_table_group *table_group; 1369 bool found = false; 1370 struct tce_iommu_group *tcegrp; 1371 1372 mutex_lock(&container->lock); 1373 1374 list_for_each_entry(tcegrp, &container->group_list, next) { 1375 if (tcegrp->grp == iommu_group) { 1376 found = true; 1377 break; 1378 } 1379 } 1380 1381 if (!found) { 1382 pr_warn("tce_vfio: detaching unattached group #%u\n", 1383 iommu_group_id(iommu_group)); 1384 goto unlock_exit; 1385 } 1386 1387 list_del(&tcegrp->next); 1388 kfree(tcegrp); 1389 1390 table_group = iommu_group_get_iommudata(iommu_group); 1391 BUG_ON(!table_group); 1392 1393 if (!table_group->ops || !table_group->ops->release_ownership) 1394 tce_iommu_release_ownership(container, table_group); 1395 else 1396 tce_iommu_release_ownership_ddw(container, table_group); 1397 1398 unlock_exit: 1399 mutex_unlock(&container->lock); 1400 } 1401 1402 static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = { 1403 .name = "iommu-vfio-powerpc", 1404 .owner = THIS_MODULE, 1405 .open = tce_iommu_open, 1406 .release = tce_iommu_release, 1407 .ioctl = tce_iommu_ioctl, 1408 .attach_group = tce_iommu_attach_group, 1409 .detach_group = tce_iommu_detach_group, 1410 }; 1411 1412 static int __init tce_iommu_init(void) 1413 { 1414 return vfio_register_iommu_driver(&tce_iommu_driver_ops); 1415 } 1416 1417 static void __exit tce_iommu_cleanup(void) 1418 { 1419 vfio_unregister_iommu_driver(&tce_iommu_driver_ops); 1420 } 1421 1422 module_init(tce_iommu_init); 1423 module_exit(tce_iommu_cleanup); 1424 1425 MODULE_VERSION(DRIVER_VERSION); 1426 MODULE_LICENSE("GPL v2"); 1427 MODULE_AUTHOR(DRIVER_AUTHOR); 1428 MODULE_DESCRIPTION(DRIVER_DESC); 1429 1430