1 /* 2 * VFIO: IOMMU DMA mapping support for TCE on POWER 3 * 4 * Copyright (C) 2013 IBM Corp. All rights reserved. 5 * Author: Alexey Kardashevskiy <aik@ozlabs.ru> 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License version 2 as 9 * published by the Free Software Foundation. 10 * 11 * Derived from original vfio_iommu_type1.c: 12 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 13 * Author: Alex Williamson <alex.williamson@redhat.com> 14 */ 15 16 #include <linux/module.h> 17 #include <linux/pci.h> 18 #include <linux/slab.h> 19 #include <linux/uaccess.h> 20 #include <linux/err.h> 21 #include <linux/vfio.h> 22 #include <linux/vmalloc.h> 23 #include <linux/sched/mm.h> 24 #include <linux/sched/signal.h> 25 26 #include <asm/iommu.h> 27 #include <asm/tce.h> 28 #include <asm/mmu_context.h> 29 30 #define DRIVER_VERSION "0.1" 31 #define DRIVER_AUTHOR "aik@ozlabs.ru" 32 #define DRIVER_DESC "VFIO IOMMU SPAPR TCE" 33 34 static void tce_iommu_detach_group(void *iommu_data, 35 struct iommu_group *iommu_group); 36 37 static long try_increment_locked_vm(struct mm_struct *mm, long npages) 38 { 39 long ret = 0, locked, lock_limit; 40 41 if (WARN_ON_ONCE(!mm)) 42 return -EPERM; 43 44 if (!npages) 45 return 0; 46 47 down_write(&mm->mmap_sem); 48 locked = mm->locked_vm + npages; 49 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 50 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 51 ret = -ENOMEM; 52 else 53 mm->locked_vm += npages; 54 55 pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, 56 npages << PAGE_SHIFT, 57 mm->locked_vm << PAGE_SHIFT, 58 rlimit(RLIMIT_MEMLOCK), 59 ret ? " - exceeded" : ""); 60 61 up_write(&mm->mmap_sem); 62 63 return ret; 64 } 65 66 static void decrement_locked_vm(struct mm_struct *mm, long npages) 67 { 68 if (!mm || !npages) 69 return; 70 71 down_write(&mm->mmap_sem); 72 if (WARN_ON_ONCE(npages > mm->locked_vm)) 73 npages = mm->locked_vm; 74 mm->locked_vm -= npages; 75 pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, 76 npages << PAGE_SHIFT, 77 mm->locked_vm << PAGE_SHIFT, 78 rlimit(RLIMIT_MEMLOCK)); 79 up_write(&mm->mmap_sem); 80 } 81 82 /* 83 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation 84 * 85 * This code handles mapping and unmapping of user data buffers 86 * into DMA'ble space using the IOMMU 87 */ 88 89 struct tce_iommu_group { 90 struct list_head next; 91 struct iommu_group *grp; 92 }; 93 94 /* 95 * A container needs to remember which preregistered region it has 96 * referenced to do proper cleanup at the userspace process exit. 97 */ 98 struct tce_iommu_prereg { 99 struct list_head next; 100 struct mm_iommu_table_group_mem_t *mem; 101 }; 102 103 /* 104 * The container descriptor supports only a single group per container. 105 * Required by the API as the container is not supplied with the IOMMU group 106 * at the moment of initialization. 107 */ 108 struct tce_container { 109 struct mutex lock; 110 bool enabled; 111 bool v2; 112 bool def_window_pending; 113 unsigned long locked_pages; 114 struct mm_struct *mm; 115 struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; 116 struct list_head group_list; 117 struct list_head prereg_list; 118 }; 119 120 static long tce_iommu_mm_set(struct tce_container *container) 121 { 122 if (container->mm) { 123 if (container->mm == current->mm) 124 return 0; 125 return -EPERM; 126 } 127 BUG_ON(!current->mm); 128 container->mm = current->mm; 129 atomic_inc(&container->mm->mm_count); 130 131 return 0; 132 } 133 134 static long tce_iommu_prereg_free(struct tce_container *container, 135 struct tce_iommu_prereg *tcemem) 136 { 137 long ret; 138 139 ret = mm_iommu_put(container->mm, tcemem->mem); 140 if (ret) 141 return ret; 142 143 list_del(&tcemem->next); 144 kfree(tcemem); 145 146 return 0; 147 } 148 149 static long tce_iommu_unregister_pages(struct tce_container *container, 150 __u64 vaddr, __u64 size) 151 { 152 struct mm_iommu_table_group_mem_t *mem; 153 struct tce_iommu_prereg *tcemem; 154 bool found = false; 155 long ret; 156 157 if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) 158 return -EINVAL; 159 160 mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT); 161 if (!mem) 162 return -ENOENT; 163 164 list_for_each_entry(tcemem, &container->prereg_list, next) { 165 if (tcemem->mem == mem) { 166 found = true; 167 break; 168 } 169 } 170 171 if (!found) 172 ret = -ENOENT; 173 else 174 ret = tce_iommu_prereg_free(container, tcemem); 175 176 mm_iommu_put(container->mm, mem); 177 178 return ret; 179 } 180 181 static long tce_iommu_register_pages(struct tce_container *container, 182 __u64 vaddr, __u64 size) 183 { 184 long ret = 0; 185 struct mm_iommu_table_group_mem_t *mem = NULL; 186 struct tce_iommu_prereg *tcemem; 187 unsigned long entries = size >> PAGE_SHIFT; 188 189 if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || 190 ((vaddr + size) < vaddr)) 191 return -EINVAL; 192 193 mem = mm_iommu_get(container->mm, vaddr, entries); 194 if (mem) { 195 list_for_each_entry(tcemem, &container->prereg_list, next) { 196 if (tcemem->mem == mem) { 197 ret = -EBUSY; 198 goto put_exit; 199 } 200 } 201 } else { 202 ret = mm_iommu_new(container->mm, vaddr, entries, &mem); 203 if (ret) 204 return ret; 205 } 206 207 tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL); 208 if (!tcemem) { 209 ret = -ENOMEM; 210 goto put_exit; 211 } 212 213 tcemem->mem = mem; 214 list_add(&tcemem->next, &container->prereg_list); 215 216 container->enabled = true; 217 218 return 0; 219 220 put_exit: 221 mm_iommu_put(container->mm, mem); 222 return ret; 223 } 224 225 static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, 226 unsigned int page_shift) 227 { 228 struct page *page; 229 unsigned long size = 0; 230 231 if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) 232 return size == (1UL << page_shift); 233 234 page = pfn_to_page(hpa >> PAGE_SHIFT); 235 /* 236 * Check that the TCE table granularity is not bigger than the size of 237 * a page we just found. Otherwise the hardware can get access to 238 * a bigger memory chunk that it should. 239 */ 240 return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; 241 } 242 243 static inline bool tce_groups_attached(struct tce_container *container) 244 { 245 return !list_empty(&container->group_list); 246 } 247 248 static long tce_iommu_find_table(struct tce_container *container, 249 phys_addr_t ioba, struct iommu_table **ptbl) 250 { 251 long i; 252 253 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 254 struct iommu_table *tbl = container->tables[i]; 255 256 if (tbl) { 257 unsigned long entry = ioba >> tbl->it_page_shift; 258 unsigned long start = tbl->it_offset; 259 unsigned long end = start + tbl->it_size; 260 261 if ((start <= entry) && (entry < end)) { 262 *ptbl = tbl; 263 return i; 264 } 265 } 266 } 267 268 return -1; 269 } 270 271 static int tce_iommu_find_free_table(struct tce_container *container) 272 { 273 int i; 274 275 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 276 if (!container->tables[i]) 277 return i; 278 } 279 280 return -ENOSPC; 281 } 282 283 static int tce_iommu_enable(struct tce_container *container) 284 { 285 int ret = 0; 286 unsigned long locked; 287 struct iommu_table_group *table_group; 288 struct tce_iommu_group *tcegrp; 289 290 if (container->enabled) 291 return -EBUSY; 292 293 /* 294 * When userspace pages are mapped into the IOMMU, they are effectively 295 * locked memory, so, theoretically, we need to update the accounting 296 * of locked pages on each map and unmap. For powerpc, the map unmap 297 * paths can be very hot, though, and the accounting would kill 298 * performance, especially since it would be difficult to impossible 299 * to handle the accounting in real mode only. 300 * 301 * To address that, rather than precisely accounting every page, we 302 * instead account for a worst case on locked memory when the iommu is 303 * enabled and disabled. The worst case upper bound on locked memory 304 * is the size of the whole iommu window, which is usually relatively 305 * small (compared to total memory sizes) on POWER hardware. 306 * 307 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, 308 * that would effectively kill the guest at random points, much better 309 * enforcing the limit based on the max that the guest can map. 310 * 311 * Unfortunately at the moment it counts whole tables, no matter how 312 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups 313 * each with 2GB DMA window, 8GB will be counted here. The reason for 314 * this is that we cannot tell here the amount of RAM used by the guest 315 * as this information is only available from KVM and VFIO is 316 * KVM agnostic. 317 * 318 * So we do not allow enabling a container without a group attached 319 * as there is no way to know how much we should increment 320 * the locked_vm counter. 321 */ 322 if (!tce_groups_attached(container)) 323 return -ENODEV; 324 325 tcegrp = list_first_entry(&container->group_list, 326 struct tce_iommu_group, next); 327 table_group = iommu_group_get_iommudata(tcegrp->grp); 328 if (!table_group) 329 return -ENODEV; 330 331 if (!table_group->tce32_size) 332 return -EPERM; 333 334 ret = tce_iommu_mm_set(container); 335 if (ret) 336 return ret; 337 338 locked = table_group->tce32_size >> PAGE_SHIFT; 339 ret = try_increment_locked_vm(container->mm, locked); 340 if (ret) 341 return ret; 342 343 container->locked_pages = locked; 344 345 container->enabled = true; 346 347 return ret; 348 } 349 350 static void tce_iommu_disable(struct tce_container *container) 351 { 352 if (!container->enabled) 353 return; 354 355 container->enabled = false; 356 357 BUG_ON(!container->mm); 358 decrement_locked_vm(container->mm, container->locked_pages); 359 } 360 361 static void *tce_iommu_open(unsigned long arg) 362 { 363 struct tce_container *container; 364 365 if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) { 366 pr_err("tce_vfio: Wrong IOMMU type\n"); 367 return ERR_PTR(-EINVAL); 368 } 369 370 container = kzalloc(sizeof(*container), GFP_KERNEL); 371 if (!container) 372 return ERR_PTR(-ENOMEM); 373 374 mutex_init(&container->lock); 375 INIT_LIST_HEAD_RCU(&container->group_list); 376 INIT_LIST_HEAD_RCU(&container->prereg_list); 377 378 container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU; 379 380 return container; 381 } 382 383 static int tce_iommu_clear(struct tce_container *container, 384 struct iommu_table *tbl, 385 unsigned long entry, unsigned long pages); 386 static void tce_iommu_free_table(struct tce_container *container, 387 struct iommu_table *tbl); 388 389 static void tce_iommu_release(void *iommu_data) 390 { 391 struct tce_container *container = iommu_data; 392 struct tce_iommu_group *tcegrp; 393 struct tce_iommu_prereg *tcemem, *tmtmp; 394 long i; 395 396 while (tce_groups_attached(container)) { 397 tcegrp = list_first_entry(&container->group_list, 398 struct tce_iommu_group, next); 399 tce_iommu_detach_group(iommu_data, tcegrp->grp); 400 } 401 402 /* 403 * If VFIO created a table, it was not disposed 404 * by tce_iommu_detach_group() so do it now. 405 */ 406 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 407 struct iommu_table *tbl = container->tables[i]; 408 409 if (!tbl) 410 continue; 411 412 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); 413 tce_iommu_free_table(container, tbl); 414 } 415 416 list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next) 417 WARN_ON(tce_iommu_prereg_free(container, tcemem)); 418 419 tce_iommu_disable(container); 420 if (container->mm) 421 mmdrop(container->mm); 422 mutex_destroy(&container->lock); 423 424 kfree(container); 425 } 426 427 static void tce_iommu_unuse_page(struct tce_container *container, 428 unsigned long hpa) 429 { 430 struct page *page; 431 432 page = pfn_to_page(hpa >> PAGE_SHIFT); 433 put_page(page); 434 } 435 436 static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container, 437 unsigned long tce, unsigned long shift, 438 unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem) 439 { 440 long ret = 0; 441 struct mm_iommu_table_group_mem_t *mem; 442 443 mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift); 444 if (!mem) 445 return -EINVAL; 446 447 ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa); 448 if (ret) 449 return -EINVAL; 450 451 *pmem = mem; 452 453 return 0; 454 } 455 456 static void tce_iommu_unuse_page_v2(struct tce_container *container, 457 struct iommu_table *tbl, unsigned long entry) 458 { 459 struct mm_iommu_table_group_mem_t *mem = NULL; 460 int ret; 461 unsigned long hpa = 0; 462 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); 463 464 if (!pua) 465 return; 466 467 ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua), 468 tbl->it_page_shift, &hpa, &mem); 469 if (ret) 470 pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n", 471 __func__, be64_to_cpu(*pua), entry, ret); 472 if (mem) 473 mm_iommu_mapped_dec(mem); 474 475 *pua = cpu_to_be64(0); 476 } 477 478 static int tce_iommu_clear(struct tce_container *container, 479 struct iommu_table *tbl, 480 unsigned long entry, unsigned long pages) 481 { 482 unsigned long oldhpa; 483 long ret; 484 enum dma_data_direction direction; 485 unsigned long lastentry = entry + pages; 486 487 for ( ; entry < lastentry; ++entry) { 488 if (tbl->it_indirect_levels && tbl->it_userspace) { 489 /* 490 * For multilevel tables, we can take a shortcut here 491 * and skip some TCEs as we know that the userspace 492 * addresses cache is a mirror of the real TCE table 493 * and if it is missing some indirect levels, then 494 * the hardware table does not have them allocated 495 * either and therefore does not require updating. 496 */ 497 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, 498 entry); 499 if (!pua) { 500 /* align to level_size which is power of two */ 501 entry |= tbl->it_level_size - 1; 502 continue; 503 } 504 } 505 506 cond_resched(); 507 508 direction = DMA_NONE; 509 oldhpa = 0; 510 ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa, 511 &direction); 512 if (ret) 513 continue; 514 515 if (direction == DMA_NONE) 516 continue; 517 518 if (container->v2) { 519 tce_iommu_unuse_page_v2(container, tbl, entry); 520 continue; 521 } 522 523 tce_iommu_unuse_page(container, oldhpa); 524 } 525 526 return 0; 527 } 528 529 static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) 530 { 531 struct page *page = NULL; 532 enum dma_data_direction direction = iommu_tce_direction(tce); 533 534 if (get_user_pages_fast(tce & PAGE_MASK, 1, 535 direction != DMA_TO_DEVICE, &page) != 1) 536 return -EFAULT; 537 538 *hpa = __pa((unsigned long) page_address(page)); 539 540 return 0; 541 } 542 543 static long tce_iommu_build(struct tce_container *container, 544 struct iommu_table *tbl, 545 unsigned long entry, unsigned long tce, unsigned long pages, 546 enum dma_data_direction direction) 547 { 548 long i, ret = 0; 549 unsigned long hpa; 550 enum dma_data_direction dirtmp; 551 552 for (i = 0; i < pages; ++i) { 553 unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; 554 555 ret = tce_iommu_use_page(tce, &hpa); 556 if (ret) 557 break; 558 559 if (!tce_page_is_contained(container->mm, hpa, 560 tbl->it_page_shift)) { 561 ret = -EPERM; 562 break; 563 } 564 565 hpa |= offset; 566 dirtmp = direction; 567 ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, 568 &dirtmp); 569 if (ret) { 570 tce_iommu_unuse_page(container, hpa); 571 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", 572 __func__, entry << tbl->it_page_shift, 573 tce, ret); 574 break; 575 } 576 577 if (dirtmp != DMA_NONE) 578 tce_iommu_unuse_page(container, hpa); 579 580 tce += IOMMU_PAGE_SIZE(tbl); 581 } 582 583 if (ret) 584 tce_iommu_clear(container, tbl, entry, i); 585 586 return ret; 587 } 588 589 static long tce_iommu_build_v2(struct tce_container *container, 590 struct iommu_table *tbl, 591 unsigned long entry, unsigned long tce, unsigned long pages, 592 enum dma_data_direction direction) 593 { 594 long i, ret = 0; 595 unsigned long hpa; 596 enum dma_data_direction dirtmp; 597 598 for (i = 0; i < pages; ++i) { 599 struct mm_iommu_table_group_mem_t *mem = NULL; 600 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i); 601 602 ret = tce_iommu_prereg_ua_to_hpa(container, 603 tce, tbl->it_page_shift, &hpa, &mem); 604 if (ret) 605 break; 606 607 if (!tce_page_is_contained(container->mm, hpa, 608 tbl->it_page_shift)) { 609 ret = -EPERM; 610 break; 611 } 612 613 /* Preserve offset within IOMMU page */ 614 hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; 615 dirtmp = direction; 616 617 /* The registered region is being unregistered */ 618 if (mm_iommu_mapped_inc(mem)) 619 break; 620 621 ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, 622 &dirtmp); 623 if (ret) { 624 /* dirtmp cannot be DMA_NONE here */ 625 tce_iommu_unuse_page_v2(container, tbl, entry + i); 626 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", 627 __func__, entry << tbl->it_page_shift, 628 tce, ret); 629 break; 630 } 631 632 if (dirtmp != DMA_NONE) 633 tce_iommu_unuse_page_v2(container, tbl, entry + i); 634 635 *pua = cpu_to_be64(tce); 636 637 tce += IOMMU_PAGE_SIZE(tbl); 638 } 639 640 if (ret) 641 tce_iommu_clear(container, tbl, entry, i); 642 643 return ret; 644 } 645 646 static long tce_iommu_create_table(struct tce_container *container, 647 struct iommu_table_group *table_group, 648 int num, 649 __u32 page_shift, 650 __u64 window_size, 651 __u32 levels, 652 struct iommu_table **ptbl) 653 { 654 long ret, table_size; 655 656 table_size = table_group->ops->get_table_size(page_shift, window_size, 657 levels); 658 if (!table_size) 659 return -EINVAL; 660 661 ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT); 662 if (ret) 663 return ret; 664 665 ret = table_group->ops->create_table(table_group, num, 666 page_shift, window_size, levels, ptbl); 667 668 WARN_ON(!ret && !(*ptbl)->it_ops->free); 669 WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size)); 670 671 return ret; 672 } 673 674 static void tce_iommu_free_table(struct tce_container *container, 675 struct iommu_table *tbl) 676 { 677 unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; 678 679 iommu_tce_table_put(tbl); 680 decrement_locked_vm(container->mm, pages); 681 } 682 683 static long tce_iommu_create_window(struct tce_container *container, 684 __u32 page_shift, __u64 window_size, __u32 levels, 685 __u64 *start_addr) 686 { 687 struct tce_iommu_group *tcegrp; 688 struct iommu_table_group *table_group; 689 struct iommu_table *tbl = NULL; 690 long ret, num; 691 692 num = tce_iommu_find_free_table(container); 693 if (num < 0) 694 return num; 695 696 /* Get the first group for ops::create_table */ 697 tcegrp = list_first_entry(&container->group_list, 698 struct tce_iommu_group, next); 699 table_group = iommu_group_get_iommudata(tcegrp->grp); 700 if (!table_group) 701 return -EFAULT; 702 703 if (!(table_group->pgsizes & (1ULL << page_shift))) 704 return -EINVAL; 705 706 if (!table_group->ops->set_window || !table_group->ops->unset_window || 707 !table_group->ops->get_table_size || 708 !table_group->ops->create_table) 709 return -EPERM; 710 711 /* Create TCE table */ 712 ret = tce_iommu_create_table(container, table_group, num, 713 page_shift, window_size, levels, &tbl); 714 if (ret) 715 return ret; 716 717 BUG_ON(!tbl->it_ops->free); 718 719 /* 720 * Program the table to every group. 721 * Groups have been tested for compatibility at the attach time. 722 */ 723 list_for_each_entry(tcegrp, &container->group_list, next) { 724 table_group = iommu_group_get_iommudata(tcegrp->grp); 725 726 ret = table_group->ops->set_window(table_group, num, tbl); 727 if (ret) 728 goto unset_exit; 729 } 730 731 container->tables[num] = tbl; 732 733 /* Return start address assigned by platform in create_table() */ 734 *start_addr = tbl->it_offset << tbl->it_page_shift; 735 736 return 0; 737 738 unset_exit: 739 list_for_each_entry(tcegrp, &container->group_list, next) { 740 table_group = iommu_group_get_iommudata(tcegrp->grp); 741 table_group->ops->unset_window(table_group, num); 742 } 743 tce_iommu_free_table(container, tbl); 744 745 return ret; 746 } 747 748 static long tce_iommu_remove_window(struct tce_container *container, 749 __u64 start_addr) 750 { 751 struct iommu_table_group *table_group = NULL; 752 struct iommu_table *tbl; 753 struct tce_iommu_group *tcegrp; 754 int num; 755 756 num = tce_iommu_find_table(container, start_addr, &tbl); 757 if (num < 0) 758 return -EINVAL; 759 760 BUG_ON(!tbl->it_size); 761 762 /* Detach groups from IOMMUs */ 763 list_for_each_entry(tcegrp, &container->group_list, next) { 764 table_group = iommu_group_get_iommudata(tcegrp->grp); 765 766 /* 767 * SPAPR TCE IOMMU exposes the default DMA window to 768 * the guest via dma32_window_start/size of 769 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow 770 * the userspace to remove this window, some do not so 771 * here we check for the platform capability. 772 */ 773 if (!table_group->ops || !table_group->ops->unset_window) 774 return -EPERM; 775 776 table_group->ops->unset_window(table_group, num); 777 } 778 779 /* Free table */ 780 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); 781 tce_iommu_free_table(container, tbl); 782 container->tables[num] = NULL; 783 784 return 0; 785 } 786 787 static long tce_iommu_create_default_window(struct tce_container *container) 788 { 789 long ret; 790 __u64 start_addr = 0; 791 struct tce_iommu_group *tcegrp; 792 struct iommu_table_group *table_group; 793 794 if (!container->def_window_pending) 795 return 0; 796 797 if (!tce_groups_attached(container)) 798 return -ENODEV; 799 800 tcegrp = list_first_entry(&container->group_list, 801 struct tce_iommu_group, next); 802 table_group = iommu_group_get_iommudata(tcegrp->grp); 803 if (!table_group) 804 return -ENODEV; 805 806 ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K, 807 table_group->tce32_size, 1, &start_addr); 808 WARN_ON_ONCE(!ret && start_addr); 809 810 if (!ret) 811 container->def_window_pending = false; 812 813 return ret; 814 } 815 816 static long tce_iommu_ioctl(void *iommu_data, 817 unsigned int cmd, unsigned long arg) 818 { 819 struct tce_container *container = iommu_data; 820 unsigned long minsz, ddwsz; 821 long ret; 822 823 switch (cmd) { 824 case VFIO_CHECK_EXTENSION: 825 switch (arg) { 826 case VFIO_SPAPR_TCE_IOMMU: 827 case VFIO_SPAPR_TCE_v2_IOMMU: 828 ret = 1; 829 break; 830 default: 831 ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); 832 break; 833 } 834 835 return (ret < 0) ? 0 : ret; 836 } 837 838 /* 839 * Sanity check to prevent one userspace from manipulating 840 * another userspace mm. 841 */ 842 BUG_ON(!container); 843 if (container->mm && container->mm != current->mm) 844 return -EPERM; 845 846 switch (cmd) { 847 case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { 848 struct vfio_iommu_spapr_tce_info info; 849 struct tce_iommu_group *tcegrp; 850 struct iommu_table_group *table_group; 851 852 if (!tce_groups_attached(container)) 853 return -ENXIO; 854 855 tcegrp = list_first_entry(&container->group_list, 856 struct tce_iommu_group, next); 857 table_group = iommu_group_get_iommudata(tcegrp->grp); 858 859 if (!table_group) 860 return -ENXIO; 861 862 minsz = offsetofend(struct vfio_iommu_spapr_tce_info, 863 dma32_window_size); 864 865 if (copy_from_user(&info, (void __user *)arg, minsz)) 866 return -EFAULT; 867 868 if (info.argsz < minsz) 869 return -EINVAL; 870 871 info.dma32_window_start = table_group->tce32_start; 872 info.dma32_window_size = table_group->tce32_size; 873 info.flags = 0; 874 memset(&info.ddw, 0, sizeof(info.ddw)); 875 876 if (table_group->max_dynamic_windows_supported && 877 container->v2) { 878 info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW; 879 info.ddw.pgsizes = table_group->pgsizes; 880 info.ddw.max_dynamic_windows_supported = 881 table_group->max_dynamic_windows_supported; 882 info.ddw.levels = table_group->max_levels; 883 } 884 885 ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw); 886 887 if (info.argsz >= ddwsz) 888 minsz = ddwsz; 889 890 if (copy_to_user((void __user *)arg, &info, minsz)) 891 return -EFAULT; 892 893 return 0; 894 } 895 case VFIO_IOMMU_MAP_DMA: { 896 struct vfio_iommu_type1_dma_map param; 897 struct iommu_table *tbl = NULL; 898 long num; 899 enum dma_data_direction direction; 900 901 if (!container->enabled) 902 return -EPERM; 903 904 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); 905 906 if (copy_from_user(¶m, (void __user *)arg, minsz)) 907 return -EFAULT; 908 909 if (param.argsz < minsz) 910 return -EINVAL; 911 912 if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ | 913 VFIO_DMA_MAP_FLAG_WRITE)) 914 return -EINVAL; 915 916 ret = tce_iommu_create_default_window(container); 917 if (ret) 918 return ret; 919 920 num = tce_iommu_find_table(container, param.iova, &tbl); 921 if (num < 0) 922 return -ENXIO; 923 924 if ((param.size & ~IOMMU_PAGE_MASK(tbl)) || 925 (param.vaddr & ~IOMMU_PAGE_MASK(tbl))) 926 return -EINVAL; 927 928 /* iova is checked by the IOMMU API */ 929 if (param.flags & VFIO_DMA_MAP_FLAG_READ) { 930 if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) 931 direction = DMA_BIDIRECTIONAL; 932 else 933 direction = DMA_TO_DEVICE; 934 } else { 935 if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) 936 direction = DMA_FROM_DEVICE; 937 else 938 return -EINVAL; 939 } 940 941 ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr); 942 if (ret) 943 return ret; 944 945 if (container->v2) 946 ret = tce_iommu_build_v2(container, tbl, 947 param.iova >> tbl->it_page_shift, 948 param.vaddr, 949 param.size >> tbl->it_page_shift, 950 direction); 951 else 952 ret = tce_iommu_build(container, tbl, 953 param.iova >> tbl->it_page_shift, 954 param.vaddr, 955 param.size >> tbl->it_page_shift, 956 direction); 957 958 iommu_flush_tce(tbl); 959 960 return ret; 961 } 962 case VFIO_IOMMU_UNMAP_DMA: { 963 struct vfio_iommu_type1_dma_unmap param; 964 struct iommu_table *tbl = NULL; 965 long num; 966 967 if (!container->enabled) 968 return -EPERM; 969 970 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, 971 size); 972 973 if (copy_from_user(¶m, (void __user *)arg, minsz)) 974 return -EFAULT; 975 976 if (param.argsz < minsz) 977 return -EINVAL; 978 979 /* No flag is supported now */ 980 if (param.flags) 981 return -EINVAL; 982 983 ret = tce_iommu_create_default_window(container); 984 if (ret) 985 return ret; 986 987 num = tce_iommu_find_table(container, param.iova, &tbl); 988 if (num < 0) 989 return -ENXIO; 990 991 if (param.size & ~IOMMU_PAGE_MASK(tbl)) 992 return -EINVAL; 993 994 ret = iommu_tce_clear_param_check(tbl, param.iova, 0, 995 param.size >> tbl->it_page_shift); 996 if (ret) 997 return ret; 998 999 ret = tce_iommu_clear(container, tbl, 1000 param.iova >> tbl->it_page_shift, 1001 param.size >> tbl->it_page_shift); 1002 iommu_flush_tce(tbl); 1003 1004 return ret; 1005 } 1006 case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: { 1007 struct vfio_iommu_spapr_register_memory param; 1008 1009 if (!container->v2) 1010 break; 1011 1012 minsz = offsetofend(struct vfio_iommu_spapr_register_memory, 1013 size); 1014 1015 ret = tce_iommu_mm_set(container); 1016 if (ret) 1017 return ret; 1018 1019 if (copy_from_user(¶m, (void __user *)arg, minsz)) 1020 return -EFAULT; 1021 1022 if (param.argsz < minsz) 1023 return -EINVAL; 1024 1025 /* No flag is supported now */ 1026 if (param.flags) 1027 return -EINVAL; 1028 1029 mutex_lock(&container->lock); 1030 ret = tce_iommu_register_pages(container, param.vaddr, 1031 param.size); 1032 mutex_unlock(&container->lock); 1033 1034 return ret; 1035 } 1036 case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: { 1037 struct vfio_iommu_spapr_register_memory param; 1038 1039 if (!container->v2) 1040 break; 1041 1042 if (!container->mm) 1043 return -EPERM; 1044 1045 minsz = offsetofend(struct vfio_iommu_spapr_register_memory, 1046 size); 1047 1048 if (copy_from_user(¶m, (void __user *)arg, minsz)) 1049 return -EFAULT; 1050 1051 if (param.argsz < minsz) 1052 return -EINVAL; 1053 1054 /* No flag is supported now */ 1055 if (param.flags) 1056 return -EINVAL; 1057 1058 mutex_lock(&container->lock); 1059 ret = tce_iommu_unregister_pages(container, param.vaddr, 1060 param.size); 1061 mutex_unlock(&container->lock); 1062 1063 return ret; 1064 } 1065 case VFIO_IOMMU_ENABLE: 1066 if (container->v2) 1067 break; 1068 1069 mutex_lock(&container->lock); 1070 ret = tce_iommu_enable(container); 1071 mutex_unlock(&container->lock); 1072 return ret; 1073 1074 1075 case VFIO_IOMMU_DISABLE: 1076 if (container->v2) 1077 break; 1078 1079 mutex_lock(&container->lock); 1080 tce_iommu_disable(container); 1081 mutex_unlock(&container->lock); 1082 return 0; 1083 1084 case VFIO_EEH_PE_OP: { 1085 struct tce_iommu_group *tcegrp; 1086 1087 ret = 0; 1088 list_for_each_entry(tcegrp, &container->group_list, next) { 1089 ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, 1090 cmd, arg); 1091 if (ret) 1092 return ret; 1093 } 1094 return ret; 1095 } 1096 1097 case VFIO_IOMMU_SPAPR_TCE_CREATE: { 1098 struct vfio_iommu_spapr_tce_create create; 1099 1100 if (!container->v2) 1101 break; 1102 1103 ret = tce_iommu_mm_set(container); 1104 if (ret) 1105 return ret; 1106 1107 if (!tce_groups_attached(container)) 1108 return -ENXIO; 1109 1110 minsz = offsetofend(struct vfio_iommu_spapr_tce_create, 1111 start_addr); 1112 1113 if (copy_from_user(&create, (void __user *)arg, minsz)) 1114 return -EFAULT; 1115 1116 if (create.argsz < minsz) 1117 return -EINVAL; 1118 1119 if (create.flags) 1120 return -EINVAL; 1121 1122 mutex_lock(&container->lock); 1123 1124 ret = tce_iommu_create_default_window(container); 1125 if (!ret) 1126 ret = tce_iommu_create_window(container, 1127 create.page_shift, 1128 create.window_size, create.levels, 1129 &create.start_addr); 1130 1131 mutex_unlock(&container->lock); 1132 1133 if (!ret && copy_to_user((void __user *)arg, &create, minsz)) 1134 ret = -EFAULT; 1135 1136 return ret; 1137 } 1138 case VFIO_IOMMU_SPAPR_TCE_REMOVE: { 1139 struct vfio_iommu_spapr_tce_remove remove; 1140 1141 if (!container->v2) 1142 break; 1143 1144 ret = tce_iommu_mm_set(container); 1145 if (ret) 1146 return ret; 1147 1148 if (!tce_groups_attached(container)) 1149 return -ENXIO; 1150 1151 minsz = offsetofend(struct vfio_iommu_spapr_tce_remove, 1152 start_addr); 1153 1154 if (copy_from_user(&remove, (void __user *)arg, minsz)) 1155 return -EFAULT; 1156 1157 if (remove.argsz < minsz) 1158 return -EINVAL; 1159 1160 if (remove.flags) 1161 return -EINVAL; 1162 1163 if (container->def_window_pending && !remove.start_addr) { 1164 container->def_window_pending = false; 1165 return 0; 1166 } 1167 1168 mutex_lock(&container->lock); 1169 1170 ret = tce_iommu_remove_window(container, remove.start_addr); 1171 1172 mutex_unlock(&container->lock); 1173 1174 return ret; 1175 } 1176 } 1177 1178 return -ENOTTY; 1179 } 1180 1181 static void tce_iommu_release_ownership(struct tce_container *container, 1182 struct iommu_table_group *table_group) 1183 { 1184 int i; 1185 1186 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 1187 struct iommu_table *tbl = container->tables[i]; 1188 1189 if (!tbl) 1190 continue; 1191 1192 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); 1193 if (tbl->it_map) 1194 iommu_release_ownership(tbl); 1195 1196 container->tables[i] = NULL; 1197 } 1198 } 1199 1200 static int tce_iommu_take_ownership(struct tce_container *container, 1201 struct iommu_table_group *table_group) 1202 { 1203 int i, j, rc = 0; 1204 1205 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 1206 struct iommu_table *tbl = table_group->tables[i]; 1207 1208 if (!tbl || !tbl->it_map) 1209 continue; 1210 1211 rc = iommu_take_ownership(tbl); 1212 if (rc) { 1213 for (j = 0; j < i; ++j) 1214 iommu_release_ownership( 1215 table_group->tables[j]); 1216 1217 return rc; 1218 } 1219 } 1220 1221 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1222 container->tables[i] = table_group->tables[i]; 1223 1224 return 0; 1225 } 1226 1227 static void tce_iommu_release_ownership_ddw(struct tce_container *container, 1228 struct iommu_table_group *table_group) 1229 { 1230 long i; 1231 1232 if (!table_group->ops->unset_window) { 1233 WARN_ON_ONCE(1); 1234 return; 1235 } 1236 1237 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1238 table_group->ops->unset_window(table_group, i); 1239 1240 table_group->ops->release_ownership(table_group); 1241 } 1242 1243 static long tce_iommu_take_ownership_ddw(struct tce_container *container, 1244 struct iommu_table_group *table_group) 1245 { 1246 long i, ret = 0; 1247 1248 if (!table_group->ops->create_table || !table_group->ops->set_window || 1249 !table_group->ops->release_ownership) { 1250 WARN_ON_ONCE(1); 1251 return -EFAULT; 1252 } 1253 1254 table_group->ops->take_ownership(table_group); 1255 1256 /* Set all windows to the new group */ 1257 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { 1258 struct iommu_table *tbl = container->tables[i]; 1259 1260 if (!tbl) 1261 continue; 1262 1263 ret = table_group->ops->set_window(table_group, i, tbl); 1264 if (ret) 1265 goto release_exit; 1266 } 1267 1268 return 0; 1269 1270 release_exit: 1271 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1272 table_group->ops->unset_window(table_group, i); 1273 1274 table_group->ops->release_ownership(table_group); 1275 1276 return ret; 1277 } 1278 1279 static int tce_iommu_attach_group(void *iommu_data, 1280 struct iommu_group *iommu_group) 1281 { 1282 int ret; 1283 struct tce_container *container = iommu_data; 1284 struct iommu_table_group *table_group; 1285 struct tce_iommu_group *tcegrp = NULL; 1286 1287 mutex_lock(&container->lock); 1288 1289 /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", 1290 iommu_group_id(iommu_group), iommu_group); */ 1291 table_group = iommu_group_get_iommudata(iommu_group); 1292 if (!table_group) { 1293 ret = -ENODEV; 1294 goto unlock_exit; 1295 } 1296 1297 if (tce_groups_attached(container) && (!table_group->ops || 1298 !table_group->ops->take_ownership || 1299 !table_group->ops->release_ownership)) { 1300 ret = -EBUSY; 1301 goto unlock_exit; 1302 } 1303 1304 /* Check if new group has the same iommu_ops (i.e. compatible) */ 1305 list_for_each_entry(tcegrp, &container->group_list, next) { 1306 struct iommu_table_group *table_group_tmp; 1307 1308 if (tcegrp->grp == iommu_group) { 1309 pr_warn("tce_vfio: Group %d is already attached\n", 1310 iommu_group_id(iommu_group)); 1311 ret = -EBUSY; 1312 goto unlock_exit; 1313 } 1314 table_group_tmp = iommu_group_get_iommudata(tcegrp->grp); 1315 if (table_group_tmp->ops->create_table != 1316 table_group->ops->create_table) { 1317 pr_warn("tce_vfio: Group %d is incompatible with group %d\n", 1318 iommu_group_id(iommu_group), 1319 iommu_group_id(tcegrp->grp)); 1320 ret = -EPERM; 1321 goto unlock_exit; 1322 } 1323 } 1324 1325 tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL); 1326 if (!tcegrp) { 1327 ret = -ENOMEM; 1328 goto unlock_exit; 1329 } 1330 1331 if (!table_group->ops || !table_group->ops->take_ownership || 1332 !table_group->ops->release_ownership) { 1333 if (container->v2) { 1334 ret = -EPERM; 1335 goto unlock_exit; 1336 } 1337 ret = tce_iommu_take_ownership(container, table_group); 1338 } else { 1339 if (!container->v2) { 1340 ret = -EPERM; 1341 goto unlock_exit; 1342 } 1343 ret = tce_iommu_take_ownership_ddw(container, table_group); 1344 if (!tce_groups_attached(container) && !container->tables[0]) 1345 container->def_window_pending = true; 1346 } 1347 1348 if (!ret) { 1349 tcegrp->grp = iommu_group; 1350 list_add(&tcegrp->next, &container->group_list); 1351 } 1352 1353 unlock_exit: 1354 if (ret && tcegrp) 1355 kfree(tcegrp); 1356 1357 mutex_unlock(&container->lock); 1358 1359 return ret; 1360 } 1361 1362 static void tce_iommu_detach_group(void *iommu_data, 1363 struct iommu_group *iommu_group) 1364 { 1365 struct tce_container *container = iommu_data; 1366 struct iommu_table_group *table_group; 1367 bool found = false; 1368 struct tce_iommu_group *tcegrp; 1369 1370 mutex_lock(&container->lock); 1371 1372 list_for_each_entry(tcegrp, &container->group_list, next) { 1373 if (tcegrp->grp == iommu_group) { 1374 found = true; 1375 break; 1376 } 1377 } 1378 1379 if (!found) { 1380 pr_warn("tce_vfio: detaching unattached group #%u\n", 1381 iommu_group_id(iommu_group)); 1382 goto unlock_exit; 1383 } 1384 1385 list_del(&tcegrp->next); 1386 kfree(tcegrp); 1387 1388 table_group = iommu_group_get_iommudata(iommu_group); 1389 BUG_ON(!table_group); 1390 1391 if (!table_group->ops || !table_group->ops->release_ownership) 1392 tce_iommu_release_ownership(container, table_group); 1393 else 1394 tce_iommu_release_ownership_ddw(container, table_group); 1395 1396 unlock_exit: 1397 mutex_unlock(&container->lock); 1398 } 1399 1400 const struct vfio_iommu_driver_ops tce_iommu_driver_ops = { 1401 .name = "iommu-vfio-powerpc", 1402 .owner = THIS_MODULE, 1403 .open = tce_iommu_open, 1404 .release = tce_iommu_release, 1405 .ioctl = tce_iommu_ioctl, 1406 .attach_group = tce_iommu_attach_group, 1407 .detach_group = tce_iommu_detach_group, 1408 }; 1409 1410 static int __init tce_iommu_init(void) 1411 { 1412 return vfio_register_iommu_driver(&tce_iommu_driver_ops); 1413 } 1414 1415 static void __exit tce_iommu_cleanup(void) 1416 { 1417 vfio_unregister_iommu_driver(&tce_iommu_driver_ops); 1418 } 1419 1420 module_init(tce_iommu_init); 1421 module_exit(tce_iommu_cleanup); 1422 1423 MODULE_VERSION(DRIVER_VERSION); 1424 MODULE_LICENSE("GPL v2"); 1425 MODULE_AUTHOR(DRIVER_AUTHOR); 1426 MODULE_DESCRIPTION(DRIVER_DESC); 1427 1428