/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long ret = 0, locked, lock_limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm || !npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};
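
/*
 * A rough sketch of the expected userspace sequence, for orientation only;
 * the ioctl handlers below are authoritative and the fd names are
 * illustrative:
 *
 *   container_fd = open("/dev/vfio/vfio", O_RDWR);
 *   ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd);
 *   ioctl(container_fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);  (or _v2_IOMMU)
 *   v1: ioctl(container_fd, VFIO_IOMMU_ENABLE);
 *   v2: ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ...);
 *   ioctl(container_fd, VFIO_IOMMU_MAP_DMA / VFIO_IOMMU_UNMAP_DMA, ...);
 */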

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		return -ENOENT;

	return tce_iommu_prereg_free(container, tcemem);
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem)
				return -EBUSY;
		}
	}

	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
	if (ret)
		return ret;

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		mm_iommu_put(container->mm, mem);
		return -ENOMEM;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;
}

static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *uas;
	long ret;

	BUG_ON(tbl->it_userspace);

	ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
	if (ret)
		return ret;

	uas = vzalloc(cb);
	if (!uas) {
		decrement_locked_vm(mm, cb >> PAGE_SHIFT);
		return -ENOMEM;
	}
	tbl->it_userspace = uas;

	return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);

	if (!tbl->it_userspace)
		return;

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(mm, cb >> PAGE_SHIFT);
}
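
/*
 * Worked example for the containment check below, assuming a 64K base page
 * size (PAGE_SHIFT = 16): a plain page (compound order 0) satisfies a 64K
 * IOMMU page (page_shift = 16), but a 16MB IOMMU page (page_shift = 24)
 * requires the pinned page to sit inside a compound page of order >= 8,
 * i.e. a 16MB huge page, so a TCE never grants DMA beyond what was pinned.
 */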

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult to impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
	 * that would effectively kill the guest at random points, much better
	 * enforcing the limit based on the max that the guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
	 * each with 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
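	/*
	 * Illustrative numbers only (assuming a 64K base page size): a 2GB
	 * default 32-bit window accounts 2GB >> 16 = 32768 pages against
	 * RLIMIT_MEMLOCK here, regardless of how much of the window the
	 * guest actually maps.
	 */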
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(container->mm, locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	decrement_locked_vm(container->mm, container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	while (!list_empty(&container->prereg_list)) {
		struct tce_iommu_prereg *tcemem;

		tcemem = list_first_entry(&container->prereg_list,
				struct tce_iommu_prereg, next);
		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
	}

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, size);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
			&hpa, &mem);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *pua, entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = 0;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	if (!tbl->it_userspace) {
		ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
		if (ret)
			return ret;
	}

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
				entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = tce;

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
		struct iommu_table_group *table_group,
		int num,
		__u32 page_shift,
		__u64 window_size,
		__u32 levels,
		struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tce_iommu_userspace_view_free(tbl, container->mm);
	iommu_tce_table_put(tbl);
	decrement_locked_vm(container->mm, pages);
}

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}

static long tce_iommu_ioctl(void *iommu_data,
		unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;


	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_userspace_view_free(tbl, container->mm);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);