/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp. All rights reserved.
 * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long ret = 0, locked, lock_limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm || !npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}
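
/*
 * Note on the two helpers above: pinned pages are charged against the
 * RLIMIT_MEMLOCK of the mm that owns the container, in units of system
 * pages. As a purely illustrative sketch (not part of this driver), a
 * userspace client that expects to map a lot of memory would typically
 * raise its limit first, e.g.:
 *
 *	#include <sys/resource.h>
 *
 *	struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY };
 *
 *	if (setrlimit(RLIMIT_MEMLOCK, &rl))	// may require privilege
 *		perror("setrlimit(RLIMIT_MEMLOCK)");
 *
 * A real client might instead size the limit to the DMA window(s) it
 * intends to enable; CAP_IPC_LOCK bypasses the check entirely.
 */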

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		return -ENOENT;

	return tce_iommu_prereg_free(container, tcemem);
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem)
				return -EBUSY;
		}
	}

	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
	if (ret)
		return ret;

	/* Drop the reference taken above if we cannot track the region */
	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		mm_iommu_put(container->mm, mem);
		return -ENOMEM;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;
}

static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *uas;
	long ret;

	BUG_ON(tbl->it_userspace);

	ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
	if (ret)
		return ret;

	uas = vzalloc(cb);
	if (!uas) {
		decrement_locked_vm(mm, cb >> PAGE_SHIFT);
		return -ENOMEM;
	}
	tbl->it_userspace = uas;

	return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);

	if (!tbl->it_userspace)
		return;

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(mm, cb >> PAGE_SHIFT);
}
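
/*
 * Illustrative sizing example for the it_userspace view above (numbers are
 * examples, not requirements): one unsigned long is kept per TCE, so a 2GB
 * window backed by 4K IOMMU pages has it_size = 2GB / 4KB = 524288 entries,
 * i.e. a 4MB array. Rounded up to system pages, that is 64 locked pages on
 * a 64K-page host (or 1024 on a 4K-page host), all charged via
 * try_increment_locked_vm() before vzalloc() is attempted.
 */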

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}
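
/*
 * Worked example for the lookup above (illustrative, assuming the usual
 * 32-bit default window: it_offset = 0, it_page_shift = 12 and
 * it_size = 0x80000, i.e. 2GB of 4K TCEs): an ioba of 0x100000 gives
 * entry = 0x100000 >> 12 = 0x100, which lies in [0, 0x80000), so this
 * table and its index are returned. An ioba outside every window makes
 * the function return -1 and the callers fail with -ENXIO/-EINVAL.
 */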

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult or impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately, at the moment it counts whole tables, no matter how
	 * much memory the guest has, i.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason
	 * for this is that we cannot tell here the amount of RAM used by the
	 * guest as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(container->mm, locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	decrement_locked_vm(container->mm, container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}
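
/*
 * The open() callback above only allocates the container; the IOMMU model
 * is chosen by userspace through the VFIO core. A minimal, illustrative
 * userspace sequence (not part of this file; error handling and the real
 * group number omitted) looks roughly like:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/<group>", O_RDWR);
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU))
 *		ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *
 * VFIO_SET_IOMMU is what ends up calling tce_iommu_open() with the requested
 * type and, for groups already set to the container, tce_iommu_attach_group().
 */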

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	while (!list_empty(&container->prereg_list)) {
		struct tce_iommu_prereg *tcemem;

		tcemem = list_first_entry(&container->prereg_list,
				struct tce_iommu_prereg, next);
		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
	}

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, size);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
			&hpa, &mem);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *pua, entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = 0;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	if (!tbl->it_userspace) {
		ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
		if (ret)
			return ret;
	}

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
				entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = tce;

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tce_iommu_userspace_view_free(tbl, container->mm);
	tbl->it_ops->free(tbl);
	decrement_locked_vm(container->mm, pages);
}

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}
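
/*
 * Illustrative use of the window creation path above from userspace via
 * VFIO_IOMMU_SPAPR_TCE_CREATE (v2 containers only; the container fd and all
 * values below are example placeholders):
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,		// 64K IOMMU pages
 *		.window_size = 1ULL << 32,	// 4GB window
 *		.levels = 1,
 *	};
 *
 *	if (!ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
 *		printf("new window at ioba 0x%llx\n",
 *				(unsigned long long)create.start_addr);
 *
 * The kernel picks the bus address and returns it in create.start_addr,
 * which is later passed to VFIO_IOMMU_SPAPR_TCE_REMOVE to tear the window
 * down.
 */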

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * userspace to remove this window, some do not, so here
		 * we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}
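
/*
 * The ioctl handler below implements the SPAPR TCE userspace ABI. As an
 * illustration only (not from the original source; 'container', 'buf' and
 * 'buf_size' are placeholders, error handling omitted), a client typically
 * queries the default 32-bit window and then maps memory with the
 * type1-compatible structure:
 *
 *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *
 *	map.vaddr = (__u64)(unsigned long)buf;	// IOMMU-page aligned
 *	map.size = buf_size;			// multiple of IOMMU page size
 *	map.iova = info.dma32_window_start;
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *
 * On a v1 container the container must have been enabled first; on a v2
 * container the buffer must have been preregistered (see below).
 */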

static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
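
	/*
	 * Illustrative only (placeholders as above): with the v2 IOMMU,
	 * userspace preregisters the memory it will later map so that the
	 * pages are pinned and accounted once, e.g.:
	 *
	 *	struct vfio_iommu_spapr_register_memory reg = {
	 *		.argsz = sizeof(reg),
	 *		.vaddr = (__u64)(unsigned long)buf,
	 *		.size = buf_size,	// both page aligned
	 *	};
	 *
	 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
	 *
	 * The same structure is passed to the UNREGISTER ioctl when the
	 * region is no longer needed.
	 */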
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
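
	/*
	 * Illustrative only: a v1 (VFIO_SPAPR_TCE_IOMMU) container has to be
	 * enabled before any mapping is accepted, which charges the whole
	 * default window against RLIMIT_MEMLOCK up front:
	 *
	 *	ioctl(container, VFIO_IOMMU_ENABLE);
	 *
	 * v2 containers reject this ioctl; they become enabled by memory
	 * preregistration instead.
	 */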
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_userspace_view_free(tbl, container->mm);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL, *tcegrp_tmp;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/*
	 * Check if the new group has the same iommu_ops (i.e. compatible).
	 * Iterate with a separate cursor so the error path below never
	 * kfree()s a group which is still on the list.
	 */
	list_for_each_entry(tcegrp_tmp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp_tmp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp_tmp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp_tmp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);