/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long ret = 0, locked, lock_limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm || !npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
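
/*
 * Illustrative userspace sketch (not part of this driver) of how a container
 * typically ends up calling into this file. The device paths and "NN" group
 * number are hypothetical placeholders; the ioctls are the standard VFIO and
 * SPAPR uAPI.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/NN", O_RDWR);	// NN = IOMMU group number
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *	// ...then VFIO_IOMMU_SPAPR_TCE_GET_INFO, REGISTER_MEMORY, MAP_DMA etc.
 */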

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};

static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		return -ENOENT;

	return tce_iommu_prereg_free(container, tcemem);
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem)
				return -EBUSY;
		}
	}

	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
	if (ret)
		return ret;

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		mm_iommu_put(container->mm, mem);
		return -ENOMEM;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;
}
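
/*
 * Illustrative userspace sketch (an assumption, not part of this driver) of
 * the v2 preregistration handled by tce_iommu_register_pages() above; "buf"
 * and "size" are hypothetical and must be PAGE_SIZE aligned.
 *
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.flags = 0,
 *		.vaddr = (__u64)(unsigned long)buf,
 *		.size  = size,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *	// ...MAP_DMA/UNMAP_DMA against the registered range...
 *	ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
 */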

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}
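
/*
 * Worked example (illustrative numbers only) of the lookup done by
 * tce_iommu_find_table() above: with 64K TCE pages (it_page_shift = 16) and
 * a window starting at IOVA 0 (it_offset = 0) with it_size = 0x8000 entries
 * (2GB), ioba 0x20000 maps to entry 2, which falls within [0, 0x8000) and
 * therefore hits that table.
 */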

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult, if not
	 * impossible, to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(container->mm, locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}
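
/*
 * Worked example (illustrative numbers only) of the worst-case accounting
 * performed by tce_iommu_enable() above: with a 2GB 32-bit window
 * (tce32_size = 0x80000000) and 64K host pages (PAGE_SHIFT = 16),
 * locked = 0x80000000 >> 16 = 0x8000 pages, i.e. the full 2GB is charged
 * to locked_vm up front, regardless of how much the guest actually maps.
 */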

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	decrement_locked_vm(container->mm, container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	while (!list_empty(&container->prereg_list)) {
		struct tce_iommu_prereg *tcemem;

		tcemem = list_first_entry(&container->prereg_list,
				struct tce_iommu_prereg, next);
		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
	}

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long shift,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * address cache is a mirror of the real TCE table,
			 * and if it is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size which is a power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}
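
/*
 * Worked example (illustrative numbers only) of the multilevel shortcut in
 * tce_iommu_clear() above: with it_level_size = 512, a missing userspace
 * entry at entry 1000 sets entry |= 511 -> 1023, so the remainder of that
 * 512-entry block is skipped and the loop resumes at entry 1024.
 */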

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
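
/*
 * Note on the two build paths: tce_iommu_build() above is the v1 flow and
 * pins each page at map time via get_user_pages_fast(), while
 * tce_iommu_build_v2() below resolves host physical addresses from memory
 * that userspace preregistered (mm_iommu_get()), so the map path only takes
 * a mapped reference on the preregistered region.
 */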

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
		struct iommu_table_group *table_group,
		int num,
		__u32 page_shift,
		__u64 window_size,
		__u32 levels,
		struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	iommu_tce_table_put(tbl);
	decrement_locked_vm(container->mm, pages);
}
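
/*
 * Back-of-the-envelope example (illustrative only, assuming a single-level
 * table of 8-byte TCEs; ops->get_table_size() may return more for multiple
 * levels or the userspace-view copy): a 1TB window with 64K TCE pages needs
 * 2^24 entries * 8 bytes = 128MB of table, i.e. 2048 * 64K host pages
 * charged to locked_vm by tce_iommu_create_table() above.
 */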

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * the userspace to remove this window, some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}
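
/*
 * Illustrative userspace sketch (an assumption, not part of this driver) of
 * creating and removing a dynamic DMA window, handled further down by the
 * VFIO_IOMMU_SPAPR_TCE_CREATE/REMOVE ioctls; the 1TB/64K numbers are
 * hypothetical and must match the ddw capabilities reported by
 * VFIO_IOMMU_SPAPR_TCE_GET_INFO.
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,			// 64K TCE pages
 *		.window_size = 1ULL << 40,		// 1TB
 *		.levels = 1,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *	// create.start_addr now holds the window's bus address
 *
 *	struct vfio_iommu_spapr_tce_remove remove = {
 *		.argsz = sizeof(remove),
 *		.start_addr = create.start_addr,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
 */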

static long tce_iommu_ioctl(void *iommu_data,
		unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace process from manipulating
	 * another process's mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
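
	/*
	 * Illustrative userspace sketch (an assumption, not part of this
	 * driver) of the VFIO_IOMMU_MAP_DMA case above; "buf", "iova" and
	 * "size" are hypothetical and must be aligned to the window's IOMMU
	 * page size (and, for v2, fall within a preregistered region):
	 *
	 *	struct vfio_iommu_type1_dma_map map = {
	 *		.argsz = sizeof(map),
	 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
	 *		.vaddr = (__u64)(unsigned long)buf,
	 *		.iova  = iova,
	 *		.size  = size,
	 *	};
	 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
	 */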
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);