1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 4 * 5 * VFIO container (/dev/vfio/vfio) 6 */ 7 #include <linux/file.h> 8 #include <linux/slab.h> 9 #include <linux/fs.h> 10 #include <linux/capability.h> 11 #include <linux/iommu.h> 12 #include <linux/miscdevice.h> 13 #include <linux/vfio.h> 14 #include <uapi/linux/vfio.h> 15 16 #include "vfio.h" 17 18 struct vfio_container { 19 struct kref kref; 20 struct list_head group_list; 21 struct rw_semaphore group_lock; 22 struct vfio_iommu_driver *iommu_driver; 23 void *iommu_data; 24 bool noiommu; 25 }; 26 27 static struct vfio { 28 struct list_head iommu_drivers_list; 29 struct mutex iommu_drivers_lock; 30 } vfio; 31 32 #ifdef CONFIG_VFIO_NOIOMMU 33 bool vfio_noiommu __read_mostly; 34 module_param_named(enable_unsafe_noiommu_mode, 35 vfio_noiommu, bool, S_IRUGO | S_IWUSR); 36 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); 37 #endif 38 39 static void *vfio_noiommu_open(unsigned long arg) 40 { 41 if (arg != VFIO_NOIOMMU_IOMMU) 42 return ERR_PTR(-EINVAL); 43 if (!capable(CAP_SYS_RAWIO)) 44 return ERR_PTR(-EPERM); 45 46 return NULL; 47 } 48 49 static void vfio_noiommu_release(void *iommu_data) 50 { 51 } 52 53 static long vfio_noiommu_ioctl(void *iommu_data, 54 unsigned int cmd, unsigned long arg) 55 { 56 if (cmd == VFIO_CHECK_EXTENSION) 57 return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; 58 59 return -ENOTTY; 60 } 61 62 static int vfio_noiommu_attach_group(void *iommu_data, 63 struct iommu_group *iommu_group, enum vfio_group_type type) 64 { 65 return 0; 66 } 67 68 static void vfio_noiommu_detach_group(void *iommu_data, 69 struct iommu_group *iommu_group) 70 { 71 } 72 73 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { 74 .name = "vfio-noiommu", 75 .owner = THIS_MODULE, 76 .open = vfio_noiommu_open, 77 .release = vfio_noiommu_release, 78 .ioctl = vfio_noiommu_ioctl, 79 .attach_group = vfio_noiommu_attach_group, 80 .detach_group = vfio_noiommu_detach_group, 81 }; 82 83 /* 84 * Only noiommu containers can use vfio-noiommu and noiommu containers can only 85 * use vfio-noiommu. 86 */ 87 static bool vfio_iommu_driver_allowed(struct vfio_container *container, 88 const struct vfio_iommu_driver *driver) 89 { 90 if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU)) 91 return true; 92 return container->noiommu == (driver->ops == &vfio_noiommu_ops); 93 } 94 95 /* 96 * IOMMU driver registration 97 */ 98 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) 99 { 100 struct vfio_iommu_driver *driver, *tmp; 101 102 if (WARN_ON(!ops->register_device != !ops->unregister_device)) 103 return -EINVAL; 104 105 driver = kzalloc(sizeof(*driver), GFP_KERNEL); 106 if (!driver) 107 return -ENOMEM; 108 109 driver->ops = ops; 110 111 mutex_lock(&vfio.iommu_drivers_lock); 112 113 /* Check for duplicates */ 114 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { 115 if (tmp->ops == ops) { 116 mutex_unlock(&vfio.iommu_drivers_lock); 117 kfree(driver); 118 return -EINVAL; 119 } 120 } 121 122 list_add(&driver->vfio_next, &vfio.iommu_drivers_list); 123 124 mutex_unlock(&vfio.iommu_drivers_lock); 125 126 return 0; 127 } 128 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); 129 130 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) 131 { 132 struct vfio_iommu_driver *driver; 133 134 mutex_lock(&vfio.iommu_drivers_lock); 135 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 136 if (driver->ops == ops) { 137 list_del(&driver->vfio_next); 138 mutex_unlock(&vfio.iommu_drivers_lock); 139 kfree(driver); 140 return; 141 } 142 } 143 mutex_unlock(&vfio.iommu_drivers_lock); 144 } 145 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); 146 147 /* 148 * Container objects - containers are created when /dev/vfio/vfio is 149 * opened, but their lifecycle extends until the last user is done, so 150 * it's freed via kref. Must support container/group/device being 151 * closed in any order. 152 */ 153 static void vfio_container_release(struct kref *kref) 154 { 155 struct vfio_container *container; 156 container = container_of(kref, struct vfio_container, kref); 157 158 kfree(container); 159 } 160 161 static void vfio_container_get(struct vfio_container *container) 162 { 163 kref_get(&container->kref); 164 } 165 166 static void vfio_container_put(struct vfio_container *container) 167 { 168 kref_put(&container->kref, vfio_container_release); 169 } 170 171 void vfio_device_container_register(struct vfio_device *device) 172 { 173 struct vfio_iommu_driver *iommu_driver = 174 device->group->container->iommu_driver; 175 176 if (iommu_driver && iommu_driver->ops->register_device) 177 iommu_driver->ops->register_device( 178 device->group->container->iommu_data, device); 179 } 180 181 void vfio_device_container_unregister(struct vfio_device *device) 182 { 183 struct vfio_iommu_driver *iommu_driver = 184 device->group->container->iommu_driver; 185 186 if (iommu_driver && iommu_driver->ops->unregister_device) 187 iommu_driver->ops->unregister_device( 188 device->group->container->iommu_data, device); 189 } 190 191 long vfio_container_ioctl_check_extension(struct vfio_container *container, 192 unsigned long arg) 193 { 194 struct vfio_iommu_driver *driver; 195 long ret = 0; 196 197 down_read(&container->group_lock); 198 199 driver = container->iommu_driver; 200 201 switch (arg) { 202 /* No base extensions yet */ 203 default: 204 /* 205 * If no driver is set, poll all registered drivers for 206 * extensions and return the first positive result. If 207 * a driver is already set, further queries will be passed 208 * only to that driver. 209 */ 210 if (!driver) { 211 mutex_lock(&vfio.iommu_drivers_lock); 212 list_for_each_entry(driver, &vfio.iommu_drivers_list, 213 vfio_next) { 214 215 if (!list_empty(&container->group_list) && 216 !vfio_iommu_driver_allowed(container, 217 driver)) 218 continue; 219 if (!try_module_get(driver->ops->owner)) 220 continue; 221 222 ret = driver->ops->ioctl(NULL, 223 VFIO_CHECK_EXTENSION, 224 arg); 225 module_put(driver->ops->owner); 226 if (ret > 0) 227 break; 228 } 229 mutex_unlock(&vfio.iommu_drivers_lock); 230 } else 231 ret = driver->ops->ioctl(container->iommu_data, 232 VFIO_CHECK_EXTENSION, arg); 233 } 234 235 up_read(&container->group_lock); 236 237 return ret; 238 } 239 240 /* hold write lock on container->group_lock */ 241 static int __vfio_container_attach_groups(struct vfio_container *container, 242 struct vfio_iommu_driver *driver, 243 void *data) 244 { 245 struct vfio_group *group; 246 int ret = -ENODEV; 247 248 list_for_each_entry(group, &container->group_list, container_next) { 249 ret = driver->ops->attach_group(data, group->iommu_group, 250 group->type); 251 if (ret) 252 goto unwind; 253 } 254 255 return ret; 256 257 unwind: 258 list_for_each_entry_continue_reverse(group, &container->group_list, 259 container_next) { 260 driver->ops->detach_group(data, group->iommu_group); 261 } 262 263 return ret; 264 } 265 266 static long vfio_ioctl_set_iommu(struct vfio_container *container, 267 unsigned long arg) 268 { 269 struct vfio_iommu_driver *driver; 270 long ret = -ENODEV; 271 272 down_write(&container->group_lock); 273 274 /* 275 * The container is designed to be an unprivileged interface while 276 * the group can be assigned to specific users. Therefore, only by 277 * adding a group to a container does the user get the privilege of 278 * enabling the iommu, which may allocate finite resources. There 279 * is no unset_iommu, but by removing all the groups from a container, 280 * the container is deprivileged and returns to an unset state. 281 */ 282 if (list_empty(&container->group_list) || container->iommu_driver) { 283 up_write(&container->group_lock); 284 return -EINVAL; 285 } 286 287 mutex_lock(&vfio.iommu_drivers_lock); 288 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 289 void *data; 290 291 if (!vfio_iommu_driver_allowed(container, driver)) 292 continue; 293 if (!try_module_get(driver->ops->owner)) 294 continue; 295 296 /* 297 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, 298 * so test which iommu driver reported support for this 299 * extension and call open on them. We also pass them the 300 * magic, allowing a single driver to support multiple 301 * interfaces if they'd like. 302 */ 303 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { 304 module_put(driver->ops->owner); 305 continue; 306 } 307 308 data = driver->ops->open(arg); 309 if (IS_ERR(data)) { 310 ret = PTR_ERR(data); 311 module_put(driver->ops->owner); 312 continue; 313 } 314 315 ret = __vfio_container_attach_groups(container, driver, data); 316 if (ret) { 317 driver->ops->release(data); 318 module_put(driver->ops->owner); 319 continue; 320 } 321 322 container->iommu_driver = driver; 323 container->iommu_data = data; 324 break; 325 } 326 327 mutex_unlock(&vfio.iommu_drivers_lock); 328 up_write(&container->group_lock); 329 330 return ret; 331 } 332 333 static long vfio_fops_unl_ioctl(struct file *filep, 334 unsigned int cmd, unsigned long arg) 335 { 336 struct vfio_container *container = filep->private_data; 337 struct vfio_iommu_driver *driver; 338 void *data; 339 long ret = -EINVAL; 340 341 if (!container) 342 return ret; 343 344 switch (cmd) { 345 case VFIO_GET_API_VERSION: 346 ret = VFIO_API_VERSION; 347 break; 348 case VFIO_CHECK_EXTENSION: 349 ret = vfio_container_ioctl_check_extension(container, arg); 350 break; 351 case VFIO_SET_IOMMU: 352 ret = vfio_ioctl_set_iommu(container, arg); 353 break; 354 default: 355 driver = container->iommu_driver; 356 data = container->iommu_data; 357 358 if (driver) /* passthrough all unrecognized ioctls */ 359 ret = driver->ops->ioctl(data, cmd, arg); 360 } 361 362 return ret; 363 } 364 365 static int vfio_fops_open(struct inode *inode, struct file *filep) 366 { 367 struct vfio_container *container; 368 369 container = kzalloc(sizeof(*container), GFP_KERNEL); 370 if (!container) 371 return -ENOMEM; 372 373 INIT_LIST_HEAD(&container->group_list); 374 init_rwsem(&container->group_lock); 375 kref_init(&container->kref); 376 377 filep->private_data = container; 378 379 return 0; 380 } 381 382 static int vfio_fops_release(struct inode *inode, struct file *filep) 383 { 384 struct vfio_container *container = filep->private_data; 385 struct vfio_iommu_driver *driver = container->iommu_driver; 386 387 if (driver && driver->ops->notify) 388 driver->ops->notify(container->iommu_data, 389 VFIO_IOMMU_CONTAINER_CLOSE); 390 391 filep->private_data = NULL; 392 393 vfio_container_put(container); 394 395 return 0; 396 } 397 398 static const struct file_operations vfio_fops = { 399 .owner = THIS_MODULE, 400 .open = vfio_fops_open, 401 .release = vfio_fops_release, 402 .unlocked_ioctl = vfio_fops_unl_ioctl, 403 .compat_ioctl = compat_ptr_ioctl, 404 }; 405 406 struct vfio_container *vfio_container_from_file(struct file *file) 407 { 408 struct vfio_container *container; 409 410 /* Sanity check, is this really our fd? */ 411 if (file->f_op != &vfio_fops) 412 return NULL; 413 414 container = file->private_data; 415 WARN_ON(!container); /* fget ensures we don't race vfio_release */ 416 return container; 417 } 418 419 static struct miscdevice vfio_dev = { 420 .minor = VFIO_MINOR, 421 .name = "vfio", 422 .fops = &vfio_fops, 423 .nodename = "vfio/vfio", 424 .mode = S_IRUGO | S_IWUGO, 425 }; 426 427 int vfio_container_attach_group(struct vfio_container *container, 428 struct vfio_group *group) 429 { 430 struct vfio_iommu_driver *driver; 431 int ret = 0; 432 433 lockdep_assert_held(&group->group_lock); 434 435 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) 436 return -EPERM; 437 438 down_write(&container->group_lock); 439 440 /* Real groups and fake groups cannot mix */ 441 if (!list_empty(&container->group_list) && 442 container->noiommu != (group->type == VFIO_NO_IOMMU)) { 443 ret = -EPERM; 444 goto out_unlock_container; 445 } 446 447 if (group->type == VFIO_IOMMU) { 448 ret = iommu_group_claim_dma_owner(group->iommu_group, group); 449 if (ret) 450 goto out_unlock_container; 451 } 452 453 driver = container->iommu_driver; 454 if (driver) { 455 ret = driver->ops->attach_group(container->iommu_data, 456 group->iommu_group, 457 group->type); 458 if (ret) { 459 if (group->type == VFIO_IOMMU) 460 iommu_group_release_dma_owner( 461 group->iommu_group); 462 goto out_unlock_container; 463 } 464 } 465 466 group->container = container; 467 group->container_users = 1; 468 container->noiommu = (group->type == VFIO_NO_IOMMU); 469 list_add(&group->container_next, &container->group_list); 470 471 /* Get a reference on the container and mark a user within the group */ 472 vfio_container_get(container); 473 474 out_unlock_container: 475 up_write(&container->group_lock); 476 return ret; 477 } 478 479 void vfio_group_detach_container(struct vfio_group *group) 480 { 481 struct vfio_container *container = group->container; 482 struct vfio_iommu_driver *driver; 483 484 lockdep_assert_held(&group->group_lock); 485 WARN_ON(group->container_users != 1); 486 487 down_write(&container->group_lock); 488 489 driver = container->iommu_driver; 490 if (driver) 491 driver->ops->detach_group(container->iommu_data, 492 group->iommu_group); 493 494 if (group->type == VFIO_IOMMU) 495 iommu_group_release_dma_owner(group->iommu_group); 496 497 group->container = NULL; 498 group->container_users = 0; 499 list_del(&group->container_next); 500 501 /* Detaching the last group deprivileges a container, remove iommu */ 502 if (driver && list_empty(&container->group_list)) { 503 driver->ops->release(container->iommu_data); 504 module_put(driver->ops->owner); 505 container->iommu_driver = NULL; 506 container->iommu_data = NULL; 507 } 508 509 up_write(&container->group_lock); 510 511 vfio_container_put(container); 512 } 513 514 int vfio_device_assign_container(struct vfio_device *device) 515 { 516 struct vfio_group *group = device->group; 517 518 lockdep_assert_held(&group->group_lock); 519 520 if (!group->container || !group->container->iommu_driver || 521 WARN_ON(!group->container_users)) 522 return -EINVAL; 523 524 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) 525 return -EPERM; 526 527 get_file(group->opened_file); 528 group->container_users++; 529 return 0; 530 } 531 532 void vfio_device_unassign_container(struct vfio_device *device) 533 { 534 mutex_lock(&device->group->group_lock); 535 WARN_ON(device->group->container_users <= 1); 536 device->group->container_users--; 537 fput(device->group->opened_file); 538 mutex_unlock(&device->group->group_lock); 539 } 540 541 /* 542 * Pin contiguous user pages and return their associated host pages for local 543 * domain only. 544 * @device [in] : device 545 * @iova [in] : starting IOVA of user pages to be pinned. 546 * @npage [in] : count of pages to be pinned. This count should not 547 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 548 * @prot [in] : protection flags 549 * @pages[out] : array of host pages 550 * Return error or number of pages pinned. 551 * 552 * A driver may only call this function if the vfio_device was created 553 * by vfio_register_emulated_iommu_dev(). 554 */ 555 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, 556 int npage, int prot, struct page **pages) 557 { 558 struct vfio_container *container; 559 struct vfio_group *group = device->group; 560 struct vfio_iommu_driver *driver; 561 int ret; 562 563 if (!pages || !npage || !vfio_assert_device_open(device)) 564 return -EINVAL; 565 566 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) 567 return -E2BIG; 568 569 /* group->container cannot change while a vfio device is open */ 570 container = group->container; 571 driver = container->iommu_driver; 572 if (likely(driver && driver->ops->pin_pages)) 573 ret = driver->ops->pin_pages(container->iommu_data, 574 group->iommu_group, iova, 575 npage, prot, pages); 576 else 577 ret = -ENOTTY; 578 579 return ret; 580 } 581 EXPORT_SYMBOL(vfio_pin_pages); 582 583 /* 584 * Unpin contiguous host pages for local domain only. 585 * @device [in] : device 586 * @iova [in] : starting address of user pages to be unpinned. 587 * @npage [in] : count of pages to be unpinned. This count should not 588 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 589 */ 590 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) 591 { 592 struct vfio_container *container; 593 struct vfio_iommu_driver *driver; 594 595 if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) 596 return; 597 598 if (WARN_ON(!vfio_assert_device_open(device))) 599 return; 600 601 /* group->container cannot change while a vfio device is open */ 602 container = device->group->container; 603 driver = container->iommu_driver; 604 605 driver->ops->unpin_pages(container->iommu_data, iova, npage); 606 } 607 EXPORT_SYMBOL(vfio_unpin_pages); 608 609 /* 610 * This interface allows the CPUs to perform some sort of virtual DMA on 611 * behalf of the device. 612 * 613 * CPUs read/write from/into a range of IOVAs pointing to user space memory 614 * into/from a kernel buffer. 615 * 616 * As the read/write of user space memory is conducted via the CPUs and is 617 * not a real device DMA, it is not necessary to pin the user space memory. 618 * 619 * @device [in] : VFIO device 620 * @iova [in] : base IOVA of a user space buffer 621 * @data [in] : pointer to kernel buffer 622 * @len [in] : kernel buffer length 623 * @write : indicate read or write 624 * Return error code on failure or 0 on success. 625 */ 626 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, 627 size_t len, bool write) 628 { 629 struct vfio_container *container; 630 struct vfio_iommu_driver *driver; 631 int ret = 0; 632 633 if (!data || len <= 0 || !vfio_assert_device_open(device)) 634 return -EINVAL; 635 636 /* group->container cannot change while a vfio device is open */ 637 container = device->group->container; 638 driver = container->iommu_driver; 639 640 if (likely(driver && driver->ops->dma_rw)) 641 ret = driver->ops->dma_rw(container->iommu_data, 642 iova, data, len, write); 643 else 644 ret = -ENOTTY; 645 return ret; 646 } 647 EXPORT_SYMBOL(vfio_dma_rw); 648 649 int __init vfio_container_init(void) 650 { 651 int ret; 652 653 mutex_init(&vfio.iommu_drivers_lock); 654 INIT_LIST_HEAD(&vfio.iommu_drivers_list); 655 656 ret = misc_register(&vfio_dev); 657 if (ret) { 658 pr_err("vfio: misc device register failed\n"); 659 return ret; 660 } 661 662 if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { 663 ret = vfio_register_iommu_driver(&vfio_noiommu_ops); 664 if (ret) 665 goto err_misc; 666 } 667 return 0; 668 669 err_misc: 670 misc_deregister(&vfio_dev); 671 return ret; 672 } 673 674 void vfio_container_cleanup(void) 675 { 676 if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) 677 vfio_unregister_iommu_driver(&vfio_noiommu_ops); 678 misc_deregister(&vfio_dev); 679 mutex_destroy(&vfio.iommu_drivers_lock); 680 } 681