1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * VFIO core 4 * 5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 6 * Author: Alex Williamson <alex.williamson@redhat.com> 7 * 8 * Derived from original vfio: 9 * Copyright 2010 Cisco Systems, Inc. All rights reserved. 10 * Author: Tom Lyon, pugs@cisco.com 11 */ 12 13 #include <linux/cdev.h> 14 #include <linux/compat.h> 15 #include <linux/device.h> 16 #include <linux/file.h> 17 #include <linux/anon_inodes.h> 18 #include <linux/fs.h> 19 #include <linux/idr.h> 20 #include <linux/iommu.h> 21 #include <linux/list.h> 22 #include <linux/miscdevice.h> 23 #include <linux/module.h> 24 #include <linux/mutex.h> 25 #include <linux/pci.h> 26 #include <linux/rwsem.h> 27 #include <linux/sched.h> 28 #include <linux/slab.h> 29 #include <linux/stat.h> 30 #include <linux/string.h> 31 #include <linux/uaccess.h> 32 #include <linux/vfio.h> 33 #include <linux/wait.h> 34 #include <linux/sched/signal.h> 35 #include <linux/pm_runtime.h> 36 #include <linux/interval_tree.h> 37 #include <linux/iova_bitmap.h> 38 #include "vfio.h" 39 40 #define DRIVER_VERSION "0.3" 41 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 42 #define DRIVER_DESC "VFIO - User Level meta-driver" 43 44 static struct vfio { 45 struct class *class; 46 struct list_head iommu_drivers_list; 47 struct mutex iommu_drivers_lock; 48 struct list_head group_list; 49 struct mutex group_lock; /* locks group_list */ 50 struct ida group_ida; 51 dev_t group_devt; 52 } vfio; 53 54 struct vfio_iommu_driver { 55 const struct vfio_iommu_driver_ops *ops; 56 struct list_head vfio_next; 57 }; 58 59 struct vfio_container { 60 struct kref kref; 61 struct list_head group_list; 62 struct rw_semaphore group_lock; 63 struct vfio_iommu_driver *iommu_driver; 64 void *iommu_data; 65 bool noiommu; 66 }; 67 68 struct vfio_group { 69 struct device dev; 70 struct cdev cdev; 71 refcount_t users; 72 unsigned int container_users; 73 struct iommu_group *iommu_group; 74 struct vfio_container *container; 75 struct list_head device_list; 76 struct mutex device_lock; 77 struct list_head vfio_next; 78 struct list_head container_next; 79 enum vfio_group_type type; 80 struct rw_semaphore group_rwsem; 81 struct kvm *kvm; 82 struct file *opened_file; 83 struct blocking_notifier_head notifier; 84 }; 85 86 #ifdef CONFIG_VFIO_NOIOMMU 87 static bool noiommu __read_mostly; 88 module_param_named(enable_unsafe_noiommu_mode, 89 noiommu, bool, S_IRUGO | S_IWUSR); 90 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); 91 #endif 92 93 static DEFINE_XARRAY(vfio_device_set_xa); 94 static const struct file_operations vfio_group_fops; 95 96 int vfio_assign_device_set(struct vfio_device *device, void *set_id) 97 { 98 unsigned long idx = (unsigned long)set_id; 99 struct vfio_device_set *new_dev_set; 100 struct vfio_device_set *dev_set; 101 102 if (WARN_ON(!set_id)) 103 return -EINVAL; 104 105 /* 106 * Atomically acquire a singleton object in the xarray for this set_id 107 */ 108 xa_lock(&vfio_device_set_xa); 109 dev_set = xa_load(&vfio_device_set_xa, idx); 110 if (dev_set) 111 goto found_get_ref; 112 xa_unlock(&vfio_device_set_xa); 113 114 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL); 115 if (!new_dev_set) 116 return -ENOMEM; 117 mutex_init(&new_dev_set->lock); 118 INIT_LIST_HEAD(&new_dev_set->device_list); 119 new_dev_set->set_id = set_id; 120 121 xa_lock(&vfio_device_set_xa); 122 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set, 123 GFP_KERNEL); 124 if (!dev_set) { 125 dev_set = new_dev_set; 126 goto found_get_ref; 127 } 128 129 kfree(new_dev_set); 130 if (xa_is_err(dev_set)) { 131 xa_unlock(&vfio_device_set_xa); 132 return xa_err(dev_set); 133 } 134 135 found_get_ref: 136 dev_set->device_count++; 137 xa_unlock(&vfio_device_set_xa); 138 mutex_lock(&dev_set->lock); 139 device->dev_set = dev_set; 140 list_add_tail(&device->dev_set_list, &dev_set->device_list); 141 mutex_unlock(&dev_set->lock); 142 return 0; 143 } 144 EXPORT_SYMBOL_GPL(vfio_assign_device_set); 145 146 static void vfio_release_device_set(struct vfio_device *device) 147 { 148 struct vfio_device_set *dev_set = device->dev_set; 149 150 if (!dev_set) 151 return; 152 153 mutex_lock(&dev_set->lock); 154 list_del(&device->dev_set_list); 155 mutex_unlock(&dev_set->lock); 156 157 xa_lock(&vfio_device_set_xa); 158 if (!--dev_set->device_count) { 159 __xa_erase(&vfio_device_set_xa, 160 (unsigned long)dev_set->set_id); 161 mutex_destroy(&dev_set->lock); 162 kfree(dev_set); 163 } 164 xa_unlock(&vfio_device_set_xa); 165 } 166 167 #ifdef CONFIG_VFIO_NOIOMMU 168 static void *vfio_noiommu_open(unsigned long arg) 169 { 170 if (arg != VFIO_NOIOMMU_IOMMU) 171 return ERR_PTR(-EINVAL); 172 if (!capable(CAP_SYS_RAWIO)) 173 return ERR_PTR(-EPERM); 174 175 return NULL; 176 } 177 178 static void vfio_noiommu_release(void *iommu_data) 179 { 180 } 181 182 static long vfio_noiommu_ioctl(void *iommu_data, 183 unsigned int cmd, unsigned long arg) 184 { 185 if (cmd == VFIO_CHECK_EXTENSION) 186 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; 187 188 return -ENOTTY; 189 } 190 191 static int vfio_noiommu_attach_group(void *iommu_data, 192 struct iommu_group *iommu_group, enum vfio_group_type type) 193 { 194 return 0; 195 } 196 197 static void vfio_noiommu_detach_group(void *iommu_data, 198 struct iommu_group *iommu_group) 199 { 200 } 201 202 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { 203 .name = "vfio-noiommu", 204 .owner = THIS_MODULE, 205 .open = vfio_noiommu_open, 206 .release = vfio_noiommu_release, 207 .ioctl = vfio_noiommu_ioctl, 208 .attach_group = vfio_noiommu_attach_group, 209 .detach_group = vfio_noiommu_detach_group, 210 }; 211 212 /* 213 * Only noiommu containers can use vfio-noiommu and noiommu containers can only 214 * use vfio-noiommu. 215 */ 216 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, 217 const struct vfio_iommu_driver *driver) 218 { 219 return container->noiommu == (driver->ops == &vfio_noiommu_ops); 220 } 221 #else 222 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, 223 const struct vfio_iommu_driver *driver) 224 { 225 return true; 226 } 227 #endif /* CONFIG_VFIO_NOIOMMU */ 228 229 /* 230 * IOMMU driver registration 231 */ 232 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) 233 { 234 struct vfio_iommu_driver *driver, *tmp; 235 236 if (WARN_ON(!ops->register_device != !ops->unregister_device)) 237 return -EINVAL; 238 239 driver = kzalloc(sizeof(*driver), GFP_KERNEL); 240 if (!driver) 241 return -ENOMEM; 242 243 driver->ops = ops; 244 245 mutex_lock(&vfio.iommu_drivers_lock); 246 247 /* Check for duplicates */ 248 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { 249 if (tmp->ops == ops) { 250 mutex_unlock(&vfio.iommu_drivers_lock); 251 kfree(driver); 252 return -EINVAL; 253 } 254 } 255 256 list_add(&driver->vfio_next, &vfio.iommu_drivers_list); 257 258 mutex_unlock(&vfio.iommu_drivers_lock); 259 260 return 0; 261 } 262 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); 263 264 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) 265 { 266 struct vfio_iommu_driver *driver; 267 268 mutex_lock(&vfio.iommu_drivers_lock); 269 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 270 if (driver->ops == ops) { 271 list_del(&driver->vfio_next); 272 mutex_unlock(&vfio.iommu_drivers_lock); 273 kfree(driver); 274 return; 275 } 276 } 277 mutex_unlock(&vfio.iommu_drivers_lock); 278 } 279 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); 280 281 static void vfio_group_get(struct vfio_group *group); 282 283 /* 284 * Container objects - containers are created when /dev/vfio/vfio is 285 * opened, but their lifecycle extends until the last user is done, so 286 * it's freed via kref. Must support container/group/device being 287 * closed in any order. 288 */ 289 static void vfio_container_get(struct vfio_container *container) 290 { 291 kref_get(&container->kref); 292 } 293 294 static void vfio_container_release(struct kref *kref) 295 { 296 struct vfio_container *container; 297 container = container_of(kref, struct vfio_container, kref); 298 299 kfree(container); 300 } 301 302 static void vfio_container_put(struct vfio_container *container) 303 { 304 kref_put(&container->kref, vfio_container_release); 305 } 306 307 /* 308 * Group objects - create, release, get, put, search 309 */ 310 static struct vfio_group * 311 __vfio_group_get_from_iommu(struct iommu_group *iommu_group) 312 { 313 struct vfio_group *group; 314 315 list_for_each_entry(group, &vfio.group_list, vfio_next) { 316 if (group->iommu_group == iommu_group) { 317 vfio_group_get(group); 318 return group; 319 } 320 } 321 return NULL; 322 } 323 324 static struct vfio_group * 325 vfio_group_get_from_iommu(struct iommu_group *iommu_group) 326 { 327 struct vfio_group *group; 328 329 mutex_lock(&vfio.group_lock); 330 group = __vfio_group_get_from_iommu(iommu_group); 331 mutex_unlock(&vfio.group_lock); 332 return group; 333 } 334 335 static void vfio_group_release(struct device *dev) 336 { 337 struct vfio_group *group = container_of(dev, struct vfio_group, dev); 338 339 mutex_destroy(&group->device_lock); 340 iommu_group_put(group->iommu_group); 341 ida_free(&vfio.group_ida, MINOR(group->dev.devt)); 342 kfree(group); 343 } 344 345 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, 346 enum vfio_group_type type) 347 { 348 struct vfio_group *group; 349 int minor; 350 351 group = kzalloc(sizeof(*group), GFP_KERNEL); 352 if (!group) 353 return ERR_PTR(-ENOMEM); 354 355 minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); 356 if (minor < 0) { 357 kfree(group); 358 return ERR_PTR(minor); 359 } 360 361 device_initialize(&group->dev); 362 group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); 363 group->dev.class = vfio.class; 364 group->dev.release = vfio_group_release; 365 cdev_init(&group->cdev, &vfio_group_fops); 366 group->cdev.owner = THIS_MODULE; 367 368 refcount_set(&group->users, 1); 369 init_rwsem(&group->group_rwsem); 370 INIT_LIST_HEAD(&group->device_list); 371 mutex_init(&group->device_lock); 372 group->iommu_group = iommu_group; 373 /* put in vfio_group_release() */ 374 iommu_group_ref_get(iommu_group); 375 group->type = type; 376 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); 377 378 return group; 379 } 380 381 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, 382 enum vfio_group_type type) 383 { 384 struct vfio_group *group; 385 struct vfio_group *ret; 386 int err; 387 388 group = vfio_group_alloc(iommu_group, type); 389 if (IS_ERR(group)) 390 return group; 391 392 err = dev_set_name(&group->dev, "%s%d", 393 group->type == VFIO_NO_IOMMU ? "noiommu-" : "", 394 iommu_group_id(iommu_group)); 395 if (err) { 396 ret = ERR_PTR(err); 397 goto err_put; 398 } 399 400 mutex_lock(&vfio.group_lock); 401 402 /* Did we race creating this group? */ 403 ret = __vfio_group_get_from_iommu(iommu_group); 404 if (ret) 405 goto err_unlock; 406 407 err = cdev_device_add(&group->cdev, &group->dev); 408 if (err) { 409 ret = ERR_PTR(err); 410 goto err_unlock; 411 } 412 413 list_add(&group->vfio_next, &vfio.group_list); 414 415 mutex_unlock(&vfio.group_lock); 416 return group; 417 418 err_unlock: 419 mutex_unlock(&vfio.group_lock); 420 err_put: 421 put_device(&group->dev); 422 return ret; 423 } 424 425 static void vfio_group_put(struct vfio_group *group) 426 { 427 if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock)) 428 return; 429 430 /* 431 * These data structures all have paired operations that can only be 432 * undone when the caller holds a live reference on the group. Since all 433 * pairs must be undone these WARN_ON's indicate some caller did not 434 * properly hold the group reference. 435 */ 436 WARN_ON(!list_empty(&group->device_list)); 437 WARN_ON(group->container || group->container_users); 438 WARN_ON(group->notifier.head); 439 440 list_del(&group->vfio_next); 441 cdev_device_del(&group->cdev, &group->dev); 442 mutex_unlock(&vfio.group_lock); 443 444 put_device(&group->dev); 445 } 446 447 static void vfio_group_get(struct vfio_group *group) 448 { 449 refcount_inc(&group->users); 450 } 451 452 /* 453 * Device objects - create, release, get, put, search 454 */ 455 /* Device reference always implies a group reference */ 456 static void vfio_device_put(struct vfio_device *device) 457 { 458 if (refcount_dec_and_test(&device->refcount)) 459 complete(&device->comp); 460 } 461 462 static bool vfio_device_try_get(struct vfio_device *device) 463 { 464 return refcount_inc_not_zero(&device->refcount); 465 } 466 467 static struct vfio_device *vfio_group_get_device(struct vfio_group *group, 468 struct device *dev) 469 { 470 struct vfio_device *device; 471 472 mutex_lock(&group->device_lock); 473 list_for_each_entry(device, &group->device_list, group_next) { 474 if (device->dev == dev && vfio_device_try_get(device)) { 475 mutex_unlock(&group->device_lock); 476 return device; 477 } 478 } 479 mutex_unlock(&group->device_lock); 480 return NULL; 481 } 482 483 /* 484 * VFIO driver API 485 */ 486 void vfio_init_group_dev(struct vfio_device *device, struct device *dev, 487 const struct vfio_device_ops *ops) 488 { 489 init_completion(&device->comp); 490 device->dev = dev; 491 device->ops = ops; 492 } 493 EXPORT_SYMBOL_GPL(vfio_init_group_dev); 494 495 void vfio_uninit_group_dev(struct vfio_device *device) 496 { 497 vfio_release_device_set(device); 498 } 499 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); 500 501 /* Release helper called by vfio_put_device() */ 502 void vfio_device_release(struct kref *kref) 503 { 504 struct vfio_device *device = 505 container_of(kref, struct vfio_device, kref); 506 507 vfio_uninit_group_dev(device); 508 509 /* 510 * kvfree() cannot be done here due to a life cycle mess in 511 * vfio-ccw. Before the ccw part is fixed all drivers are 512 * required to support @release and call vfio_free_device() 513 * from there. 514 */ 515 device->ops->release(device); 516 } 517 EXPORT_SYMBOL_GPL(vfio_device_release); 518 519 /* 520 * Allocate and initialize vfio_device so it can be registered to vfio 521 * core. 522 * 523 * Drivers should use the wrapper vfio_alloc_device() for allocation. 524 * @size is the size of the structure to be allocated, including any 525 * private data used by the driver. 526 * 527 * Driver may provide an @init callback to cover device private data. 528 * 529 * Use vfio_put_device() to release the structure after success return. 530 */ 531 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, 532 const struct vfio_device_ops *ops) 533 { 534 struct vfio_device *device; 535 int ret; 536 537 if (WARN_ON(size < sizeof(struct vfio_device))) 538 return ERR_PTR(-EINVAL); 539 540 device = kvzalloc(size, GFP_KERNEL); 541 if (!device) 542 return ERR_PTR(-ENOMEM); 543 544 ret = vfio_init_device(device, dev, ops); 545 if (ret) 546 goto out_free; 547 return device; 548 549 out_free: 550 kvfree(device); 551 return ERR_PTR(ret); 552 } 553 EXPORT_SYMBOL_GPL(_vfio_alloc_device); 554 555 /* 556 * Initialize a vfio_device so it can be registered to vfio core. 557 * 558 * Only vfio-ccw driver should call this interface. 559 */ 560 int vfio_init_device(struct vfio_device *device, struct device *dev, 561 const struct vfio_device_ops *ops) 562 { 563 int ret; 564 565 vfio_init_group_dev(device, dev, ops); 566 567 if (ops->init) { 568 ret = ops->init(device); 569 if (ret) 570 goto out_uninit; 571 } 572 573 kref_init(&device->kref); 574 return 0; 575 576 out_uninit: 577 vfio_uninit_group_dev(device); 578 return ret; 579 } 580 EXPORT_SYMBOL_GPL(vfio_init_device); 581 582 /* 583 * The helper called by driver @release callback to free the device 584 * structure. Drivers which don't have private data to clean can 585 * simply use this helper as its @release. 586 */ 587 void vfio_free_device(struct vfio_device *device) 588 { 589 kvfree(device); 590 } 591 EXPORT_SYMBOL_GPL(vfio_free_device); 592 593 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, 594 enum vfio_group_type type) 595 { 596 struct iommu_group *iommu_group; 597 struct vfio_group *group; 598 int ret; 599 600 iommu_group = iommu_group_alloc(); 601 if (IS_ERR(iommu_group)) 602 return ERR_CAST(iommu_group); 603 604 ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); 605 if (ret) 606 goto out_put_group; 607 ret = iommu_group_add_device(iommu_group, dev); 608 if (ret) 609 goto out_put_group; 610 611 group = vfio_create_group(iommu_group, type); 612 if (IS_ERR(group)) { 613 ret = PTR_ERR(group); 614 goto out_remove_device; 615 } 616 iommu_group_put(iommu_group); 617 return group; 618 619 out_remove_device: 620 iommu_group_remove_device(dev); 621 out_put_group: 622 iommu_group_put(iommu_group); 623 return ERR_PTR(ret); 624 } 625 626 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) 627 { 628 struct iommu_group *iommu_group; 629 struct vfio_group *group; 630 631 iommu_group = iommu_group_get(dev); 632 #ifdef CONFIG_VFIO_NOIOMMU 633 if (!iommu_group && noiommu) { 634 /* 635 * With noiommu enabled, create an IOMMU group for devices that 636 * don't already have one, implying no IOMMU hardware/driver 637 * exists. Taint the kernel because we're about to give a DMA 638 * capable device to a user without IOMMU protection. 639 */ 640 group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); 641 if (!IS_ERR(group)) { 642 add_taint(TAINT_USER, LOCKDEP_STILL_OK); 643 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); 644 } 645 return group; 646 } 647 #endif 648 if (!iommu_group) 649 return ERR_PTR(-EINVAL); 650 651 /* 652 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to 653 * restore cache coherency. It has to be checked here because it is only 654 * valid for cases where we are using iommu groups. 655 */ 656 if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { 657 iommu_group_put(iommu_group); 658 return ERR_PTR(-EINVAL); 659 } 660 661 group = vfio_group_get_from_iommu(iommu_group); 662 if (!group) 663 group = vfio_create_group(iommu_group, VFIO_IOMMU); 664 665 /* The vfio_group holds a reference to the iommu_group */ 666 iommu_group_put(iommu_group); 667 return group; 668 } 669 670 static int __vfio_register_dev(struct vfio_device *device, 671 struct vfio_group *group) 672 { 673 struct vfio_device *existing_device; 674 675 if (IS_ERR(group)) 676 return PTR_ERR(group); 677 678 /* 679 * If the driver doesn't specify a set then the device is added to a 680 * singleton set just for itself. 681 */ 682 if (!device->dev_set) 683 vfio_assign_device_set(device, device); 684 685 existing_device = vfio_group_get_device(group, device->dev); 686 if (existing_device) { 687 dev_WARN(device->dev, "Device already exists on group %d\n", 688 iommu_group_id(group->iommu_group)); 689 vfio_device_put(existing_device); 690 if (group->type == VFIO_NO_IOMMU || 691 group->type == VFIO_EMULATED_IOMMU) 692 iommu_group_remove_device(device->dev); 693 vfio_group_put(group); 694 return -EBUSY; 695 } 696 697 /* Our reference on group is moved to the device */ 698 device->group = group; 699 700 /* Refcounting can't start until the driver calls register */ 701 refcount_set(&device->refcount, 1); 702 703 mutex_lock(&group->device_lock); 704 list_add(&device->group_next, &group->device_list); 705 mutex_unlock(&group->device_lock); 706 707 return 0; 708 } 709 710 int vfio_register_group_dev(struct vfio_device *device) 711 { 712 return __vfio_register_dev(device, 713 vfio_group_find_or_alloc(device->dev)); 714 } 715 EXPORT_SYMBOL_GPL(vfio_register_group_dev); 716 717 /* 718 * Register a virtual device without IOMMU backing. The user of this 719 * device must not be able to directly trigger unmediated DMA. 720 */ 721 int vfio_register_emulated_iommu_dev(struct vfio_device *device) 722 { 723 return __vfio_register_dev(device, 724 vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU)); 725 } 726 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); 727 728 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, 729 char *buf) 730 { 731 struct vfio_device *it, *device = ERR_PTR(-ENODEV); 732 733 mutex_lock(&group->device_lock); 734 list_for_each_entry(it, &group->device_list, group_next) { 735 int ret; 736 737 if (it->ops->match) { 738 ret = it->ops->match(it, buf); 739 if (ret < 0) { 740 device = ERR_PTR(ret); 741 break; 742 } 743 } else { 744 ret = !strcmp(dev_name(it->dev), buf); 745 } 746 747 if (ret && vfio_device_try_get(it)) { 748 device = it; 749 break; 750 } 751 } 752 mutex_unlock(&group->device_lock); 753 754 return device; 755 } 756 757 /* 758 * Decrement the device reference count and wait for the device to be 759 * removed. Open file descriptors for the device... */ 760 void vfio_unregister_group_dev(struct vfio_device *device) 761 { 762 struct vfio_group *group = device->group; 763 unsigned int i = 0; 764 bool interrupted = false; 765 long rc; 766 767 vfio_device_put(device); 768 rc = try_wait_for_completion(&device->comp); 769 while (rc <= 0) { 770 if (device->ops->request) 771 device->ops->request(device, i++); 772 773 if (interrupted) { 774 rc = wait_for_completion_timeout(&device->comp, 775 HZ * 10); 776 } else { 777 rc = wait_for_completion_interruptible_timeout( 778 &device->comp, HZ * 10); 779 if (rc < 0) { 780 interrupted = true; 781 dev_warn(device->dev, 782 "Device is currently in use, task" 783 " \"%s\" (%d) " 784 "blocked until device is released", 785 current->comm, task_pid_nr(current)); 786 } 787 } 788 } 789 790 mutex_lock(&group->device_lock); 791 list_del(&device->group_next); 792 mutex_unlock(&group->device_lock); 793 794 if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) 795 iommu_group_remove_device(device->dev); 796 797 /* Matches the get in vfio_register_group_dev() */ 798 vfio_group_put(group); 799 } 800 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); 801 802 /* 803 * VFIO base fd, /dev/vfio/vfio 804 */ 805 static long vfio_ioctl_check_extension(struct vfio_container *container, 806 unsigned long arg) 807 { 808 struct vfio_iommu_driver *driver; 809 long ret = 0; 810 811 down_read(&container->group_lock); 812 813 driver = container->iommu_driver; 814 815 switch (arg) { 816 /* No base extensions yet */ 817 default: 818 /* 819 * If no driver is set, poll all registered drivers for 820 * extensions and return the first positive result. If 821 * a driver is already set, further queries will be passed 822 * only to that driver. 823 */ 824 if (!driver) { 825 mutex_lock(&vfio.iommu_drivers_lock); 826 list_for_each_entry(driver, &vfio.iommu_drivers_list, 827 vfio_next) { 828 829 if (!list_empty(&container->group_list) && 830 !vfio_iommu_driver_allowed(container, 831 driver)) 832 continue; 833 if (!try_module_get(driver->ops->owner)) 834 continue; 835 836 ret = driver->ops->ioctl(NULL, 837 VFIO_CHECK_EXTENSION, 838 arg); 839 module_put(driver->ops->owner); 840 if (ret > 0) 841 break; 842 } 843 mutex_unlock(&vfio.iommu_drivers_lock); 844 } else 845 ret = driver->ops->ioctl(container->iommu_data, 846 VFIO_CHECK_EXTENSION, arg); 847 } 848 849 up_read(&container->group_lock); 850 851 return ret; 852 } 853 854 /* hold write lock on container->group_lock */ 855 static int __vfio_container_attach_groups(struct vfio_container *container, 856 struct vfio_iommu_driver *driver, 857 void *data) 858 { 859 struct vfio_group *group; 860 int ret = -ENODEV; 861 862 list_for_each_entry(group, &container->group_list, container_next) { 863 ret = driver->ops->attach_group(data, group->iommu_group, 864 group->type); 865 if (ret) 866 goto unwind; 867 } 868 869 return ret; 870 871 unwind: 872 list_for_each_entry_continue_reverse(group, &container->group_list, 873 container_next) { 874 driver->ops->detach_group(data, group->iommu_group); 875 } 876 877 return ret; 878 } 879 880 static long vfio_ioctl_set_iommu(struct vfio_container *container, 881 unsigned long arg) 882 { 883 struct vfio_iommu_driver *driver; 884 long ret = -ENODEV; 885 886 down_write(&container->group_lock); 887 888 /* 889 * The container is designed to be an unprivileged interface while 890 * the group can be assigned to specific users. Therefore, only by 891 * adding a group to a container does the user get the privilege of 892 * enabling the iommu, which may allocate finite resources. There 893 * is no unset_iommu, but by removing all the groups from a container, 894 * the container is deprivileged and returns to an unset state. 895 */ 896 if (list_empty(&container->group_list) || container->iommu_driver) { 897 up_write(&container->group_lock); 898 return -EINVAL; 899 } 900 901 mutex_lock(&vfio.iommu_drivers_lock); 902 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 903 void *data; 904 905 if (!vfio_iommu_driver_allowed(container, driver)) 906 continue; 907 if (!try_module_get(driver->ops->owner)) 908 continue; 909 910 /* 911 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, 912 * so test which iommu driver reported support for this 913 * extension and call open on them. We also pass them the 914 * magic, allowing a single driver to support multiple 915 * interfaces if they'd like. 916 */ 917 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { 918 module_put(driver->ops->owner); 919 continue; 920 } 921 922 data = driver->ops->open(arg); 923 if (IS_ERR(data)) { 924 ret = PTR_ERR(data); 925 module_put(driver->ops->owner); 926 continue; 927 } 928 929 ret = __vfio_container_attach_groups(container, driver, data); 930 if (ret) { 931 driver->ops->release(data); 932 module_put(driver->ops->owner); 933 continue; 934 } 935 936 container->iommu_driver = driver; 937 container->iommu_data = data; 938 break; 939 } 940 941 mutex_unlock(&vfio.iommu_drivers_lock); 942 up_write(&container->group_lock); 943 944 return ret; 945 } 946 947 static long vfio_fops_unl_ioctl(struct file *filep, 948 unsigned int cmd, unsigned long arg) 949 { 950 struct vfio_container *container = filep->private_data; 951 struct vfio_iommu_driver *driver; 952 void *data; 953 long ret = -EINVAL; 954 955 if (!container) 956 return ret; 957 958 switch (cmd) { 959 case VFIO_GET_API_VERSION: 960 ret = VFIO_API_VERSION; 961 break; 962 case VFIO_CHECK_EXTENSION: 963 ret = vfio_ioctl_check_extension(container, arg); 964 break; 965 case VFIO_SET_IOMMU: 966 ret = vfio_ioctl_set_iommu(container, arg); 967 break; 968 default: 969 driver = container->iommu_driver; 970 data = container->iommu_data; 971 972 if (driver) /* passthrough all unrecognized ioctls */ 973 ret = driver->ops->ioctl(data, cmd, arg); 974 } 975 976 return ret; 977 } 978 979 static int vfio_fops_open(struct inode *inode, struct file *filep) 980 { 981 struct vfio_container *container; 982 983 container = kzalloc(sizeof(*container), GFP_KERNEL); 984 if (!container) 985 return -ENOMEM; 986 987 INIT_LIST_HEAD(&container->group_list); 988 init_rwsem(&container->group_lock); 989 kref_init(&container->kref); 990 991 filep->private_data = container; 992 993 return 0; 994 } 995 996 static int vfio_fops_release(struct inode *inode, struct file *filep) 997 { 998 struct vfio_container *container = filep->private_data; 999 struct vfio_iommu_driver *driver = container->iommu_driver; 1000 1001 if (driver && driver->ops->notify) 1002 driver->ops->notify(container->iommu_data, 1003 VFIO_IOMMU_CONTAINER_CLOSE); 1004 1005 filep->private_data = NULL; 1006 1007 vfio_container_put(container); 1008 1009 return 0; 1010 } 1011 1012 static const struct file_operations vfio_fops = { 1013 .owner = THIS_MODULE, 1014 .open = vfio_fops_open, 1015 .release = vfio_fops_release, 1016 .unlocked_ioctl = vfio_fops_unl_ioctl, 1017 .compat_ioctl = compat_ptr_ioctl, 1018 }; 1019 1020 /* 1021 * VFIO Group fd, /dev/vfio/$GROUP 1022 */ 1023 static void __vfio_group_unset_container(struct vfio_group *group) 1024 { 1025 struct vfio_container *container = group->container; 1026 struct vfio_iommu_driver *driver; 1027 1028 lockdep_assert_held_write(&group->group_rwsem); 1029 1030 down_write(&container->group_lock); 1031 1032 driver = container->iommu_driver; 1033 if (driver) 1034 driver->ops->detach_group(container->iommu_data, 1035 group->iommu_group); 1036 1037 if (group->type == VFIO_IOMMU) 1038 iommu_group_release_dma_owner(group->iommu_group); 1039 1040 group->container = NULL; 1041 group->container_users = 0; 1042 list_del(&group->container_next); 1043 1044 /* Detaching the last group deprivileges a container, remove iommu */ 1045 if (driver && list_empty(&container->group_list)) { 1046 driver->ops->release(container->iommu_data); 1047 module_put(driver->ops->owner); 1048 container->iommu_driver = NULL; 1049 container->iommu_data = NULL; 1050 } 1051 1052 up_write(&container->group_lock); 1053 1054 vfio_container_put(container); 1055 } 1056 1057 /* 1058 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or 1059 * if there was no container to unset. Since the ioctl is called on 1060 * the group, we know that still exists, therefore the only valid 1061 * transition here is 1->0. 1062 */ 1063 static int vfio_group_ioctl_unset_container(struct vfio_group *group) 1064 { 1065 int ret = 0; 1066 1067 down_write(&group->group_rwsem); 1068 if (!group->container) { 1069 ret = -EINVAL; 1070 goto out_unlock; 1071 } 1072 if (group->container_users != 1) { 1073 ret = -EBUSY; 1074 goto out_unlock; 1075 } 1076 __vfio_group_unset_container(group); 1077 1078 out_unlock: 1079 up_write(&group->group_rwsem); 1080 return ret; 1081 } 1082 1083 static int vfio_group_ioctl_set_container(struct vfio_group *group, 1084 int __user *arg) 1085 { 1086 struct fd f; 1087 struct vfio_container *container; 1088 struct vfio_iommu_driver *driver; 1089 int container_fd; 1090 int ret = 0; 1091 1092 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) 1093 return -EPERM; 1094 1095 if (get_user(container_fd, arg)) 1096 return -EFAULT; 1097 if (container_fd < 0) 1098 return -EINVAL; 1099 f = fdget(container_fd); 1100 if (!f.file) 1101 return -EBADF; 1102 1103 /* Sanity check, is this really our fd? */ 1104 if (f.file->f_op != &vfio_fops) { 1105 ret = -EINVAL; 1106 goto out_fdput; 1107 } 1108 container = f.file->private_data; 1109 WARN_ON(!container); /* fget ensures we don't race vfio_release */ 1110 1111 down_write(&group->group_rwsem); 1112 1113 if (group->container || WARN_ON(group->container_users)) { 1114 ret = -EINVAL; 1115 goto out_unlock_group; 1116 } 1117 1118 down_write(&container->group_lock); 1119 1120 /* Real groups and fake groups cannot mix */ 1121 if (!list_empty(&container->group_list) && 1122 container->noiommu != (group->type == VFIO_NO_IOMMU)) { 1123 ret = -EPERM; 1124 goto out_unlock_container; 1125 } 1126 1127 if (group->type == VFIO_IOMMU) { 1128 ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); 1129 if (ret) 1130 goto out_unlock_container; 1131 } 1132 1133 driver = container->iommu_driver; 1134 if (driver) { 1135 ret = driver->ops->attach_group(container->iommu_data, 1136 group->iommu_group, 1137 group->type); 1138 if (ret) { 1139 if (group->type == VFIO_IOMMU) 1140 iommu_group_release_dma_owner( 1141 group->iommu_group); 1142 goto out_unlock_container; 1143 } 1144 } 1145 1146 group->container = container; 1147 group->container_users = 1; 1148 container->noiommu = (group->type == VFIO_NO_IOMMU); 1149 list_add(&group->container_next, &container->group_list); 1150 1151 /* Get a reference on the container and mark a user within the group */ 1152 vfio_container_get(container); 1153 1154 out_unlock_container: 1155 up_write(&container->group_lock); 1156 out_unlock_group: 1157 up_write(&group->group_rwsem); 1158 out_fdput: 1159 fdput(f); 1160 return ret; 1161 } 1162 1163 static const struct file_operations vfio_device_fops; 1164 1165 /* true if the vfio_device has open_device() called but not close_device() */ 1166 static bool vfio_assert_device_open(struct vfio_device *device) 1167 { 1168 return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); 1169 } 1170 1171 static int vfio_device_assign_container(struct vfio_device *device) 1172 { 1173 struct vfio_group *group = device->group; 1174 1175 lockdep_assert_held_write(&group->group_rwsem); 1176 1177 if (!group->container || !group->container->iommu_driver || 1178 WARN_ON(!group->container_users)) 1179 return -EINVAL; 1180 1181 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) 1182 return -EPERM; 1183 1184 get_file(group->opened_file); 1185 group->container_users++; 1186 return 0; 1187 } 1188 1189 static void vfio_device_unassign_container(struct vfio_device *device) 1190 { 1191 down_write(&device->group->group_rwsem); 1192 WARN_ON(device->group->container_users <= 1); 1193 device->group->container_users--; 1194 fput(device->group->opened_file); 1195 up_write(&device->group->group_rwsem); 1196 } 1197 1198 static struct file *vfio_device_open(struct vfio_device *device) 1199 { 1200 struct vfio_iommu_driver *iommu_driver; 1201 struct file *filep; 1202 int ret; 1203 1204 down_write(&device->group->group_rwsem); 1205 ret = vfio_device_assign_container(device); 1206 up_write(&device->group->group_rwsem); 1207 if (ret) 1208 return ERR_PTR(ret); 1209 1210 if (!try_module_get(device->dev->driver->owner)) { 1211 ret = -ENODEV; 1212 goto err_unassign_container; 1213 } 1214 1215 mutex_lock(&device->dev_set->lock); 1216 device->open_count++; 1217 if (device->open_count == 1) { 1218 /* 1219 * Here we pass the KVM pointer with the group under the read 1220 * lock. If the device driver will use it, it must obtain a 1221 * reference and release it during close_device. 1222 */ 1223 down_read(&device->group->group_rwsem); 1224 device->kvm = device->group->kvm; 1225 1226 if (device->ops->open_device) { 1227 ret = device->ops->open_device(device); 1228 if (ret) 1229 goto err_undo_count; 1230 } 1231 1232 iommu_driver = device->group->container->iommu_driver; 1233 if (iommu_driver && iommu_driver->ops->register_device) 1234 iommu_driver->ops->register_device( 1235 device->group->container->iommu_data, device); 1236 1237 up_read(&device->group->group_rwsem); 1238 } 1239 mutex_unlock(&device->dev_set->lock); 1240 1241 /* 1242 * We can't use anon_inode_getfd() because we need to modify 1243 * the f_mode flags directly to allow more than just ioctls 1244 */ 1245 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, 1246 device, O_RDWR); 1247 if (IS_ERR(filep)) { 1248 ret = PTR_ERR(filep); 1249 goto err_close_device; 1250 } 1251 1252 /* 1253 * TODO: add an anon_inode interface to do this. 1254 * Appears to be missing by lack of need rather than 1255 * explicitly prevented. Now there's need. 1256 */ 1257 filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); 1258 1259 if (device->group->type == VFIO_NO_IOMMU) 1260 dev_warn(device->dev, "vfio-noiommu device opened by user " 1261 "(%s:%d)\n", current->comm, task_pid_nr(current)); 1262 /* 1263 * On success the ref of device is moved to the file and 1264 * put in vfio_device_fops_release() 1265 */ 1266 return filep; 1267 1268 err_close_device: 1269 mutex_lock(&device->dev_set->lock); 1270 down_read(&device->group->group_rwsem); 1271 if (device->open_count == 1 && device->ops->close_device) { 1272 device->ops->close_device(device); 1273 1274 iommu_driver = device->group->container->iommu_driver; 1275 if (iommu_driver && iommu_driver->ops->unregister_device) 1276 iommu_driver->ops->unregister_device( 1277 device->group->container->iommu_data, device); 1278 } 1279 err_undo_count: 1280 up_read(&device->group->group_rwsem); 1281 device->open_count--; 1282 if (device->open_count == 0 && device->kvm) 1283 device->kvm = NULL; 1284 mutex_unlock(&device->dev_set->lock); 1285 module_put(device->dev->driver->owner); 1286 err_unassign_container: 1287 vfio_device_unassign_container(device); 1288 return ERR_PTR(ret); 1289 } 1290 1291 static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, 1292 char __user *arg) 1293 { 1294 struct vfio_device *device; 1295 struct file *filep; 1296 char *buf; 1297 int fdno; 1298 int ret; 1299 1300 buf = strndup_user(arg, PAGE_SIZE); 1301 if (IS_ERR(buf)) 1302 return PTR_ERR(buf); 1303 1304 device = vfio_device_get_from_name(group, buf); 1305 kfree(buf); 1306 if (IS_ERR(device)) 1307 return PTR_ERR(device); 1308 1309 fdno = get_unused_fd_flags(O_CLOEXEC); 1310 if (fdno < 0) { 1311 ret = fdno; 1312 goto err_put_device; 1313 } 1314 1315 filep = vfio_device_open(device); 1316 if (IS_ERR(filep)) { 1317 ret = PTR_ERR(filep); 1318 goto err_put_fdno; 1319 } 1320 1321 fd_install(fdno, filep); 1322 return fdno; 1323 1324 err_put_fdno: 1325 put_unused_fd(fdno); 1326 err_put_device: 1327 vfio_device_put(device); 1328 return ret; 1329 } 1330 1331 static int vfio_group_ioctl_get_status(struct vfio_group *group, 1332 struct vfio_group_status __user *arg) 1333 { 1334 unsigned long minsz = offsetofend(struct vfio_group_status, flags); 1335 struct vfio_group_status status; 1336 1337 if (copy_from_user(&status, arg, minsz)) 1338 return -EFAULT; 1339 1340 if (status.argsz < minsz) 1341 return -EINVAL; 1342 1343 status.flags = 0; 1344 1345 down_read(&group->group_rwsem); 1346 if (group->container) 1347 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | 1348 VFIO_GROUP_FLAGS_VIABLE; 1349 else if (!iommu_group_dma_owner_claimed(group->iommu_group)) 1350 status.flags |= VFIO_GROUP_FLAGS_VIABLE; 1351 up_read(&group->group_rwsem); 1352 1353 if (copy_to_user(arg, &status, minsz)) 1354 return -EFAULT; 1355 return 0; 1356 } 1357 1358 static long vfio_group_fops_unl_ioctl(struct file *filep, 1359 unsigned int cmd, unsigned long arg) 1360 { 1361 struct vfio_group *group = filep->private_data; 1362 void __user *uarg = (void __user *)arg; 1363 1364 switch (cmd) { 1365 case VFIO_GROUP_GET_DEVICE_FD: 1366 return vfio_group_ioctl_get_device_fd(group, uarg); 1367 case VFIO_GROUP_GET_STATUS: 1368 return vfio_group_ioctl_get_status(group, uarg); 1369 case VFIO_GROUP_SET_CONTAINER: 1370 return vfio_group_ioctl_set_container(group, uarg); 1371 case VFIO_GROUP_UNSET_CONTAINER: 1372 return vfio_group_ioctl_unset_container(group); 1373 default: 1374 return -ENOTTY; 1375 } 1376 } 1377 1378 static int vfio_group_fops_open(struct inode *inode, struct file *filep) 1379 { 1380 struct vfio_group *group = 1381 container_of(inode->i_cdev, struct vfio_group, cdev); 1382 int ret; 1383 1384 down_write(&group->group_rwsem); 1385 1386 /* users can be zero if this races with vfio_group_put() */ 1387 if (!refcount_inc_not_zero(&group->users)) { 1388 ret = -ENODEV; 1389 goto err_unlock; 1390 } 1391 1392 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { 1393 ret = -EPERM; 1394 goto err_put; 1395 } 1396 1397 /* 1398 * Do we need multiple instances of the group open? Seems not. 1399 */ 1400 if (group->opened_file) { 1401 ret = -EBUSY; 1402 goto err_put; 1403 } 1404 group->opened_file = filep; 1405 filep->private_data = group; 1406 1407 up_write(&group->group_rwsem); 1408 return 0; 1409 err_put: 1410 vfio_group_put(group); 1411 err_unlock: 1412 up_write(&group->group_rwsem); 1413 return ret; 1414 } 1415 1416 static int vfio_group_fops_release(struct inode *inode, struct file *filep) 1417 { 1418 struct vfio_group *group = filep->private_data; 1419 1420 filep->private_data = NULL; 1421 1422 down_write(&group->group_rwsem); 1423 /* 1424 * Device FDs hold a group file reference, therefore the group release 1425 * is only called when there are no open devices. 1426 */ 1427 WARN_ON(group->notifier.head); 1428 if (group->container) { 1429 WARN_ON(group->container_users != 1); 1430 __vfio_group_unset_container(group); 1431 } 1432 group->opened_file = NULL; 1433 up_write(&group->group_rwsem); 1434 1435 vfio_group_put(group); 1436 1437 return 0; 1438 } 1439 1440 static const struct file_operations vfio_group_fops = { 1441 .owner = THIS_MODULE, 1442 .unlocked_ioctl = vfio_group_fops_unl_ioctl, 1443 .compat_ioctl = compat_ptr_ioctl, 1444 .open = vfio_group_fops_open, 1445 .release = vfio_group_fops_release, 1446 }; 1447 1448 /* 1449 * Wrapper around pm_runtime_resume_and_get(). 1450 * Return error code on failure or 0 on success. 1451 */ 1452 static inline int vfio_device_pm_runtime_get(struct vfio_device *device) 1453 { 1454 struct device *dev = device->dev; 1455 1456 if (dev->driver && dev->driver->pm) { 1457 int ret; 1458 1459 ret = pm_runtime_resume_and_get(dev); 1460 if (ret) { 1461 dev_info_ratelimited(dev, 1462 "vfio: runtime resume failed %d\n", ret); 1463 return -EIO; 1464 } 1465 } 1466 1467 return 0; 1468 } 1469 1470 /* 1471 * Wrapper around pm_runtime_put(). 1472 */ 1473 static inline void vfio_device_pm_runtime_put(struct vfio_device *device) 1474 { 1475 struct device *dev = device->dev; 1476 1477 if (dev->driver && dev->driver->pm) 1478 pm_runtime_put(dev); 1479 } 1480 1481 /* 1482 * VFIO Device fd 1483 */ 1484 static int vfio_device_fops_release(struct inode *inode, struct file *filep) 1485 { 1486 struct vfio_device *device = filep->private_data; 1487 struct vfio_iommu_driver *iommu_driver; 1488 1489 mutex_lock(&device->dev_set->lock); 1490 vfio_assert_device_open(device); 1491 down_read(&device->group->group_rwsem); 1492 if (device->open_count == 1 && device->ops->close_device) 1493 device->ops->close_device(device); 1494 1495 iommu_driver = device->group->container->iommu_driver; 1496 if (iommu_driver && iommu_driver->ops->unregister_device) 1497 iommu_driver->ops->unregister_device( 1498 device->group->container->iommu_data, device); 1499 up_read(&device->group->group_rwsem); 1500 device->open_count--; 1501 if (device->open_count == 0) 1502 device->kvm = NULL; 1503 mutex_unlock(&device->dev_set->lock); 1504 1505 module_put(device->dev->driver->owner); 1506 1507 vfio_device_unassign_container(device); 1508 1509 vfio_device_put(device); 1510 1511 return 0; 1512 } 1513 1514 /* 1515 * vfio_mig_get_next_state - Compute the next step in the FSM 1516 * @cur_fsm - The current state the device is in 1517 * @new_fsm - The target state to reach 1518 * @next_fsm - Pointer to the next step to get to new_fsm 1519 * 1520 * Return 0 upon success, otherwise -errno 1521 * Upon success the next step in the state progression between cur_fsm and 1522 * new_fsm will be set in next_fsm. 1523 * 1524 * This breaks down requests for combination transitions into smaller steps and 1525 * returns the next step to get to new_fsm. The function may need to be called 1526 * multiple times before reaching new_fsm. 1527 * 1528 */ 1529 int vfio_mig_get_next_state(struct vfio_device *device, 1530 enum vfio_device_mig_state cur_fsm, 1531 enum vfio_device_mig_state new_fsm, 1532 enum vfio_device_mig_state *next_fsm) 1533 { 1534 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 }; 1535 /* 1536 * The coding in this table requires the driver to implement the 1537 * following FSM arcs: 1538 * RESUMING -> STOP 1539 * STOP -> RESUMING 1540 * STOP -> STOP_COPY 1541 * STOP_COPY -> STOP 1542 * 1543 * If P2P is supported then the driver must also implement these FSM 1544 * arcs: 1545 * RUNNING -> RUNNING_P2P 1546 * RUNNING_P2P -> RUNNING 1547 * RUNNING_P2P -> STOP 1548 * STOP -> RUNNING_P2P 1549 * Without P2P the driver must implement: 1550 * RUNNING -> STOP 1551 * STOP -> RUNNING 1552 * 1553 * The coding will step through multiple states for some combination 1554 * transitions; if all optional features are supported, this means the 1555 * following ones: 1556 * RESUMING -> STOP -> RUNNING_P2P 1557 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING 1558 * RESUMING -> STOP -> STOP_COPY 1559 * RUNNING -> RUNNING_P2P -> STOP 1560 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING 1561 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY 1562 * RUNNING_P2P -> STOP -> RESUMING 1563 * RUNNING_P2P -> STOP -> STOP_COPY 1564 * STOP -> RUNNING_P2P -> RUNNING 1565 * STOP_COPY -> STOP -> RESUMING 1566 * STOP_COPY -> STOP -> RUNNING_P2P 1567 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING 1568 */ 1569 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { 1570 [VFIO_DEVICE_STATE_STOP] = { 1571 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 1572 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, 1573 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 1574 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 1575 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 1576 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 1577 }, 1578 [VFIO_DEVICE_STATE_RUNNING] = { 1579 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, 1580 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 1581 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, 1582 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, 1583 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 1584 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 1585 }, 1586 [VFIO_DEVICE_STATE_STOP_COPY] = { 1587 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 1588 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 1589 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 1590 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 1591 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, 1592 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 1593 }, 1594 [VFIO_DEVICE_STATE_RESUMING] = { 1595 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 1596 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 1597 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 1598 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 1599 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, 1600 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 1601 }, 1602 [VFIO_DEVICE_STATE_RUNNING_P2P] = { 1603 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 1604 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 1605 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 1606 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 1607 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 1608 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 1609 }, 1610 [VFIO_DEVICE_STATE_ERROR] = { 1611 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, 1612 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, 1613 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, 1614 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, 1615 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, 1616 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 1617 }, 1618 }; 1619 1620 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { 1621 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, 1622 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, 1623 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, 1624 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, 1625 [VFIO_DEVICE_STATE_RUNNING_P2P] = 1626 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P, 1627 [VFIO_DEVICE_STATE_ERROR] = ~0U, 1628 }; 1629 1630 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || 1631 (state_flags_table[cur_fsm] & device->migration_flags) != 1632 state_flags_table[cur_fsm])) 1633 return -EINVAL; 1634 1635 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || 1636 (state_flags_table[new_fsm] & device->migration_flags) != 1637 state_flags_table[new_fsm]) 1638 return -EINVAL; 1639 1640 /* 1641 * Arcs touching optional and unsupported states are skipped over. The 1642 * driver will instead see an arc from the original state to the next 1643 * logical state, as per the above comment. 1644 */ 1645 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm]; 1646 while ((state_flags_table[*next_fsm] & device->migration_flags) != 1647 state_flags_table[*next_fsm]) 1648 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm]; 1649 1650 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL; 1651 } 1652 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state); 1653 1654 /* 1655 * Convert the drivers's struct file into a FD number and return it to userspace 1656 */ 1657 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg, 1658 struct vfio_device_feature_mig_state *mig) 1659 { 1660 int ret; 1661 int fd; 1662 1663 fd = get_unused_fd_flags(O_CLOEXEC); 1664 if (fd < 0) { 1665 ret = fd; 1666 goto out_fput; 1667 } 1668 1669 mig->data_fd = fd; 1670 if (copy_to_user(arg, mig, sizeof(*mig))) { 1671 ret = -EFAULT; 1672 goto out_put_unused; 1673 } 1674 fd_install(fd, filp); 1675 return 0; 1676 1677 out_put_unused: 1678 put_unused_fd(fd); 1679 out_fput: 1680 fput(filp); 1681 return ret; 1682 } 1683 1684 static int 1685 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device, 1686 u32 flags, void __user *arg, 1687 size_t argsz) 1688 { 1689 size_t minsz = 1690 offsetofend(struct vfio_device_feature_mig_state, data_fd); 1691 struct vfio_device_feature_mig_state mig; 1692 struct file *filp = NULL; 1693 int ret; 1694 1695 if (!device->mig_ops) 1696 return -ENOTTY; 1697 1698 ret = vfio_check_feature(flags, argsz, 1699 VFIO_DEVICE_FEATURE_SET | 1700 VFIO_DEVICE_FEATURE_GET, 1701 sizeof(mig)); 1702 if (ret != 1) 1703 return ret; 1704 1705 if (copy_from_user(&mig, arg, minsz)) 1706 return -EFAULT; 1707 1708 if (flags & VFIO_DEVICE_FEATURE_GET) { 1709 enum vfio_device_mig_state curr_state; 1710 1711 ret = device->mig_ops->migration_get_state(device, 1712 &curr_state); 1713 if (ret) 1714 return ret; 1715 mig.device_state = curr_state; 1716 goto out_copy; 1717 } 1718 1719 /* Handle the VFIO_DEVICE_FEATURE_SET */ 1720 filp = device->mig_ops->migration_set_state(device, mig.device_state); 1721 if (IS_ERR(filp) || !filp) 1722 goto out_copy; 1723 1724 return vfio_ioct_mig_return_fd(filp, arg, &mig); 1725 out_copy: 1726 mig.data_fd = -1; 1727 if (copy_to_user(arg, &mig, sizeof(mig))) 1728 return -EFAULT; 1729 if (IS_ERR(filp)) 1730 return PTR_ERR(filp); 1731 return 0; 1732 } 1733 1734 static int vfio_ioctl_device_feature_migration(struct vfio_device *device, 1735 u32 flags, void __user *arg, 1736 size_t argsz) 1737 { 1738 struct vfio_device_feature_migration mig = { 1739 .flags = device->migration_flags, 1740 }; 1741 int ret; 1742 1743 if (!device->mig_ops) 1744 return -ENOTTY; 1745 1746 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, 1747 sizeof(mig)); 1748 if (ret != 1) 1749 return ret; 1750 if (copy_to_user(arg, &mig, sizeof(mig))) 1751 return -EFAULT; 1752 return 0; 1753 } 1754 1755 /* Ranges should fit into a single kernel page */ 1756 #define LOG_MAX_RANGES \ 1757 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range)) 1758 1759 static int 1760 vfio_ioctl_device_feature_logging_start(struct vfio_device *device, 1761 u32 flags, void __user *arg, 1762 size_t argsz) 1763 { 1764 size_t minsz = 1765 offsetofend(struct vfio_device_feature_dma_logging_control, 1766 ranges); 1767 struct vfio_device_feature_dma_logging_range __user *ranges; 1768 struct vfio_device_feature_dma_logging_control control; 1769 struct vfio_device_feature_dma_logging_range range; 1770 struct rb_root_cached root = RB_ROOT_CACHED; 1771 struct interval_tree_node *nodes; 1772 u64 iova_end; 1773 u32 nnodes; 1774 int i, ret; 1775 1776 if (!device->log_ops) 1777 return -ENOTTY; 1778 1779 ret = vfio_check_feature(flags, argsz, 1780 VFIO_DEVICE_FEATURE_SET, 1781 sizeof(control)); 1782 if (ret != 1) 1783 return ret; 1784 1785 if (copy_from_user(&control, arg, minsz)) 1786 return -EFAULT; 1787 1788 nnodes = control.num_ranges; 1789 if (!nnodes) 1790 return -EINVAL; 1791 1792 if (nnodes > LOG_MAX_RANGES) 1793 return -E2BIG; 1794 1795 ranges = u64_to_user_ptr(control.ranges); 1796 nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node), 1797 GFP_KERNEL); 1798 if (!nodes) 1799 return -ENOMEM; 1800 1801 for (i = 0; i < nnodes; i++) { 1802 if (copy_from_user(&range, &ranges[i], sizeof(range))) { 1803 ret = -EFAULT; 1804 goto end; 1805 } 1806 if (!IS_ALIGNED(range.iova, control.page_size) || 1807 !IS_ALIGNED(range.length, control.page_size)) { 1808 ret = -EINVAL; 1809 goto end; 1810 } 1811 1812 if (check_add_overflow(range.iova, range.length, &iova_end) || 1813 iova_end > ULONG_MAX) { 1814 ret = -EOVERFLOW; 1815 goto end; 1816 } 1817 1818 nodes[i].start = range.iova; 1819 nodes[i].last = range.iova + range.length - 1; 1820 if (interval_tree_iter_first(&root, nodes[i].start, 1821 nodes[i].last)) { 1822 /* Range overlapping */ 1823 ret = -EINVAL; 1824 goto end; 1825 } 1826 interval_tree_insert(nodes + i, &root); 1827 } 1828 1829 ret = device->log_ops->log_start(device, &root, nnodes, 1830 &control.page_size); 1831 if (ret) 1832 goto end; 1833 1834 if (copy_to_user(arg, &control, sizeof(control))) { 1835 ret = -EFAULT; 1836 device->log_ops->log_stop(device); 1837 } 1838 1839 end: 1840 kfree(nodes); 1841 return ret; 1842 } 1843 1844 static int 1845 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device, 1846 u32 flags, void __user *arg, 1847 size_t argsz) 1848 { 1849 int ret; 1850 1851 if (!device->log_ops) 1852 return -ENOTTY; 1853 1854 ret = vfio_check_feature(flags, argsz, 1855 VFIO_DEVICE_FEATURE_SET, 0); 1856 if (ret != 1) 1857 return ret; 1858 1859 return device->log_ops->log_stop(device); 1860 } 1861 1862 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter, 1863 unsigned long iova, size_t length, 1864 void *opaque) 1865 { 1866 struct vfio_device *device = opaque; 1867 1868 return device->log_ops->log_read_and_clear(device, iova, length, iter); 1869 } 1870 1871 static int 1872 vfio_ioctl_device_feature_logging_report(struct vfio_device *device, 1873 u32 flags, void __user *arg, 1874 size_t argsz) 1875 { 1876 size_t minsz = 1877 offsetofend(struct vfio_device_feature_dma_logging_report, 1878 bitmap); 1879 struct vfio_device_feature_dma_logging_report report; 1880 struct iova_bitmap *iter; 1881 u64 iova_end; 1882 int ret; 1883 1884 if (!device->log_ops) 1885 return -ENOTTY; 1886 1887 ret = vfio_check_feature(flags, argsz, 1888 VFIO_DEVICE_FEATURE_GET, 1889 sizeof(report)); 1890 if (ret != 1) 1891 return ret; 1892 1893 if (copy_from_user(&report, arg, minsz)) 1894 return -EFAULT; 1895 1896 if (report.page_size < SZ_4K || !is_power_of_2(report.page_size)) 1897 return -EINVAL; 1898 1899 if (check_add_overflow(report.iova, report.length, &iova_end) || 1900 iova_end > ULONG_MAX) 1901 return -EOVERFLOW; 1902 1903 iter = iova_bitmap_alloc(report.iova, report.length, 1904 report.page_size, 1905 u64_to_user_ptr(report.bitmap)); 1906 if (IS_ERR(iter)) 1907 return PTR_ERR(iter); 1908 1909 ret = iova_bitmap_for_each(iter, device, 1910 vfio_device_log_read_and_clear); 1911 1912 iova_bitmap_free(iter); 1913 return ret; 1914 } 1915 1916 static int vfio_ioctl_device_feature(struct vfio_device *device, 1917 struct vfio_device_feature __user *arg) 1918 { 1919 size_t minsz = offsetofend(struct vfio_device_feature, flags); 1920 struct vfio_device_feature feature; 1921 1922 if (copy_from_user(&feature, arg, minsz)) 1923 return -EFAULT; 1924 1925 if (feature.argsz < minsz) 1926 return -EINVAL; 1927 1928 /* Check unknown flags */ 1929 if (feature.flags & 1930 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET | 1931 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE)) 1932 return -EINVAL; 1933 1934 /* GET & SET are mutually exclusive except with PROBE */ 1935 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1936 (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1937 (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1938 return -EINVAL; 1939 1940 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1941 case VFIO_DEVICE_FEATURE_MIGRATION: 1942 return vfio_ioctl_device_feature_migration( 1943 device, feature.flags, arg->data, 1944 feature.argsz - minsz); 1945 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: 1946 return vfio_ioctl_device_feature_mig_device_state( 1947 device, feature.flags, arg->data, 1948 feature.argsz - minsz); 1949 case VFIO_DEVICE_FEATURE_DMA_LOGGING_START: 1950 return vfio_ioctl_device_feature_logging_start( 1951 device, feature.flags, arg->data, 1952 feature.argsz - minsz); 1953 case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: 1954 return vfio_ioctl_device_feature_logging_stop( 1955 device, feature.flags, arg->data, 1956 feature.argsz - minsz); 1957 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: 1958 return vfio_ioctl_device_feature_logging_report( 1959 device, feature.flags, arg->data, 1960 feature.argsz - minsz); 1961 default: 1962 if (unlikely(!device->ops->device_feature)) 1963 return -EINVAL; 1964 return device->ops->device_feature(device, feature.flags, 1965 arg->data, 1966 feature.argsz - minsz); 1967 } 1968 } 1969 1970 static long vfio_device_fops_unl_ioctl(struct file *filep, 1971 unsigned int cmd, unsigned long arg) 1972 { 1973 struct vfio_device *device = filep->private_data; 1974 int ret; 1975 1976 ret = vfio_device_pm_runtime_get(device); 1977 if (ret) 1978 return ret; 1979 1980 switch (cmd) { 1981 case VFIO_DEVICE_FEATURE: 1982 ret = vfio_ioctl_device_feature(device, (void __user *)arg); 1983 break; 1984 1985 default: 1986 if (unlikely(!device->ops->ioctl)) 1987 ret = -EINVAL; 1988 else 1989 ret = device->ops->ioctl(device, cmd, arg); 1990 break; 1991 } 1992 1993 vfio_device_pm_runtime_put(device); 1994 return ret; 1995 } 1996 1997 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, 1998 size_t count, loff_t *ppos) 1999 { 2000 struct vfio_device *device = filep->private_data; 2001 2002 if (unlikely(!device->ops->read)) 2003 return -EINVAL; 2004 2005 return device->ops->read(device, buf, count, ppos); 2006 } 2007 2008 static ssize_t vfio_device_fops_write(struct file *filep, 2009 const char __user *buf, 2010 size_t count, loff_t *ppos) 2011 { 2012 struct vfio_device *device = filep->private_data; 2013 2014 if (unlikely(!device->ops->write)) 2015 return -EINVAL; 2016 2017 return device->ops->write(device, buf, count, ppos); 2018 } 2019 2020 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) 2021 { 2022 struct vfio_device *device = filep->private_data; 2023 2024 if (unlikely(!device->ops->mmap)) 2025 return -EINVAL; 2026 2027 return device->ops->mmap(device, vma); 2028 } 2029 2030 static const struct file_operations vfio_device_fops = { 2031 .owner = THIS_MODULE, 2032 .release = vfio_device_fops_release, 2033 .read = vfio_device_fops_read, 2034 .write = vfio_device_fops_write, 2035 .unlocked_ioctl = vfio_device_fops_unl_ioctl, 2036 .compat_ioctl = compat_ptr_ioctl, 2037 .mmap = vfio_device_fops_mmap, 2038 }; 2039 2040 /** 2041 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file 2042 * @file: VFIO group file 2043 * 2044 * The returned iommu_group is valid as long as a ref is held on the file. 2045 */ 2046 struct iommu_group *vfio_file_iommu_group(struct file *file) 2047 { 2048 struct vfio_group *group = file->private_data; 2049 2050 if (file->f_op != &vfio_group_fops) 2051 return NULL; 2052 return group->iommu_group; 2053 } 2054 EXPORT_SYMBOL_GPL(vfio_file_iommu_group); 2055 2056 /** 2057 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file 2058 * is always CPU cache coherent 2059 * @file: VFIO group file 2060 * 2061 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop 2062 * bit in DMA transactions. A return of false indicates that the user has 2063 * rights to access additional instructions such as wbinvd on x86. 2064 */ 2065 bool vfio_file_enforced_coherent(struct file *file) 2066 { 2067 struct vfio_group *group = file->private_data; 2068 bool ret; 2069 2070 if (file->f_op != &vfio_group_fops) 2071 return true; 2072 2073 down_read(&group->group_rwsem); 2074 if (group->container) { 2075 ret = vfio_ioctl_check_extension(group->container, 2076 VFIO_DMA_CC_IOMMU); 2077 } else { 2078 /* 2079 * Since the coherency state is determined only once a container 2080 * is attached the user must do so before they can prove they 2081 * have permission. 2082 */ 2083 ret = true; 2084 } 2085 up_read(&group->group_rwsem); 2086 return ret; 2087 } 2088 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); 2089 2090 /** 2091 * vfio_file_set_kvm - Link a kvm with VFIO drivers 2092 * @file: VFIO group file 2093 * @kvm: KVM to link 2094 * 2095 * When a VFIO device is first opened the KVM will be available in 2096 * device->kvm if one was associated with the group. 2097 */ 2098 void vfio_file_set_kvm(struct file *file, struct kvm *kvm) 2099 { 2100 struct vfio_group *group = file->private_data; 2101 2102 if (file->f_op != &vfio_group_fops) 2103 return; 2104 2105 down_write(&group->group_rwsem); 2106 group->kvm = kvm; 2107 up_write(&group->group_rwsem); 2108 } 2109 EXPORT_SYMBOL_GPL(vfio_file_set_kvm); 2110 2111 /** 2112 * vfio_file_has_dev - True if the VFIO file is a handle for device 2113 * @file: VFIO file to check 2114 * @device: Device that must be part of the file 2115 * 2116 * Returns true if given file has permission to manipulate the given device. 2117 */ 2118 bool vfio_file_has_dev(struct file *file, struct vfio_device *device) 2119 { 2120 struct vfio_group *group = file->private_data; 2121 2122 if (file->f_op != &vfio_group_fops) 2123 return false; 2124 2125 return group == device->group; 2126 } 2127 EXPORT_SYMBOL_GPL(vfio_file_has_dev); 2128 2129 /* 2130 * Sub-module support 2131 */ 2132 /* 2133 * Helper for managing a buffer of info chain capabilities, allocate or 2134 * reallocate a buffer with additional @size, filling in @id and @version 2135 * of the capability. A pointer to the new capability is returned. 2136 * 2137 * NB. The chain is based at the head of the buffer, so new entries are 2138 * added to the tail, vfio_info_cap_shift() should be called to fixup the 2139 * next offsets prior to copying to the user buffer. 2140 */ 2141 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, 2142 size_t size, u16 id, u16 version) 2143 { 2144 void *buf; 2145 struct vfio_info_cap_header *header, *tmp; 2146 2147 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); 2148 if (!buf) { 2149 kfree(caps->buf); 2150 caps->buf = NULL; 2151 caps->size = 0; 2152 return ERR_PTR(-ENOMEM); 2153 } 2154 2155 caps->buf = buf; 2156 header = buf + caps->size; 2157 2158 /* Eventually copied to user buffer, zero */ 2159 memset(header, 0, size); 2160 2161 header->id = id; 2162 header->version = version; 2163 2164 /* Add to the end of the capability chain */ 2165 for (tmp = buf; tmp->next; tmp = buf + tmp->next) 2166 ; /* nothing */ 2167 2168 tmp->next = caps->size; 2169 caps->size += size; 2170 2171 return header; 2172 } 2173 EXPORT_SYMBOL_GPL(vfio_info_cap_add); 2174 2175 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) 2176 { 2177 struct vfio_info_cap_header *tmp; 2178 void *buf = (void *)caps->buf; 2179 2180 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) 2181 tmp->next += offset; 2182 } 2183 EXPORT_SYMBOL(vfio_info_cap_shift); 2184 2185 int vfio_info_add_capability(struct vfio_info_cap *caps, 2186 struct vfio_info_cap_header *cap, size_t size) 2187 { 2188 struct vfio_info_cap_header *header; 2189 2190 header = vfio_info_cap_add(caps, size, cap->id, cap->version); 2191 if (IS_ERR(header)) 2192 return PTR_ERR(header); 2193 2194 memcpy(header + 1, cap + 1, size - sizeof(*header)); 2195 2196 return 0; 2197 } 2198 EXPORT_SYMBOL(vfio_info_add_capability); 2199 2200 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, 2201 int max_irq_type, size_t *data_size) 2202 { 2203 unsigned long minsz; 2204 size_t size; 2205 2206 minsz = offsetofend(struct vfio_irq_set, count); 2207 2208 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) || 2209 (hdr->count >= (U32_MAX - hdr->start)) || 2210 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | 2211 VFIO_IRQ_SET_ACTION_TYPE_MASK))) 2212 return -EINVAL; 2213 2214 if (data_size) 2215 *data_size = 0; 2216 2217 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs) 2218 return -EINVAL; 2219 2220 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { 2221 case VFIO_IRQ_SET_DATA_NONE: 2222 size = 0; 2223 break; 2224 case VFIO_IRQ_SET_DATA_BOOL: 2225 size = sizeof(uint8_t); 2226 break; 2227 case VFIO_IRQ_SET_DATA_EVENTFD: 2228 size = sizeof(int32_t); 2229 break; 2230 default: 2231 return -EINVAL; 2232 } 2233 2234 if (size) { 2235 if (hdr->argsz - minsz < hdr->count * size) 2236 return -EINVAL; 2237 2238 if (!data_size) 2239 return -EINVAL; 2240 2241 *data_size = hdr->count * size; 2242 } 2243 2244 return 0; 2245 } 2246 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); 2247 2248 /* 2249 * Pin contiguous user pages and return their associated host pages for local 2250 * domain only. 2251 * @device [in] : device 2252 * @iova [in] : starting IOVA of user pages to be pinned. 2253 * @npage [in] : count of pages to be pinned. This count should not 2254 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 2255 * @prot [in] : protection flags 2256 * @pages[out] : array of host pages 2257 * Return error or number of pages pinned. 2258 * 2259 * A driver may only call this function if the vfio_device was created 2260 * by vfio_register_emulated_iommu_dev(). 2261 */ 2262 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, 2263 int npage, int prot, struct page **pages) 2264 { 2265 struct vfio_container *container; 2266 struct vfio_group *group = device->group; 2267 struct vfio_iommu_driver *driver; 2268 int ret; 2269 2270 if (!pages || !npage || !vfio_assert_device_open(device)) 2271 return -EINVAL; 2272 2273 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) 2274 return -E2BIG; 2275 2276 /* group->container cannot change while a vfio device is open */ 2277 container = group->container; 2278 driver = container->iommu_driver; 2279 if (likely(driver && driver->ops->pin_pages)) 2280 ret = driver->ops->pin_pages(container->iommu_data, 2281 group->iommu_group, iova, 2282 npage, prot, pages); 2283 else 2284 ret = -ENOTTY; 2285 2286 return ret; 2287 } 2288 EXPORT_SYMBOL(vfio_pin_pages); 2289 2290 /* 2291 * Unpin contiguous host pages for local domain only. 2292 * @device [in] : device 2293 * @iova [in] : starting address of user pages to be unpinned. 2294 * @npage [in] : count of pages to be unpinned. This count should not 2295 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 2296 */ 2297 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) 2298 { 2299 struct vfio_container *container; 2300 struct vfio_iommu_driver *driver; 2301 2302 if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) 2303 return; 2304 2305 if (WARN_ON(!vfio_assert_device_open(device))) 2306 return; 2307 2308 /* group->container cannot change while a vfio device is open */ 2309 container = device->group->container; 2310 driver = container->iommu_driver; 2311 2312 driver->ops->unpin_pages(container->iommu_data, iova, npage); 2313 } 2314 EXPORT_SYMBOL(vfio_unpin_pages); 2315 2316 /* 2317 * This interface allows the CPUs to perform some sort of virtual DMA on 2318 * behalf of the device. 2319 * 2320 * CPUs read/write from/into a range of IOVAs pointing to user space memory 2321 * into/from a kernel buffer. 2322 * 2323 * As the read/write of user space memory is conducted via the CPUs and is 2324 * not a real device DMA, it is not necessary to pin the user space memory. 2325 * 2326 * @device [in] : VFIO device 2327 * @iova [in] : base IOVA of a user space buffer 2328 * @data [in] : pointer to kernel buffer 2329 * @len [in] : kernel buffer length 2330 * @write : indicate read or write 2331 * Return error code on failure or 0 on success. 2332 */ 2333 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, 2334 size_t len, bool write) 2335 { 2336 struct vfio_container *container; 2337 struct vfio_iommu_driver *driver; 2338 int ret = 0; 2339 2340 if (!data || len <= 0 || !vfio_assert_device_open(device)) 2341 return -EINVAL; 2342 2343 /* group->container cannot change while a vfio device is open */ 2344 container = device->group->container; 2345 driver = container->iommu_driver; 2346 2347 if (likely(driver && driver->ops->dma_rw)) 2348 ret = driver->ops->dma_rw(container->iommu_data, 2349 iova, data, len, write); 2350 else 2351 ret = -ENOTTY; 2352 return ret; 2353 } 2354 EXPORT_SYMBOL(vfio_dma_rw); 2355 2356 /* 2357 * Module/class support 2358 */ 2359 static char *vfio_devnode(struct device *dev, umode_t *mode) 2360 { 2361 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); 2362 } 2363 2364 static struct miscdevice vfio_dev = { 2365 .minor = VFIO_MINOR, 2366 .name = "vfio", 2367 .fops = &vfio_fops, 2368 .nodename = "vfio/vfio", 2369 .mode = S_IRUGO | S_IWUGO, 2370 }; 2371 2372 static int __init vfio_init(void) 2373 { 2374 int ret; 2375 2376 ida_init(&vfio.group_ida); 2377 mutex_init(&vfio.group_lock); 2378 mutex_init(&vfio.iommu_drivers_lock); 2379 INIT_LIST_HEAD(&vfio.group_list); 2380 INIT_LIST_HEAD(&vfio.iommu_drivers_list); 2381 2382 ret = misc_register(&vfio_dev); 2383 if (ret) { 2384 pr_err("vfio: misc device register failed\n"); 2385 return ret; 2386 } 2387 2388 /* /dev/vfio/$GROUP */ 2389 vfio.class = class_create(THIS_MODULE, "vfio"); 2390 if (IS_ERR(vfio.class)) { 2391 ret = PTR_ERR(vfio.class); 2392 goto err_class; 2393 } 2394 2395 vfio.class->devnode = vfio_devnode; 2396 2397 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); 2398 if (ret) 2399 goto err_alloc_chrdev; 2400 2401 #ifdef CONFIG_VFIO_NOIOMMU 2402 ret = vfio_register_iommu_driver(&vfio_noiommu_ops); 2403 #endif 2404 if (ret) 2405 goto err_driver_register; 2406 2407 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); 2408 return 0; 2409 2410 err_driver_register: 2411 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); 2412 err_alloc_chrdev: 2413 class_destroy(vfio.class); 2414 vfio.class = NULL; 2415 err_class: 2416 misc_deregister(&vfio_dev); 2417 return ret; 2418 } 2419 2420 static void __exit vfio_cleanup(void) 2421 { 2422 WARN_ON(!list_empty(&vfio.group_list)); 2423 2424 #ifdef CONFIG_VFIO_NOIOMMU 2425 vfio_unregister_iommu_driver(&vfio_noiommu_ops); 2426 #endif 2427 ida_destroy(&vfio.group_ida); 2428 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); 2429 class_destroy(vfio.class); 2430 vfio.class = NULL; 2431 misc_deregister(&vfio_dev); 2432 xa_destroy(&vfio_device_set_xa); 2433 } 2434 2435 module_init(vfio_init); 2436 module_exit(vfio_cleanup); 2437 2438 MODULE_VERSION(DRIVER_VERSION); 2439 MODULE_LICENSE("GPL v2"); 2440 MODULE_AUTHOR(DRIVER_AUTHOR); 2441 MODULE_DESCRIPTION(DRIVER_DESC); 2442 MODULE_ALIAS_MISCDEV(VFIO_MINOR); 2443 MODULE_ALIAS("devname:vfio/vfio"); 2444 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce"); 2445