// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
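
/*
 * Usage sketch (editor's illustration, not part of the upstream file): a
 * hypothetical driver "foo" whose devices can only be reset together can
 * place them in one vfio_device_set by passing the same set_id for each,
 * loosely modelled on how PCI drivers group by slot or bus.  When no set
 * is assigned, __vfio_register_dev() below falls back to a singleton set.
 *
 *	static int foo_vfio_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		// Devices sharing a reset domain must share a set_id so
 *		// their resets are serialized by dev_set->lock: use the
 *		// slot when a slot reset works, otherwise the whole bus.
 *		if (pdev->slot && !pci_probe_reset_slot(pdev->slot))
 *			return vfio_assign_device_set(vdev, pdev->slot);
 *		return vfio_assign_device_set(vdev, pdev->bus);
 *	}
 */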

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
			   struct device *dev)
{
	struct vfio_device *cur;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == dev)
			return cur;
	return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after success return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error to alloc index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
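
/*
 * Usage sketch (editor's illustration): the allocate/register/put flow a
 * driver is expected to follow.  The "foo" structure, ops and probe
 * function are hypothetical; the vfio_alloc_device() wrapper requires the
 * embedded vfio_device to be the first member of the driver structure.
 *
 *	struct foo_vfio_device {
 *		struct vfio_device vdev;	// must be first for vfio_alloc_device()
 *		void __iomem *bar;		// driver private data
 *	};
 *
 *	static int foo_probe(struct platform_device *pdev)
 *	{
 *		struct foo_vfio_device *foo;
 *		int ret;
 *
 *		foo = vfio_alloc_device(foo_vfio_device, vdev, &pdev->dev,
 *					&foo_vfio_ops);
 *		if (IS_ERR(foo))
 *			return PTR_ERR(foo);
 *
 *		ret = vfio_register_group_dev(&foo->vdev);
 *		if (ret)
 *			// drops the reference taken at allocation time
 *			vfio_put_device(&foo->vdev);
 *		return ret;
 *	}
 */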

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#ifdef CONFIG_HAVE_KVM
void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times. The device cdev path doesn't have a secure way for it.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	vfio_df_group_close(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
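
/*
 * Usage sketch (editor's illustration): a migration driver's
 * migration_set_state() op typically walks the FSM one hop at a time with
 * vfio_mig_get_next_state() until the requested state is reached, exactly
 * because the function may need to be called multiple times.  The "foo"
 * names below are hypothetical; the loop shape mirrors in-tree migration
 * drivers.
 *
 *	static struct file *
 *	foo_migration_set_state(struct vfio_device *vdev,
 *				enum vfio_device_mig_state new_state)
 *	{
 *		struct foo_device *foo =
 *			container_of(vdev, struct foo_device, vdev);
 *		enum vfio_device_mig_state next_state;
 *		struct file *res = NULL;
 *		int ret;
 *
 *		while (foo->mig_state != new_state) {
 *			ret = vfio_mig_get_next_state(vdev, foo->mig_state,
 *						      new_state, &next_state);
 *			if (ret)
 *				return ERR_PTR(ret);
 *			// foo_step_state() performs exactly one of the arcs
 *			// listed above and may return a data-transfer file.
 *			res = foo_step_state(foo, next_state);
 *			if (IS_ERR(res))
 *				return res;
 *			foo->mig_state = next_state;
 *		}
 *		return res;
 *	}
 */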

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
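
/*
 * Usage sketch (editor's illustration): what typically lands in the
 * default: branch above, a driver-private feature handler built on
 * vfio_check_feature().  FOO_DEVICE_FEATURE_INFO and the payload struct
 * are hypothetical.
 *
 *	static int foo_device_feature(struct vfio_device *vdev, u32 flags,
 *				      void __user *arg, size_t argsz)
 *	{
 *		struct foo_feature_info info = { .version = 1 };
 *		int ret;
 *
 *		switch (flags & VFIO_DEVICE_FEATURE_MASK) {
 *		case FOO_DEVICE_FEATURE_INFO:
 *			// Returns 1 to proceed, 0 for a PROBE-only call,
 *			// or -errno if the request is malformed.
 *			ret = vfio_check_feature(flags, argsz,
 *						 VFIO_DEVICE_FEATURE_GET,
 *						 sizeof(info));
 *			if (ret != 1)
 *				return ret;
 *			if (copy_to_user(arg, &info, sizeof(info)))
 *				return -EFAULT;
 *			return 0;
 *		default:
 *			return -ENOTTY;
 *		}
 *	}
 */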

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	int ret;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
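
/*
 * Usage sketch (editor's illustration): a typical *_GET_INFO style ioctl
 * handler builds the chain with vfio_info_cap_add() (or the
 * vfio_info_add_capability() wrapper below) and then shifts the offsets
 * past its fixed-size info struct before copying out.  "foo_cap" and the
 * surrounding info struct are hypothetical.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &foo_cap.header, sizeof(foo_cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_DEVICE_FLAGS_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			// Buffer too small: report the required size and
 *			// let userspace retry with a larger argsz.
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user(arg + sizeof(info), caps.buf,
 *					 caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */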

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);
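
/*
 * Usage sketch (editor's illustration): an emulated-IOMMU (mdev-style)
 * driver pinning one page and recovering the sub-page offset as described
 * in the comment above.  "buf" and "len" are hypothetical and must not
 * cross the page boundary in this simple form.  For transient accesses
 * that do not need a pin, vfio_dma_rw() below is the simpler alternative.
 *
 *	struct page *pg;
 *	void *base;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &pg);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	base = kmap_local_page(pg);
 *	memcpy(buf, base + offset_in_page(iova), len);
 *	kunmap_local(base);
 *
 *	vfio_unpin_pages(vdev, iova, 1);
 */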

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]	: VFIO device
 * @iova [in]	: base IOVA of a user space buffer
 * @data [in]	: pointer to kernel buffer
 * @len [in]	: kernel buffer length
 * @write	: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");