// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
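
/*
 * Illustrative sketch, loosely modeled on how vfio-pci keys its device sets
 * by reset scope (pdev/vdev are placeholder names from such a driver, not
 * part of this file).  Devices that pass the same set_id pointer land in
 * the same vfio_device_set and therefore share dev_set->lock:
 *
 *	if (pci_is_root_bus(pdev->bus))
 *		ret = vfio_assign_device_set(&vdev->vdev, vdev);
 *	else if (!pci_probe_reset_slot(pdev->slot))
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *	else
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 */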

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after success return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
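
/*
 * Illustrative sketch of the expected allocation pattern; my_vfio_device,
 * my_init and my_ops are placeholder names for a driver's own definitions:
 *
 *	struct my_vfio_device {
 *		struct vfio_device vdev;	(must be the first member)
 *		void __iomem *bar;		(driver private data)
 *	};
 *
 *	static int my_init(struct vfio_device *vdev)
 *	{
 *		struct my_vfio_device *my =
 *			container_of(vdev, struct my_vfio_device, vdev);
 *
 *		(initialize driver private state here)
 *		return 0;
 *	}
 *
 *	static const struct vfio_device_ops my_ops = {
 *		.init = my_init,
 *		...
 *	};
 *
 *	my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 */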

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(device->ops->bind_iommufd &&
		    (!device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
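
/*
 * Illustrative probe()/remove() pairing, following the pattern described in
 * Documentation/driver-api/vfio.rst (my/my_device/vdev are placeholders):
 *
 *	probe:
 *		my = vfio_alloc_device(my_device, vdev, dev, &my_ops);
 *		if (IS_ERR(my))
 *			return PTR_ERR(my);
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret) {
 *			vfio_put_device(&my->vdev);
 *			return ret;
 *		}
 *
 *	remove:
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_put_device(&my->vdev);
 */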

#ifdef CONFIG_HAVE_KVM
void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_first_open(struct vfio_device *device,
				  struct iommufd_ctx *iommufd)
{
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_iommufd_bind(device, iommufd);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_device_last_close(struct vfio_device *device,
				   struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
{
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_device_first_open(device, iommufd);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_device_close(struct vfio_device *device,
		       struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_device_last_close(device, iommufd);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	vfio_device_group_close(device);

	vfio_device_put_registration(device);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
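
/*
 * Illustrative sketch of the stepping loop a driver's migration_set_state()
 * op typically runs around this helper; my_step_device_state() is a
 * placeholder for the driver's single-arc handler:
 *
 *	while (cur_state != new_state) {
 *		enum vfio_device_mig_state next_state;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur_state, new_state,
 *					      &next_state);
 *		if (ret)
 *			break;
 *		ret = my_step_device_state(vdev, next_state);
 *		if (ret)
 *			break;
 *		cur_state = next_state;
 *	}
 */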

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
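
/*
 * Illustrative userspace sketch for driving this feature (assumes the uAPI
 * definitions from <linux/vfio.h>; device_fd and error handling are
 * placeholders):
 *
 *	size_t argsz = sizeof(struct vfio_device_feature) +
 *		       sizeof(struct vfio_device_feature_mig_state);
 *	struct vfio_device_feature *feature = calloc(1, argsz);
 *	struct vfio_device_feature_mig_state *mig = (void *)feature->data;
 *
 *	feature->argsz = argsz;
 *	feature->flags = VFIO_DEVICE_FEATURE_SET |
 *			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
 *	mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
 *		(handle errno)
 *	(mig->data_fd now holds the migration data FD, or -1 if none)
 */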

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}
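
/*
 * Illustrative userspace sketch for starting dirty tracking over a single
 * IOVA range (assumes the uAPI structs from <linux/vfio.h>; the sizes and
 * IOVA values are placeholders):
 *
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = 0,
 *		.length = 1UL << 30,	(must be page_size aligned)
 *	};
 *	size_t argsz = sizeof(struct vfio_device_feature) +
 *		       sizeof(struct vfio_device_feature_dma_logging_control);
 *	struct vfio_device_feature *feature = calloc(1, argsz);
 *	struct vfio_device_feature_dma_logging_control *control =
 *		(void *)feature->data;
 *
 *	feature->argsz = argsz;
 *	feature->flags = VFIO_DEVICE_FEATURE_SET |
 *			 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
 *	control->page_size = 4096;
 *	control->num_ranges = 1;
 *	control->ranges = (uintptr_t)&range;
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
 */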

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;
	int ret;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
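
/*
 * Illustrative sketch of how a GET_*_INFO style ioctl handler consumes the
 * caps helpers, loosely following vfio-pci; info, arg and my_cap are
 * placeholders for the handler's own reply struct, user pointer and
 * capability:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user(arg + sizeof(info), caps.buf,
 *					 caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */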

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
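
/*
 * Illustrative sketch for an emulated-IOMMU (mdev-style) driver accessing
 * one page of guest memory by IOVA; my and iova are placeholders:
 *
 *	struct page *page;
 *	void *kva;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&my->vdev, iova, 1, IOMMU_READ | IOMMU_WRITE,
 *			     &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	kva = kmap_local_page(page);
 *	(access kva + (iova % PAGE_SIZE) here)
 *	kunmap_local(kva);
 *
 *	vfio_unpin_pages(&my->vdev, iova, 1);
 */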

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]  : VFIO device
 * @iova [in]    : base IOVA of a user space buffer
 * @data [in]    : pointer to kernel buffer
 * @len [in]     : kernel buffer length
 * @write        : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
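
/*
 * Illustrative sketch of CPU-mediated access to guest memory: copying a
 * guest-resident descriptor into a kernel buffer without pinning, then
 * writing a status word back (descriptor layout and IOVAs are placeholders):
 *
 *	struct my_descriptor desc;
 *	u32 status;
 *	int ret;
 *
 *	ret = vfio_dma_rw(&my->vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *
 *	(validate and use desc, fill in status)
 *
 *	ret = vfio_dma_rw(&my->vdev, status_iova, &status, sizeof(status),
 *			  true);
 */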

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");