1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 4 * Author: Alex Williamson <alex.williamson@redhat.com> 5 * 6 * Derived from original vfio: 7 * Copyright 2010 Cisco Systems, Inc. All rights reserved. 8 * Author: Tom Lyon, pugs@cisco.com 9 */ 10 11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 13 #include <linux/device.h> 14 #include <linux/eventfd.h> 15 #include <linux/file.h> 16 #include <linux/interrupt.h> 17 #include <linux/iommu.h> 18 #include <linux/module.h> 19 #include <linux/mutex.h> 20 #include <linux/notifier.h> 21 #include <linux/pci.h> 22 #include <linux/pm_runtime.h> 23 #include <linux/slab.h> 24 #include <linux/types.h> 25 #include <linux/uaccess.h> 26 #include <linux/vfio.h> 27 #include <linux/vgaarb.h> 28 #include <linux/nospec.h> 29 #include <linux/sched/mm.h> 30 31 #include "vfio_pci_private.h" 32 33 #define DRIVER_VERSION "0.2" 34 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 35 #define DRIVER_DESC "VFIO PCI - User Level meta-driver" 36 37 static char ids[1024] __initdata; 38 module_param_string(ids, ids, sizeof(ids), 0); 39 MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified"); 40 41 static bool nointxmask; 42 module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); 43 MODULE_PARM_DESC(nointxmask, 44 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); 45 46 #ifdef CONFIG_VFIO_PCI_VGA 47 static bool disable_vga; 48 module_param(disable_vga, bool, S_IRUGO); 49 MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci"); 50 #endif 51 52 static bool disable_idle_d3; 53 module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); 54 MODULE_PARM_DESC(disable_idle_d3, 55 "Disable using the PCI D3 low power state for idle, unused devices"); 56 57 static bool enable_sriov; 58 #ifdef CONFIG_PCI_IOV 59 module_param(enable_sriov, bool, 0644); 60 MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); 61 #endif 62 63 static bool disable_denylist; 64 module_param(disable_denylist, bool, 0444); 65 MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. 
Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); 66 67 static inline bool vfio_vga_disabled(void) 68 { 69 #ifdef CONFIG_VFIO_PCI_VGA 70 return disable_vga; 71 #else 72 return true; 73 #endif 74 } 75 76 static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) 77 { 78 switch (pdev->vendor) { 79 case PCI_VENDOR_ID_INTEL: 80 switch (pdev->device) { 81 case PCI_DEVICE_ID_INTEL_QAT_C3XXX: 82 case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF: 83 case PCI_DEVICE_ID_INTEL_QAT_C62X: 84 case PCI_DEVICE_ID_INTEL_QAT_C62X_VF: 85 case PCI_DEVICE_ID_INTEL_QAT_DH895XCC: 86 case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF: 87 return true; 88 default: 89 return false; 90 } 91 } 92 93 return false; 94 } 95 96 static bool vfio_pci_is_denylisted(struct pci_dev *pdev) 97 { 98 if (!vfio_pci_dev_in_denylist(pdev)) 99 return false; 100 101 if (disable_denylist) { 102 pci_warn(pdev, 103 "device denylist disabled - allowing device %04x:%04x.\n", 104 pdev->vendor, pdev->device); 105 return false; 106 } 107 108 pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n", 109 pdev->vendor, pdev->device); 110 111 return true; 112 } 113 114 /* 115 * Our VGA arbiter participation is limited since we don't know anything 116 * about the device itself. However, if the device is the only VGA device 117 * downstream of a bridge and VFIO VGA support is disabled, then we can 118 * safely return legacy VGA IO and memory as not decoded since the user 119 * has no way to get to it and routing can be disabled externally at the 120 * bridge. 121 */ 122 static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga) 123 { 124 struct pci_dev *tmp = NULL; 125 unsigned char max_busnr; 126 unsigned int decodes; 127 128 if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) 129 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 130 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 131 132 max_busnr = pci_bus_max_busnr(pdev->bus); 133 decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 134 135 while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { 136 if (tmp == pdev || 137 pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || 138 pci_is_root_bus(tmp->bus)) 139 continue; 140 141 if (tmp->bus->number >= pdev->bus->number && 142 tmp->bus->number <= max_busnr) { 143 pci_dev_put(tmp); 144 decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 145 break; 146 } 147 } 148 149 return decodes; 150 } 151 152 static inline bool vfio_pci_is_vga(struct pci_dev *pdev) 153 { 154 return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; 155 } 156 157 static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) 158 { 159 struct resource *res; 160 int i; 161 struct vfio_pci_dummy_resource *dummy_res; 162 163 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 164 int bar = i + PCI_STD_RESOURCES; 165 166 res = &vdev->pdev->resource[bar]; 167 168 if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) 169 goto no_mmap; 170 171 if (!(res->flags & IORESOURCE_MEM)) 172 goto no_mmap; 173 174 /* 175 * The PCI core shouldn't set up a resource with a 176 * type but zero size. But there may be bugs that 177 * cause us to do that. 
178 */ 179 if (!resource_size(res)) 180 goto no_mmap; 181 182 if (resource_size(res) >= PAGE_SIZE) { 183 vdev->bar_mmap_supported[bar] = true; 184 continue; 185 } 186 187 if (!(res->start & ~PAGE_MASK)) { 188 /* 189 * Add a dummy resource to reserve the remainder 190 * of the exclusive page in case that hot-add 191 * device's bar is assigned into it. 192 */ 193 dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); 194 if (dummy_res == NULL) 195 goto no_mmap; 196 197 dummy_res->resource.name = "vfio sub-page reserved"; 198 dummy_res->resource.start = res->end + 1; 199 dummy_res->resource.end = res->start + PAGE_SIZE - 1; 200 dummy_res->resource.flags = res->flags; 201 if (request_resource(res->parent, 202 &dummy_res->resource)) { 203 kfree(dummy_res); 204 goto no_mmap; 205 } 206 dummy_res->index = bar; 207 list_add(&dummy_res->res_next, 208 &vdev->dummy_resources_list); 209 vdev->bar_mmap_supported[bar] = true; 210 continue; 211 } 212 /* 213 * Here we don't handle the case when the BAR is not page 214 * aligned because we can't expect the BAR will be 215 * assigned into the same location in a page in guest 216 * when we passthrough the BAR. And it's hard to access 217 * this BAR in userspace because we have no way to get 218 * the BAR's location in a page. 219 */ 220 no_mmap: 221 vdev->bar_mmap_supported[bar] = false; 222 } 223 } 224 225 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); 226 static void vfio_pci_disable(struct vfio_pci_device *vdev); 227 static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data); 228 229 /* 230 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND 231 * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. 232 * If a device implements the former but not the latter we would typically 233 * expect broken_intx_masking be set and require an exclusive interrupt. 234 * However since we do have control of the device's ability to assert INTx, 235 * we can instead pretend that the device does not implement INTx, virtualizing 236 * the pin register to report zero and maintaining DisINTx set on the host. 237 */ 238 static bool vfio_pci_nointx(struct pci_dev *pdev) 239 { 240 switch (pdev->vendor) { 241 case PCI_VENDOR_ID_INTEL: 242 switch (pdev->device) { 243 /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ 244 case 0x1572: 245 case 0x1574: 246 case 0x1580 ... 0x1581: 247 case 0x1583 ... 0x158b: 248 case 0x37d0 ... 0x37d2: 249 /* X550 */ 250 case 0x1563: 251 return true; 252 default: 253 return false; 254 } 255 } 256 257 return false; 258 } 259 260 static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev) 261 { 262 struct pci_dev *pdev = vdev->pdev; 263 u16 pmcsr; 264 265 if (!pdev->pm_cap) 266 return; 267 268 pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); 269 270 vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); 271 } 272 273 /* 274 * pci_set_power_state() wrapper handling devices which perform a soft reset on 275 * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, 276 * restore when returned to D0. Saved separately from pci_saved_state for use 277 * by PM capability emulation and separately from pci_dev internal saved state 278 * to avoid it being overwritten and consumed around other resets. 
279 */ 280 int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) 281 { 282 struct pci_dev *pdev = vdev->pdev; 283 bool needs_restore = false, needs_save = false; 284 int ret; 285 286 if (vdev->needs_pm_restore) { 287 if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { 288 pci_save_state(pdev); 289 needs_save = true; 290 } 291 292 if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) 293 needs_restore = true; 294 } 295 296 ret = pci_set_power_state(pdev, state); 297 298 if (!ret) { 299 /* D3 might be unsupported via quirk, skip unless in D3 */ 300 if (needs_save && pdev->current_state >= PCI_D3hot) { 301 vdev->pm_save = pci_store_saved_state(pdev); 302 } else if (needs_restore) { 303 pci_load_and_free_saved_state(pdev, &vdev->pm_save); 304 pci_restore_state(pdev); 305 } 306 } 307 308 return ret; 309 } 310 311 static int vfio_pci_enable(struct vfio_pci_device *vdev) 312 { 313 struct pci_dev *pdev = vdev->pdev; 314 int ret; 315 u16 cmd; 316 u8 msix_pos; 317 318 vfio_pci_set_power_state(vdev, PCI_D0); 319 320 /* Don't allow our initial saved state to include busmaster */ 321 pci_clear_master(pdev); 322 323 ret = pci_enable_device(pdev); 324 if (ret) 325 return ret; 326 327 /* If reset fails because of the device lock, fail this path entirely */ 328 ret = pci_try_reset_function(pdev); 329 if (ret == -EAGAIN) { 330 pci_disable_device(pdev); 331 return ret; 332 } 333 334 vdev->reset_works = !ret; 335 pci_save_state(pdev); 336 vdev->pci_saved_state = pci_store_saved_state(pdev); 337 if (!vdev->pci_saved_state) 338 pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); 339 340 if (likely(!nointxmask)) { 341 if (vfio_pci_nointx(pdev)) { 342 pci_info(pdev, "Masking broken INTx support\n"); 343 vdev->nointx = true; 344 pci_intx(pdev, 0); 345 } else 346 vdev->pci_2_3 = pci_intx_mask_supported(pdev); 347 } 348 349 pci_read_config_word(pdev, PCI_COMMAND, &cmd); 350 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 351 cmd &= ~PCI_COMMAND_INTX_DISABLE; 352 pci_write_config_word(pdev, PCI_COMMAND, cmd); 353 } 354 355 ret = vfio_config_init(vdev); 356 if (ret) { 357 kfree(vdev->pci_saved_state); 358 vdev->pci_saved_state = NULL; 359 pci_disable_device(pdev); 360 return ret; 361 } 362 363 msix_pos = pdev->msix_cap; 364 if (msix_pos) { 365 u16 flags; 366 u32 table; 367 368 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 369 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 370 371 vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 372 vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 373 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 374 } else 375 vdev->msix_bar = 0xFF; 376 377 if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 378 vdev->has_vga = true; 379 380 if (vfio_pci_is_vga(pdev) && 381 pdev->vendor == PCI_VENDOR_ID_INTEL && 382 IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { 383 ret = vfio_pci_igd_init(vdev); 384 if (ret && ret != -ENODEV) { 385 pci_warn(pdev, "Failed to setup Intel IGD regions\n"); 386 goto disable_exit; 387 } 388 } 389 390 vfio_pci_probe_mmaps(vdev); 391 392 return 0; 393 394 disable_exit: 395 vfio_pci_disable(vdev); 396 return ret; 397 } 398 399 static void vfio_pci_disable(struct vfio_pci_device *vdev) 400 { 401 struct pci_dev *pdev = vdev->pdev; 402 struct vfio_pci_dummy_resource *dummy_res, *tmp; 403 struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; 404 int i, bar; 405 406 /* Stop the device from further DMA */ 407 pci_clear_master(pdev); 408 409 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 
410 VFIO_IRQ_SET_ACTION_TRIGGER, 411 vdev->irq_type, 0, 0, NULL); 412 413 /* Device closed, don't need mutex here */ 414 list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, 415 &vdev->ioeventfds_list, next) { 416 vfio_virqfd_disable(&ioeventfd->virqfd); 417 list_del(&ioeventfd->next); 418 kfree(ioeventfd); 419 } 420 vdev->ioeventfds_nr = 0; 421 422 vdev->virq_disabled = false; 423 424 for (i = 0; i < vdev->num_regions; i++) 425 vdev->region[i].ops->release(vdev, &vdev->region[i]); 426 427 vdev->num_regions = 0; 428 kfree(vdev->region); 429 vdev->region = NULL; /* don't krealloc a freed pointer */ 430 431 vfio_config_free(vdev); 432 433 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 434 bar = i + PCI_STD_RESOURCES; 435 if (!vdev->barmap[bar]) 436 continue; 437 pci_iounmap(pdev, vdev->barmap[bar]); 438 pci_release_selected_regions(pdev, 1 << bar); 439 vdev->barmap[bar] = NULL; 440 } 441 442 list_for_each_entry_safe(dummy_res, tmp, 443 &vdev->dummy_resources_list, res_next) { 444 list_del(&dummy_res->res_next); 445 release_resource(&dummy_res->resource); 446 kfree(dummy_res); 447 } 448 449 vdev->needs_reset = true; 450 451 /* 452 * If we have saved state, restore it. If we can reset the device, 453 * even better. Resetting with current state seems better than 454 * nothing, but saving and restoring current state without reset 455 * is just busy work. 456 */ 457 if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 458 pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); 459 460 if (!vdev->reset_works) 461 goto out; 462 463 pci_save_state(pdev); 464 } 465 466 /* 467 * Disable INTx and MSI, presumably to avoid spurious interrupts 468 * during reset. Stolen from pci_reset_function() 469 */ 470 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 471 472 /* 473 * Try to get the locks ourselves to prevent a deadlock. The 474 * success of this is dependent on being able to lock the device, 475 * which is not always possible. 476 * We can not use the "try" reset interface here, which will 477 * overwrite the previously restored configuration information. 
478 */ 479 if (vdev->reset_works && pci_dev_trylock(pdev)) { 480 if (!__pci_reset_function_locked(pdev)) 481 vdev->needs_reset = false; 482 pci_dev_unlock(pdev); 483 } 484 485 pci_restore_state(pdev); 486 out: 487 pci_disable_device(pdev); 488 489 vfio_pci_try_bus_reset(vdev); 490 491 if (!disable_idle_d3) 492 vfio_pci_set_power_state(vdev, PCI_D3hot); 493 } 494 495 static struct pci_driver vfio_pci_driver; 496 497 static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) 498 { 499 struct pci_dev *physfn = pci_physfn(vdev->pdev); 500 struct vfio_device *pf_dev; 501 502 if (!vdev->pdev->is_virtfn) 503 return NULL; 504 505 pf_dev = vfio_device_get_from_dev(&physfn->dev); 506 if (!pf_dev) 507 return NULL; 508 509 if (pci_dev_driver(physfn) != &vfio_pci_driver) { 510 vfio_device_put(pf_dev); 511 return NULL; 512 } 513 514 return container_of(pf_dev, struct vfio_pci_device, vdev); 515 } 516 517 static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) 518 { 519 struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); 520 521 if (!pf_vdev) 522 return; 523 524 mutex_lock(&pf_vdev->vf_token->lock); 525 pf_vdev->vf_token->users += val; 526 WARN_ON(pf_vdev->vf_token->users < 0); 527 mutex_unlock(&pf_vdev->vf_token->lock); 528 529 vfio_device_put(&pf_vdev->vdev); 530 } 531 532 static void vfio_pci_release(struct vfio_device *core_vdev) 533 { 534 struct vfio_pci_device *vdev = 535 container_of(core_vdev, struct vfio_pci_device, vdev); 536 537 mutex_lock(&vdev->reflck->lock); 538 539 if (!(--vdev->refcnt)) { 540 vfio_pci_vf_token_user_add(vdev, -1); 541 vfio_spapr_pci_eeh_release(vdev->pdev); 542 vfio_pci_disable(vdev); 543 544 mutex_lock(&vdev->igate); 545 if (vdev->err_trigger) { 546 eventfd_ctx_put(vdev->err_trigger); 547 vdev->err_trigger = NULL; 548 } 549 if (vdev->req_trigger) { 550 eventfd_ctx_put(vdev->req_trigger); 551 vdev->req_trigger = NULL; 552 } 553 mutex_unlock(&vdev->igate); 554 } 555 556 mutex_unlock(&vdev->reflck->lock); 557 } 558 559 static int vfio_pci_open(struct vfio_device *core_vdev) 560 { 561 struct vfio_pci_device *vdev = 562 container_of(core_vdev, struct vfio_pci_device, vdev); 563 int ret = 0; 564 565 mutex_lock(&vdev->reflck->lock); 566 567 if (!vdev->refcnt) { 568 ret = vfio_pci_enable(vdev); 569 if (ret) 570 goto error; 571 572 vfio_spapr_pci_eeh_open(vdev->pdev); 573 vfio_pci_vf_token_user_add(vdev, 1); 574 } 575 vdev->refcnt++; 576 error: 577 mutex_unlock(&vdev->reflck->lock); 578 return ret; 579 } 580 581 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) 582 { 583 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 584 u8 pin; 585 586 if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 587 vdev->nointx || vdev->pdev->is_virtfn) 588 return 0; 589 590 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 591 592 return pin ? 
1 : 0; 593 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 594 u8 pos; 595 u16 flags; 596 597 pos = vdev->pdev->msi_cap; 598 if (pos) { 599 pci_read_config_word(vdev->pdev, 600 pos + PCI_MSI_FLAGS, &flags); 601 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 602 } 603 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 604 u8 pos; 605 u16 flags; 606 607 pos = vdev->pdev->msix_cap; 608 if (pos) { 609 pci_read_config_word(vdev->pdev, 610 pos + PCI_MSIX_FLAGS, &flags); 611 612 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 613 } 614 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 615 if (pci_is_pcie(vdev->pdev)) 616 return 1; 617 } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 618 return 1; 619 } 620 621 return 0; 622 } 623 624 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 625 { 626 (*(int *)data)++; 627 return 0; 628 } 629 630 struct vfio_pci_fill_info { 631 int max; 632 int cur; 633 struct vfio_pci_dependent_device *devices; 634 }; 635 636 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 637 { 638 struct vfio_pci_fill_info *fill = data; 639 struct iommu_group *iommu_group; 640 641 if (fill->cur == fill->max) 642 return -EAGAIN; /* Something changed, try again */ 643 644 iommu_group = iommu_group_get(&pdev->dev); 645 if (!iommu_group) 646 return -EPERM; /* Cannot reset non-isolated devices */ 647 648 fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 649 fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 650 fill->devices[fill->cur].bus = pdev->bus->number; 651 fill->devices[fill->cur].devfn = pdev->devfn; 652 fill->cur++; 653 iommu_group_put(iommu_group); 654 return 0; 655 } 656 657 struct vfio_pci_group_entry { 658 struct vfio_group *group; 659 int id; 660 }; 661 662 struct vfio_pci_group_info { 663 int count; 664 struct vfio_pci_group_entry *groups; 665 }; 666 667 static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) 668 { 669 struct vfio_pci_group_info *info = data; 670 struct iommu_group *group; 671 int id, i; 672 673 group = iommu_group_get(&pdev->dev); 674 if (!group) 675 return -EPERM; 676 677 id = iommu_group_id(group); 678 679 for (i = 0; i < info->count; i++) 680 if (info->groups[i].id == id) 681 break; 682 683 iommu_group_put(group); 684 685 return (i == info->count) ? 
-EINVAL : 0; 686 } 687 688 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 689 { 690 for (; pdev; pdev = pdev->bus->self) 691 if (pdev->bus == slot->bus) 692 return (pdev->slot == slot); 693 return false; 694 } 695 696 struct vfio_pci_walk_info { 697 int (*fn)(struct pci_dev *, void *data); 698 void *data; 699 struct pci_dev *pdev; 700 bool slot; 701 int ret; 702 }; 703 704 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 705 { 706 struct vfio_pci_walk_info *walk = data; 707 708 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 709 walk->ret = walk->fn(pdev, walk->data); 710 711 return walk->ret; 712 } 713 714 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 715 int (*fn)(struct pci_dev *, 716 void *data), void *data, 717 bool slot) 718 { 719 struct vfio_pci_walk_info walk = { 720 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 721 }; 722 723 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 724 725 return walk.ret; 726 } 727 728 static int msix_mmappable_cap(struct vfio_pci_device *vdev, 729 struct vfio_info_cap *caps) 730 { 731 struct vfio_info_cap_header header = { 732 .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, 733 .version = 1 734 }; 735 736 return vfio_info_add_capability(caps, &header, sizeof(header)); 737 } 738 739 int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 740 unsigned int type, unsigned int subtype, 741 const struct vfio_pci_regops *ops, 742 size_t size, u32 flags, void *data) 743 { 744 struct vfio_pci_region *region; 745 746 region = krealloc(vdev->region, 747 (vdev->num_regions + 1) * sizeof(*region), 748 GFP_KERNEL); 749 if (!region) 750 return -ENOMEM; 751 752 vdev->region = region; 753 vdev->region[vdev->num_regions].type = type; 754 vdev->region[vdev->num_regions].subtype = subtype; 755 vdev->region[vdev->num_regions].ops = ops; 756 vdev->region[vdev->num_regions].size = size; 757 vdev->region[vdev->num_regions].flags = flags; 758 vdev->region[vdev->num_regions].data = data; 759 760 vdev->num_regions++; 761 762 return 0; 763 } 764 765 struct vfio_devices { 766 struct vfio_pci_device **devices; 767 int cur_index; 768 int max_index; 769 }; 770 771 static long vfio_pci_ioctl(struct vfio_device *core_vdev, 772 unsigned int cmd, unsigned long arg) 773 { 774 struct vfio_pci_device *vdev = 775 container_of(core_vdev, struct vfio_pci_device, vdev); 776 unsigned long minsz; 777 778 if (cmd == VFIO_DEVICE_GET_INFO) { 779 struct vfio_device_info info; 780 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 781 unsigned long capsz; 782 int ret; 783 784 minsz = offsetofend(struct vfio_device_info, num_irqs); 785 786 /* For backward compatibility, cannot require this */ 787 capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); 788 789 if (copy_from_user(&info, (void __user *)arg, minsz)) 790 return -EFAULT; 791 792 if (info.argsz < minsz) 793 return -EINVAL; 794 795 if (info.argsz >= capsz) { 796 minsz = capsz; 797 info.cap_offset = 0; 798 } 799 800 info.flags = VFIO_DEVICE_FLAGS_PCI; 801 802 if (vdev->reset_works) 803 info.flags |= VFIO_DEVICE_FLAGS_RESET; 804 805 info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 806 info.num_irqs = VFIO_PCI_NUM_IRQS; 807 808 ret = vfio_pci_info_zdev_add_caps(vdev, &caps); 809 if (ret && ret != -ENODEV) { 810 pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); 811 return ret; 812 } 813 814 if (caps.size) { 815 info.flags |= VFIO_DEVICE_FLAGS_CAPS; 816 if (info.argsz < sizeof(info) + caps.size) { 
817 info.argsz = sizeof(info) + caps.size; 818 } else { 819 vfio_info_cap_shift(&caps, sizeof(info)); 820 if (copy_to_user((void __user *)arg + 821 sizeof(info), caps.buf, 822 caps.size)) { 823 kfree(caps.buf); 824 return -EFAULT; 825 } 826 info.cap_offset = sizeof(info); 827 } 828 829 kfree(caps.buf); 830 } 831 832 return copy_to_user((void __user *)arg, &info, minsz) ? 833 -EFAULT : 0; 834 835 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 836 struct pci_dev *pdev = vdev->pdev; 837 struct vfio_region_info info; 838 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 839 int i, ret; 840 841 minsz = offsetofend(struct vfio_region_info, offset); 842 843 if (copy_from_user(&info, (void __user *)arg, minsz)) 844 return -EFAULT; 845 846 if (info.argsz < minsz) 847 return -EINVAL; 848 849 switch (info.index) { 850 case VFIO_PCI_CONFIG_REGION_INDEX: 851 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 852 info.size = pdev->cfg_size; 853 info.flags = VFIO_REGION_INFO_FLAG_READ | 854 VFIO_REGION_INFO_FLAG_WRITE; 855 break; 856 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 857 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 858 info.size = pci_resource_len(pdev, info.index); 859 if (!info.size) { 860 info.flags = 0; 861 break; 862 } 863 864 info.flags = VFIO_REGION_INFO_FLAG_READ | 865 VFIO_REGION_INFO_FLAG_WRITE; 866 if (vdev->bar_mmap_supported[info.index]) { 867 info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 868 if (info.index == vdev->msix_bar) { 869 ret = msix_mmappable_cap(vdev, &caps); 870 if (ret) 871 return ret; 872 } 873 } 874 875 break; 876 case VFIO_PCI_ROM_REGION_INDEX: 877 { 878 void __iomem *io; 879 size_t size; 880 u16 cmd; 881 882 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 883 info.flags = 0; 884 885 /* Report the BAR size, not the ROM size */ 886 info.size = pci_resource_len(pdev, info.index); 887 if (!info.size) { 888 /* Shadow ROMs appear as PCI option ROMs */ 889 if (pdev->resource[PCI_ROM_RESOURCE].flags & 890 IORESOURCE_ROM_SHADOW) 891 info.size = 0x20000; 892 else 893 break; 894 } 895 896 /* 897 * Is it really there? Enable memory decode for 898 * implicit access in pci_map_rom(). 
899 */ 900 cmd = vfio_pci_memory_lock_and_enable(vdev); 901 io = pci_map_rom(pdev, &size); 902 if (io) { 903 info.flags = VFIO_REGION_INFO_FLAG_READ; 904 pci_unmap_rom(pdev, io); 905 } else { 906 info.size = 0; 907 } 908 vfio_pci_memory_unlock_and_restore(vdev, cmd); 909 910 break; 911 } 912 case VFIO_PCI_VGA_REGION_INDEX: 913 if (!vdev->has_vga) 914 return -EINVAL; 915 916 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 917 info.size = 0xc0000; 918 info.flags = VFIO_REGION_INFO_FLAG_READ | 919 VFIO_REGION_INFO_FLAG_WRITE; 920 921 break; 922 default: 923 { 924 struct vfio_region_info_cap_type cap_type = { 925 .header.id = VFIO_REGION_INFO_CAP_TYPE, 926 .header.version = 1 }; 927 928 if (info.index >= 929 VFIO_PCI_NUM_REGIONS + vdev->num_regions) 930 return -EINVAL; 931 info.index = array_index_nospec(info.index, 932 VFIO_PCI_NUM_REGIONS + 933 vdev->num_regions); 934 935 i = info.index - VFIO_PCI_NUM_REGIONS; 936 937 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 938 info.size = vdev->region[i].size; 939 info.flags = vdev->region[i].flags; 940 941 cap_type.type = vdev->region[i].type; 942 cap_type.subtype = vdev->region[i].subtype; 943 944 ret = vfio_info_add_capability(&caps, &cap_type.header, 945 sizeof(cap_type)); 946 if (ret) 947 return ret; 948 949 if (vdev->region[i].ops->add_capability) { 950 ret = vdev->region[i].ops->add_capability(vdev, 951 &vdev->region[i], &caps); 952 if (ret) 953 return ret; 954 } 955 } 956 } 957 958 if (caps.size) { 959 info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 960 if (info.argsz < sizeof(info) + caps.size) { 961 info.argsz = sizeof(info) + caps.size; 962 info.cap_offset = 0; 963 } else { 964 vfio_info_cap_shift(&caps, sizeof(info)); 965 if (copy_to_user((void __user *)arg + 966 sizeof(info), caps.buf, 967 caps.size)) { 968 kfree(caps.buf); 969 return -EFAULT; 970 } 971 info.cap_offset = sizeof(info); 972 } 973 974 kfree(caps.buf); 975 } 976 977 return copy_to_user((void __user *)arg, &info, minsz) ? 978 -EFAULT : 0; 979 980 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 981 struct vfio_irq_info info; 982 983 minsz = offsetofend(struct vfio_irq_info, count); 984 985 if (copy_from_user(&info, (void __user *)arg, minsz)) 986 return -EFAULT; 987 988 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 989 return -EINVAL; 990 991 switch (info.index) { 992 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 993 case VFIO_PCI_REQ_IRQ_INDEX: 994 break; 995 case VFIO_PCI_ERR_IRQ_INDEX: 996 if (pci_is_pcie(vdev->pdev)) 997 break; 998 fallthrough; 999 default: 1000 return -EINVAL; 1001 } 1002 1003 info.flags = VFIO_IRQ_INFO_EVENTFD; 1004 1005 info.count = vfio_pci_get_irq_count(vdev, info.index); 1006 1007 if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 1008 info.flags |= (VFIO_IRQ_INFO_MASKABLE | 1009 VFIO_IRQ_INFO_AUTOMASKED); 1010 else 1011 info.flags |= VFIO_IRQ_INFO_NORESIZE; 1012 1013 return copy_to_user((void __user *)arg, &info, minsz) ? 
1014 -EFAULT : 0; 1015 1016 } else if (cmd == VFIO_DEVICE_SET_IRQS) { 1017 struct vfio_irq_set hdr; 1018 u8 *data = NULL; 1019 int max, ret = 0; 1020 size_t data_size = 0; 1021 1022 minsz = offsetofend(struct vfio_irq_set, count); 1023 1024 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 1025 return -EFAULT; 1026 1027 max = vfio_pci_get_irq_count(vdev, hdr.index); 1028 1029 ret = vfio_set_irqs_validate_and_prepare(&hdr, max, 1030 VFIO_PCI_NUM_IRQS, &data_size); 1031 if (ret) 1032 return ret; 1033 1034 if (data_size) { 1035 data = memdup_user((void __user *)(arg + minsz), 1036 data_size); 1037 if (IS_ERR(data)) 1038 return PTR_ERR(data); 1039 } 1040 1041 mutex_lock(&vdev->igate); 1042 1043 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 1044 hdr.start, hdr.count, data); 1045 1046 mutex_unlock(&vdev->igate); 1047 kfree(data); 1048 1049 return ret; 1050 1051 } else if (cmd == VFIO_DEVICE_RESET) { 1052 int ret; 1053 1054 if (!vdev->reset_works) 1055 return -EINVAL; 1056 1057 vfio_pci_zap_and_down_write_memory_lock(vdev); 1058 ret = pci_try_reset_function(vdev->pdev); 1059 up_write(&vdev->memory_lock); 1060 1061 return ret; 1062 1063 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 1064 struct vfio_pci_hot_reset_info hdr; 1065 struct vfio_pci_fill_info fill = { 0 }; 1066 struct vfio_pci_dependent_device *devices = NULL; 1067 bool slot = false; 1068 int ret = 0; 1069 1070 minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 1071 1072 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 1073 return -EFAULT; 1074 1075 if (hdr.argsz < minsz) 1076 return -EINVAL; 1077 1078 hdr.flags = 0; 1079 1080 /* Can we do a slot or bus reset or neither? */ 1081 if (!pci_probe_reset_slot(vdev->pdev->slot)) 1082 slot = true; 1083 else if (pci_probe_reset_bus(vdev->pdev->bus)) 1084 return -ENODEV; 1085 1086 /* How many devices are affected? */ 1087 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1088 vfio_pci_count_devs, 1089 &fill.max, slot); 1090 if (ret) 1091 return ret; 1092 1093 WARN_ON(!fill.max); /* Should always be at least one */ 1094 1095 /* 1096 * If there's enough space, fill it now, otherwise return 1097 * -ENOSPC and the number of devices affected. 1098 */ 1099 if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 1100 ret = -ENOSPC; 1101 hdr.count = fill.max; 1102 goto reset_info_exit; 1103 } 1104 1105 devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 1106 if (!devices) 1107 return -ENOMEM; 1108 1109 fill.devices = devices; 1110 1111 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1112 vfio_pci_fill_devs, 1113 &fill, slot); 1114 1115 /* 1116 * If a device was removed between counting and filling, 1117 * we may come up short of fill.max. If a device was 1118 * added, we'll have a return of -EAGAIN above. 
1119 */ 1120 if (!ret) 1121 hdr.count = fill.cur; 1122 1123 reset_info_exit: 1124 if (copy_to_user((void __user *)arg, &hdr, minsz)) 1125 ret = -EFAULT; 1126 1127 if (!ret) { 1128 if (copy_to_user((void __user *)(arg + minsz), devices, 1129 hdr.count * sizeof(*devices))) 1130 ret = -EFAULT; 1131 } 1132 1133 kfree(devices); 1134 return ret; 1135 1136 } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 1137 struct vfio_pci_hot_reset hdr; 1138 int32_t *group_fds; 1139 struct vfio_pci_group_entry *groups; 1140 struct vfio_pci_group_info info; 1141 struct vfio_devices devs = { .cur_index = 0 }; 1142 bool slot = false; 1143 int i, group_idx, mem_idx = 0, count = 0, ret = 0; 1144 1145 minsz = offsetofend(struct vfio_pci_hot_reset, count); 1146 1147 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 1148 return -EFAULT; 1149 1150 if (hdr.argsz < minsz || hdr.flags) 1151 return -EINVAL; 1152 1153 /* Can we do a slot or bus reset or neither? */ 1154 if (!pci_probe_reset_slot(vdev->pdev->slot)) 1155 slot = true; 1156 else if (pci_probe_reset_bus(vdev->pdev->bus)) 1157 return -ENODEV; 1158 1159 /* 1160 * We can't let userspace give us an arbitrarily large 1161 * buffer to copy, so verify how many we think there 1162 * could be. Note groups can have multiple devices so 1163 * one group per device is the max. 1164 */ 1165 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1166 vfio_pci_count_devs, 1167 &count, slot); 1168 if (ret) 1169 return ret; 1170 1171 /* Somewhere between 1 and count is OK */ 1172 if (!hdr.count || hdr.count > count) 1173 return -EINVAL; 1174 1175 group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 1176 groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 1177 if (!group_fds || !groups) { 1178 kfree(group_fds); 1179 kfree(groups); 1180 return -ENOMEM; 1181 } 1182 1183 if (copy_from_user(group_fds, (void __user *)(arg + minsz), 1184 hdr.count * sizeof(*group_fds))) { 1185 kfree(group_fds); 1186 kfree(groups); 1187 return -EFAULT; 1188 } 1189 1190 /* 1191 * For each group_fd, get the group through the vfio external 1192 * user interface and store the group and iommu ID. This 1193 * ensures the group is held across the reset. 1194 */ 1195 for (group_idx = 0; group_idx < hdr.count; group_idx++) { 1196 struct vfio_group *group; 1197 struct fd f = fdget(group_fds[group_idx]); 1198 if (!f.file) { 1199 ret = -EBADF; 1200 break; 1201 } 1202 1203 group = vfio_group_get_external_user(f.file); 1204 fdput(f); 1205 if (IS_ERR(group)) { 1206 ret = PTR_ERR(group); 1207 break; 1208 } 1209 1210 groups[group_idx].group = group; 1211 groups[group_idx].id = 1212 vfio_external_user_iommu_id(group); 1213 } 1214 1215 kfree(group_fds); 1216 1217 /* release reference to groups on error */ 1218 if (ret) 1219 goto hot_reset_release; 1220 1221 info.count = hdr.count; 1222 info.groups = groups; 1223 1224 /* 1225 * Test whether all the affected devices are contained 1226 * by the set of groups provided by the user. 1227 */ 1228 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1229 vfio_pci_validate_devs, 1230 &info, slot); 1231 if (ret) 1232 goto hot_reset_release; 1233 1234 devs.max_index = count; 1235 devs.devices = kcalloc(count, sizeof(struct vfio_device *), 1236 GFP_KERNEL); 1237 if (!devs.devices) { 1238 ret = -ENOMEM; 1239 goto hot_reset_release; 1240 } 1241 1242 /* 1243 * We need to get memory_lock for each device, but devices 1244 * can share mmap_lock, therefore we need to zap and hold 1245 * the vma_lock for each device, and only then get each 1246 * memory_lock. 
1247 */ 1248 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1249 vfio_pci_try_zap_and_vma_lock_cb, 1250 &devs, slot); 1251 if (ret) 1252 goto hot_reset_release; 1253 1254 for (; mem_idx < devs.cur_index; mem_idx++) { 1255 struct vfio_pci_device *tmp = devs.devices[mem_idx]; 1256 1257 ret = down_write_trylock(&tmp->memory_lock); 1258 if (!ret) { 1259 ret = -EBUSY; 1260 goto hot_reset_release; 1261 } 1262 mutex_unlock(&tmp->vma_lock); 1263 } 1264 1265 /* User has access, do the reset */ 1266 ret = pci_reset_bus(vdev->pdev); 1267 1268 hot_reset_release: 1269 for (i = 0; i < devs.cur_index; i++) { 1270 struct vfio_pci_device *tmp = devs.devices[i]; 1271 1272 if (i < mem_idx) 1273 up_write(&tmp->memory_lock); 1274 else 1275 mutex_unlock(&tmp->vma_lock); 1276 vfio_device_put(&tmp->vdev); 1277 } 1278 kfree(devs.devices); 1279 1280 for (group_idx--; group_idx >= 0; group_idx--) 1281 vfio_group_put_external_user(groups[group_idx].group); 1282 1283 kfree(groups); 1284 return ret; 1285 } else if (cmd == VFIO_DEVICE_IOEVENTFD) { 1286 struct vfio_device_ioeventfd ioeventfd; 1287 int count; 1288 1289 minsz = offsetofend(struct vfio_device_ioeventfd, fd); 1290 1291 if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) 1292 return -EFAULT; 1293 1294 if (ioeventfd.argsz < minsz) 1295 return -EINVAL; 1296 1297 if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) 1298 return -EINVAL; 1299 1300 count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; 1301 1302 if (hweight8(count) != 1 || ioeventfd.fd < -1) 1303 return -EINVAL; 1304 1305 return vfio_pci_ioeventfd(vdev, ioeventfd.offset, 1306 ioeventfd.data, count, ioeventfd.fd); 1307 } else if (cmd == VFIO_DEVICE_FEATURE) { 1308 struct vfio_device_feature feature; 1309 uuid_t uuid; 1310 1311 minsz = offsetofend(struct vfio_device_feature, flags); 1312 1313 if (copy_from_user(&feature, (void __user *)arg, minsz)) 1314 return -EFAULT; 1315 1316 if (feature.argsz < minsz) 1317 return -EINVAL; 1318 1319 /* Check unknown flags */ 1320 if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | 1321 VFIO_DEVICE_FEATURE_SET | 1322 VFIO_DEVICE_FEATURE_GET | 1323 VFIO_DEVICE_FEATURE_PROBE)) 1324 return -EINVAL; 1325 1326 /* GET & SET are mutually exclusive except with PROBE */ 1327 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1328 (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1329 (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1330 return -EINVAL; 1331 1332 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1333 case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: 1334 if (!vdev->vf_token) 1335 return -ENOTTY; 1336 1337 /* 1338 * We do not support GET of the VF Token UUID as this 1339 * could expose the token of the previous device user. 
1340 */ 1341 if (feature.flags & VFIO_DEVICE_FEATURE_GET) 1342 return -EINVAL; 1343 1344 if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) 1345 return 0; 1346 1347 /* Don't SET unless told to do so */ 1348 if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) 1349 return -EINVAL; 1350 1351 if (feature.argsz < minsz + sizeof(uuid)) 1352 return -EINVAL; 1353 1354 if (copy_from_user(&uuid, (void __user *)(arg + minsz), 1355 sizeof(uuid))) 1356 return -EFAULT; 1357 1358 mutex_lock(&vdev->vf_token->lock); 1359 uuid_copy(&vdev->vf_token->uuid, &uuid); 1360 mutex_unlock(&vdev->vf_token->lock); 1361 1362 return 0; 1363 default: 1364 return -ENOTTY; 1365 } 1366 } 1367 1368 return -ENOTTY; 1369 } 1370 1371 static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, 1372 size_t count, loff_t *ppos, bool iswrite) 1373 { 1374 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1375 1376 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1377 return -EINVAL; 1378 1379 switch (index) { 1380 case VFIO_PCI_CONFIG_REGION_INDEX: 1381 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 1382 1383 case VFIO_PCI_ROM_REGION_INDEX: 1384 if (iswrite) 1385 return -EINVAL; 1386 return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 1387 1388 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 1389 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 1390 1391 case VFIO_PCI_VGA_REGION_INDEX: 1392 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 1393 default: 1394 index -= VFIO_PCI_NUM_REGIONS; 1395 return vdev->region[index].ops->rw(vdev, buf, 1396 count, ppos, iswrite); 1397 } 1398 1399 return -EINVAL; 1400 } 1401 1402 static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, 1403 size_t count, loff_t *ppos) 1404 { 1405 struct vfio_pci_device *vdev = 1406 container_of(core_vdev, struct vfio_pci_device, vdev); 1407 1408 if (!count) 1409 return 0; 1410 1411 return vfio_pci_rw(vdev, buf, count, ppos, false); 1412 } 1413 1414 static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, 1415 size_t count, loff_t *ppos) 1416 { 1417 struct vfio_pci_device *vdev = 1418 container_of(core_vdev, struct vfio_pci_device, vdev); 1419 1420 if (!count) 1421 return 0; 1422 1423 return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); 1424 } 1425 1426 /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ 1427 static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) 1428 { 1429 struct vfio_pci_mmap_vma *mmap_vma, *tmp; 1430 1431 /* 1432 * Lock ordering: 1433 * vma_lock is nested under mmap_lock for vm_ops callback paths. 1434 * The memory_lock semaphore is used by both code paths calling 1435 * into this function to zap vmas and the vm_ops.fault callback 1436 * to protect the memory enable state of the device. 1437 * 1438 * When zapping vmas we need to maintain the mmap_lock => vma_lock 1439 * ordering, which requires using vma_lock to walk vma_list to 1440 * acquire an mm, then dropping vma_lock to get the mmap_lock and 1441 * reacquiring vma_lock. This logic is derived from similar 1442 * requirements in uverbs_user_mmap_disassociate(). 1443 * 1444 * mmap_lock must always be the top-level lock when it is taken. 1445 * Therefore we can only hold the memory_lock write lock when 1446 * vma_list is empty, as we'd need to take mmap_lock to clear 1447 * entries. vma_list can only be guaranteed empty when holding 1448 * vma_lock, thus memory_lock is nested under vma_lock. 
1449 * 1450 * This enables the vm_ops.fault callback to acquire vma_lock, 1451 * followed by memory_lock read lock, while already holding 1452 * mmap_lock without risk of deadlock. 1453 */ 1454 while (1) { 1455 struct mm_struct *mm = NULL; 1456 1457 if (try) { 1458 if (!mutex_trylock(&vdev->vma_lock)) 1459 return 0; 1460 } else { 1461 mutex_lock(&vdev->vma_lock); 1462 } 1463 while (!list_empty(&vdev->vma_list)) { 1464 mmap_vma = list_first_entry(&vdev->vma_list, 1465 struct vfio_pci_mmap_vma, 1466 vma_next); 1467 mm = mmap_vma->vma->vm_mm; 1468 if (mmget_not_zero(mm)) 1469 break; 1470 1471 list_del(&mmap_vma->vma_next); 1472 kfree(mmap_vma); 1473 mm = NULL; 1474 } 1475 if (!mm) 1476 return 1; 1477 mutex_unlock(&vdev->vma_lock); 1478 1479 if (try) { 1480 if (!mmap_read_trylock(mm)) { 1481 mmput(mm); 1482 return 0; 1483 } 1484 } else { 1485 mmap_read_lock(mm); 1486 } 1487 if (try) { 1488 if (!mutex_trylock(&vdev->vma_lock)) { 1489 mmap_read_unlock(mm); 1490 mmput(mm); 1491 return 0; 1492 } 1493 } else { 1494 mutex_lock(&vdev->vma_lock); 1495 } 1496 list_for_each_entry_safe(mmap_vma, tmp, 1497 &vdev->vma_list, vma_next) { 1498 struct vm_area_struct *vma = mmap_vma->vma; 1499 1500 if (vma->vm_mm != mm) 1501 continue; 1502 1503 list_del(&mmap_vma->vma_next); 1504 kfree(mmap_vma); 1505 1506 zap_vma_ptes(vma, vma->vm_start, 1507 vma->vm_end - vma->vm_start); 1508 } 1509 mutex_unlock(&vdev->vma_lock); 1510 mmap_read_unlock(mm); 1511 mmput(mm); 1512 } 1513 } 1514 1515 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev) 1516 { 1517 vfio_pci_zap_and_vma_lock(vdev, false); 1518 down_write(&vdev->memory_lock); 1519 mutex_unlock(&vdev->vma_lock); 1520 } 1521 1522 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) 1523 { 1524 u16 cmd; 1525 1526 down_write(&vdev->memory_lock); 1527 pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); 1528 if (!(cmd & PCI_COMMAND_MEMORY)) 1529 pci_write_config_word(vdev->pdev, PCI_COMMAND, 1530 cmd | PCI_COMMAND_MEMORY); 1531 1532 return cmd; 1533 } 1534 1535 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd) 1536 { 1537 pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); 1538 up_write(&vdev->memory_lock); 1539 } 1540 1541 /* Caller holds vma_lock */ 1542 static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, 1543 struct vm_area_struct *vma) 1544 { 1545 struct vfio_pci_mmap_vma *mmap_vma; 1546 1547 mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); 1548 if (!mmap_vma) 1549 return -ENOMEM; 1550 1551 mmap_vma->vma = vma; 1552 list_add(&mmap_vma->vma_next, &vdev->vma_list); 1553 1554 return 0; 1555 } 1556 1557 /* 1558 * Zap mmaps on open so that we can fault them in on access and therefore 1559 * our vma_list only tracks mappings accessed since last zap. 
1560 */ 1561 static void vfio_pci_mmap_open(struct vm_area_struct *vma) 1562 { 1563 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1564 } 1565 1566 static void vfio_pci_mmap_close(struct vm_area_struct *vma) 1567 { 1568 struct vfio_pci_device *vdev = vma->vm_private_data; 1569 struct vfio_pci_mmap_vma *mmap_vma; 1570 1571 mutex_lock(&vdev->vma_lock); 1572 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1573 if (mmap_vma->vma == vma) { 1574 list_del(&mmap_vma->vma_next); 1575 kfree(mmap_vma); 1576 break; 1577 } 1578 } 1579 mutex_unlock(&vdev->vma_lock); 1580 } 1581 1582 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) 1583 { 1584 struct vm_area_struct *vma = vmf->vma; 1585 struct vfio_pci_device *vdev = vma->vm_private_data; 1586 struct vfio_pci_mmap_vma *mmap_vma; 1587 vm_fault_t ret = VM_FAULT_NOPAGE; 1588 1589 mutex_lock(&vdev->vma_lock); 1590 down_read(&vdev->memory_lock); 1591 1592 if (!__vfio_pci_memory_enabled(vdev)) { 1593 ret = VM_FAULT_SIGBUS; 1594 goto up_out; 1595 } 1596 1597 /* 1598 * We populate the whole vma on fault, so we need to test whether 1599 * the vma has already been mapped, such as for concurrent faults 1600 * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if 1601 * we ask it to fill the same range again. 1602 */ 1603 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1604 if (mmap_vma->vma == vma) 1605 goto up_out; 1606 } 1607 1608 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 1609 vma->vm_end - vma->vm_start, 1610 vma->vm_page_prot)) { 1611 ret = VM_FAULT_SIGBUS; 1612 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1613 goto up_out; 1614 } 1615 1616 if (__vfio_pci_add_vma(vdev, vma)) { 1617 ret = VM_FAULT_OOM; 1618 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1619 } 1620 1621 up_out: 1622 up_read(&vdev->memory_lock); 1623 mutex_unlock(&vdev->vma_lock); 1624 return ret; 1625 } 1626 1627 static const struct vm_operations_struct vfio_pci_mmap_ops = { 1628 .open = vfio_pci_mmap_open, 1629 .close = vfio_pci_mmap_close, 1630 .fault = vfio_pci_mmap_fault, 1631 }; 1632 1633 static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) 1634 { 1635 struct vfio_pci_device *vdev = 1636 container_of(core_vdev, struct vfio_pci_device, vdev); 1637 struct pci_dev *pdev = vdev->pdev; 1638 unsigned int index; 1639 u64 phys_len, req_len, pgoff, req_start; 1640 int ret; 1641 1642 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 1643 1644 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1645 return -EINVAL; 1646 if (vma->vm_end < vma->vm_start) 1647 return -EINVAL; 1648 if ((vma->vm_flags & VM_SHARED) == 0) 1649 return -EINVAL; 1650 if (index >= VFIO_PCI_NUM_REGIONS) { 1651 int regnum = index - VFIO_PCI_NUM_REGIONS; 1652 struct vfio_pci_region *region = vdev->region + regnum; 1653 1654 if (region->ops && region->ops->mmap && 1655 (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) 1656 return region->ops->mmap(vdev, region, vma); 1657 return -EINVAL; 1658 } 1659 if (index >= VFIO_PCI_ROM_REGION_INDEX) 1660 return -EINVAL; 1661 if (!vdev->bar_mmap_supported[index]) 1662 return -EINVAL; 1663 1664 phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); 1665 req_len = vma->vm_end - vma->vm_start; 1666 pgoff = vma->vm_pgoff & 1667 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 1668 req_start = pgoff << PAGE_SHIFT; 1669 1670 if (req_start + req_len > phys_len) 1671 return -EINVAL; 1672 1673 /* 1674 * Even though we don't make use of the barmap for 
the mmap, 1675 * we need to request the region and the barmap tracks that. 1676 */ 1677 if (!vdev->barmap[index]) { 1678 ret = pci_request_selected_regions(pdev, 1679 1 << index, "vfio-pci"); 1680 if (ret) 1681 return ret; 1682 1683 vdev->barmap[index] = pci_iomap(pdev, index, 0); 1684 if (!vdev->barmap[index]) { 1685 pci_release_selected_regions(pdev, 1 << index); 1686 return -ENOMEM; 1687 } 1688 } 1689 1690 vma->vm_private_data = vdev; 1691 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1692 vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 1693 1694 /* 1695 * See remap_pfn_range(), called from vfio_pci_fault() but we can't 1696 * change vm_flags within the fault handler. Set them now. 1697 */ 1698 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1699 vma->vm_ops = &vfio_pci_mmap_ops; 1700 1701 return 0; 1702 } 1703 1704 static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) 1705 { 1706 struct vfio_pci_device *vdev = 1707 container_of(core_vdev, struct vfio_pci_device, vdev); 1708 struct pci_dev *pdev = vdev->pdev; 1709 1710 mutex_lock(&vdev->igate); 1711 1712 if (vdev->req_trigger) { 1713 if (!(count % 10)) 1714 pci_notice_ratelimited(pdev, 1715 "Relaying device request to user (#%u)\n", 1716 count); 1717 eventfd_signal(vdev->req_trigger, 1); 1718 } else if (count == 0) { 1719 pci_warn(pdev, 1720 "No device request channel registered, blocked until released by user\n"); 1721 } 1722 1723 mutex_unlock(&vdev->igate); 1724 } 1725 1726 static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, 1727 bool vf_token, uuid_t *uuid) 1728 { 1729 /* 1730 * There's always some degree of trust or collaboration between SR-IOV 1731 * PF and VFs, even if just that the PF hosts the SR-IOV capability and 1732 * can disrupt VFs with a reset, but often the PF has more explicit 1733 * access to deny service to the VF or access data passed through the 1734 * VF. We therefore require an opt-in via a shared VF token (UUID) to 1735 * represent this trust. This both prevents that a VF driver might 1736 * assume the PF driver is a trusted, in-kernel driver, and also that 1737 * a PF driver might be replaced with a rogue driver, unknown to in-use 1738 * VF drivers. 1739 * 1740 * Therefore when presented with a VF, if the PF is a vfio device and 1741 * it is bound to the vfio-pci driver, the user needs to provide a VF 1742 * token to access the device, in the form of appending a vf_token to 1743 * the device name, for example: 1744 * 1745 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" 1746 * 1747 * When presented with a PF which has VFs in use, the user must also 1748 * provide the current VF token to prove collaboration with existing 1749 * VF users. If VFs are not in use, the VF token provided for the PF 1750 * device will act to set the VF token. 1751 * 1752 * If the VF token is provided but unused, an error is generated. 
1753 */ 1754 if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) 1755 return 0; /* No VF token provided or required */ 1756 1757 if (vdev->pdev->is_virtfn) { 1758 struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); 1759 bool match; 1760 1761 if (!pf_vdev) { 1762 if (!vf_token) 1763 return 0; /* PF is not vfio-pci, no VF token */ 1764 1765 pci_info_ratelimited(vdev->pdev, 1766 "VF token incorrectly provided, PF not bound to vfio-pci\n"); 1767 return -EINVAL; 1768 } 1769 1770 if (!vf_token) { 1771 vfio_device_put(&pf_vdev->vdev); 1772 pci_info_ratelimited(vdev->pdev, 1773 "VF token required to access device\n"); 1774 return -EACCES; 1775 } 1776 1777 mutex_lock(&pf_vdev->vf_token->lock); 1778 match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); 1779 mutex_unlock(&pf_vdev->vf_token->lock); 1780 1781 vfio_device_put(&pf_vdev->vdev); 1782 1783 if (!match) { 1784 pci_info_ratelimited(vdev->pdev, 1785 "Incorrect VF token provided for device\n"); 1786 return -EACCES; 1787 } 1788 } else if (vdev->vf_token) { 1789 mutex_lock(&vdev->vf_token->lock); 1790 if (vdev->vf_token->users) { 1791 if (!vf_token) { 1792 mutex_unlock(&vdev->vf_token->lock); 1793 pci_info_ratelimited(vdev->pdev, 1794 "VF token required to access device\n"); 1795 return -EACCES; 1796 } 1797 1798 if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { 1799 mutex_unlock(&vdev->vf_token->lock); 1800 pci_info_ratelimited(vdev->pdev, 1801 "Incorrect VF token provided for device\n"); 1802 return -EACCES; 1803 } 1804 } else if (vf_token) { 1805 uuid_copy(&vdev->vf_token->uuid, uuid); 1806 } 1807 1808 mutex_unlock(&vdev->vf_token->lock); 1809 } else if (vf_token) { 1810 pci_info_ratelimited(vdev->pdev, 1811 "VF token incorrectly provided, not a PF or VF\n"); 1812 return -EINVAL; 1813 } 1814 1815 return 0; 1816 } 1817 1818 #define VF_TOKEN_ARG "vf_token=" 1819 1820 static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) 1821 { 1822 struct vfio_pci_device *vdev = 1823 container_of(core_vdev, struct vfio_pci_device, vdev); 1824 bool vf_token = false; 1825 uuid_t uuid; 1826 int ret; 1827 1828 if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) 1829 return 0; /* No match */ 1830 1831 if (strlen(buf) > strlen(pci_name(vdev->pdev))) { 1832 buf += strlen(pci_name(vdev->pdev)); 1833 1834 if (*buf != ' ') 1835 return 0; /* No match: non-whitespace after name */ 1836 1837 while (*buf) { 1838 if (*buf == ' ') { 1839 buf++; 1840 continue; 1841 } 1842 1843 if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, 1844 strlen(VF_TOKEN_ARG))) { 1845 buf += strlen(VF_TOKEN_ARG); 1846 1847 if (strlen(buf) < UUID_STRING_LEN) 1848 return -EINVAL; 1849 1850 ret = uuid_parse(buf, &uuid); 1851 if (ret) 1852 return ret; 1853 1854 vf_token = true; 1855 buf += UUID_STRING_LEN; 1856 } else { 1857 /* Unknown/duplicate option */ 1858 return -EINVAL; 1859 } 1860 } 1861 } 1862 1863 ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); 1864 if (ret) 1865 return ret; 1866 1867 return 1; /* Match */ 1868 } 1869 1870 static const struct vfio_device_ops vfio_pci_ops = { 1871 .name = "vfio-pci", 1872 .open = vfio_pci_open, 1873 .release = vfio_pci_release, 1874 .ioctl = vfio_pci_ioctl, 1875 .read = vfio_pci_read, 1876 .write = vfio_pci_write, 1877 .mmap = vfio_pci_mmap, 1878 .request = vfio_pci_request, 1879 .match = vfio_pci_match, 1880 }; 1881 1882 static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); 1883 static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); 1884 1885 static int vfio_pci_bus_notifier(struct notifier_block *nb, 1886 
unsigned long action, void *data) 1887 { 1888 struct vfio_pci_device *vdev = container_of(nb, 1889 struct vfio_pci_device, nb); 1890 struct device *dev = data; 1891 struct pci_dev *pdev = to_pci_dev(dev); 1892 struct pci_dev *physfn = pci_physfn(pdev); 1893 1894 if (action == BUS_NOTIFY_ADD_DEVICE && 1895 pdev->is_virtfn && physfn == vdev->pdev) { 1896 pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", 1897 pci_name(pdev)); 1898 pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 1899 vfio_pci_ops.name); 1900 } else if (action == BUS_NOTIFY_BOUND_DRIVER && 1901 pdev->is_virtfn && physfn == vdev->pdev) { 1902 struct pci_driver *drv = pci_dev_driver(pdev); 1903 1904 if (drv && drv != &vfio_pci_driver) 1905 pci_warn(vdev->pdev, 1906 "VF %s bound to driver %s while PF bound to vfio-pci\n", 1907 pci_name(pdev), drv->name); 1908 } 1909 1910 return 0; 1911 } 1912 1913 static int vfio_pci_vf_init(struct vfio_pci_device *vdev) 1914 { 1915 struct pci_dev *pdev = vdev->pdev; 1916 int ret; 1917 1918 if (!pdev->is_physfn) 1919 return 0; 1920 1921 vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); 1922 if (!vdev->vf_token) 1923 return -ENOMEM; 1924 1925 mutex_init(&vdev->vf_token->lock); 1926 uuid_gen(&vdev->vf_token->uuid); 1927 1928 vdev->nb.notifier_call = vfio_pci_bus_notifier; 1929 ret = bus_register_notifier(&pci_bus_type, &vdev->nb); 1930 if (ret) { 1931 kfree(vdev->vf_token); 1932 return ret; 1933 } 1934 return 0; 1935 } 1936 1937 static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) 1938 { 1939 if (!vdev->vf_token) 1940 return; 1941 1942 bus_unregister_notifier(&pci_bus_type, &vdev->nb); 1943 WARN_ON(vdev->vf_token->users); 1944 mutex_destroy(&vdev->vf_token->lock); 1945 kfree(vdev->vf_token); 1946 } 1947 1948 static int vfio_pci_vga_init(struct vfio_pci_device *vdev) 1949 { 1950 struct pci_dev *pdev = vdev->pdev; 1951 int ret; 1952 1953 if (!vfio_pci_is_vga(pdev)) 1954 return 0; 1955 1956 ret = vga_client_register(pdev, vfio_pci_set_decode); 1957 if (ret) 1958 return ret; 1959 vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false)); 1960 return 0; 1961 } 1962 1963 static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) 1964 { 1965 struct pci_dev *pdev = vdev->pdev; 1966 1967 if (!vfio_pci_is_vga(pdev)) 1968 return; 1969 vga_client_unregister(pdev); 1970 vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 1971 VGA_RSRC_LEGACY_IO | 1972 VGA_RSRC_LEGACY_MEM); 1973 } 1974 1975 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 1976 { 1977 struct vfio_pci_device *vdev; 1978 struct iommu_group *group; 1979 int ret; 1980 1981 if (vfio_pci_is_denylisted(pdev)) 1982 return -EINVAL; 1983 1984 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1985 return -EINVAL; 1986 1987 /* 1988 * Prevent binding to PFs with VFs enabled, the VFs might be in use 1989 * by the host or other users. We cannot capture the VFs if they 1990 * already exist, nor can we track VF users. Disabling SR-IOV here 1991 * would initiate removing the VFs, which would unbind the driver, 1992 * which is prone to blocking if that VF is also in use by vfio-pci. 1993 * Just reject these PFs and let the user sort it out. 
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	if (vfio_pci_is_denylisted(pdev))
		return -EINVAL;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/*
	 * Prevent binding to PFs with VFs enabled; the VFs might be in use
	 * by the host or other users. We cannot capture the VFs if they
	 * already exist, nor can we track VF users. Disabling SR-IOV here
	 * would initiate removing the VFs, which would unbind the driver
	 * and is prone to blocking if that VF is also in use by vfio-pci.
	 * Just reject these PFs and let the user sort it out.
	 */
	if (pci_num_vf(pdev)) {
		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
		return -EBUSY;
	}

	group = vfio_iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		ret = -ENOMEM;
		goto out_group_put;
	}

	vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops);
	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	mutex_init(&vdev->ioeventfds_lock);
	INIT_LIST_HEAD(&vdev->dummy_resources_list);
	INIT_LIST_HEAD(&vdev->ioeventfds_list);
	mutex_init(&vdev->vma_lock);
	INIT_LIST_HEAD(&vdev->vma_list);
	init_rwsem(&vdev->memory_lock);

	ret = vfio_pci_reflck_attach(vdev);
	if (ret)
		goto out_free;
	ret = vfio_pci_vf_init(vdev);
	if (ret)
		goto out_reflck;
	ret = vfio_pci_vga_init(vdev);
	if (ret)
		goto out_vf;

	vfio_pci_probe_power_state(vdev);

	if (!disable_idle_d3) {
		/*
		 * pci-core sets the device power state to an unknown value at
		 * bootup and after being removed from a driver. The only
		 * transition it allows from this unknown state is to D0, which
		 * typically happens when a driver calls pci_enable_device().
		 * We're not ready to enable the device yet, but we do want to
		 * be able to get to D3. Therefore first do a D0 transition
		 * before going to D3.
		 */
		vfio_pci_set_power_state(vdev, PCI_D0);
		vfio_pci_set_power_state(vdev, PCI_D3hot);
	}

	ret = vfio_register_group_dev(&vdev->vdev);
	if (ret)
		goto out_power;
	dev_set_drvdata(&pdev->dev, vdev);
	return 0;

out_power:
	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);
out_vf:
	vfio_pci_vf_uninit(vdev);
out_reflck:
	vfio_pci_reflck_put(vdev->reflck);
out_free:
	kfree(vdev->pm_save);
	kfree(vdev);
out_group_put:
	vfio_iommu_group_put(group, &pdev->dev);
	return ret;
}

static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev);

	pci_disable_sriov(pdev);

	vfio_unregister_group_dev(&vdev->vdev);

	vfio_pci_vf_uninit(vdev);
	vfio_pci_reflck_put(vdev->reflck);
	vfio_pci_vga_uninit(vdev);

	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);

	mutex_destroy(&vdev->ioeventfds_lock);
	kfree(vdev->region);
	kfree(vdev->pm_save);
	kfree(vdev);
}

static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = container_of(device, struct vfio_pci_device, vdev);

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}

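/*
 * sriov_configure callback: enable or disable VFs on a vfio-pci bound PF,
 * but only when the enable_sriov module parameter is set. A vfio_device
 * reference is held across the pci_enable_sriov()/pci_disable_sriov()
 * call.
 */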
static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
{
	struct vfio_device *device;
	int ret = 0;

	might_sleep();

	if (!enable_sriov)
		return -ENOENT;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -ENODEV;

	if (nr_virtfn == 0)
		pci_disable_sriov(pdev);
	else
		ret = pci_enable_sriov(pdev, nr_virtfn);

	vfio_device_put(device);

	return ret < 0 ? ret : nr_virtfn;
}

static const struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name = "vfio-pci",
	.id_table = NULL, /* only dynamic ids */
	.probe = vfio_pci_probe,
	.remove = vfio_pci_remove,
	.sriov_configure = vfio_pci_sriov_configure,
	.err_handler = &vfio_err_handlers,
};

static DEFINE_MUTEX(reflck_lock);

static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
{
	struct vfio_pci_reflck *reflck;

	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
	if (!reflck)
		return ERR_PTR(-ENOMEM);

	kref_init(&reflck->kref);
	mutex_init(&reflck->lock);

	return reflck;
}

static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
{
	kref_get(&reflck->kref);
}

static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_reflck **preflck = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return 0;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return 0;
	}

	vdev = container_of(device, struct vfio_pci_device, vdev);

	if (vdev->reflck) {
		vfio_pci_reflck_get(vdev->reflck);
		*preflck = vdev->reflck;
		vfio_device_put(device);
		return 1;
	}

	vfio_device_put(device);
	return 0;
}

static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
{
	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);

	mutex_lock(&reflck_lock);

	if (pci_is_root_bus(vdev->pdev->bus) ||
	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
					  &vdev->reflck, slot) <= 0)
		vdev->reflck = vfio_pci_reflck_alloc();

	mutex_unlock(&reflck_lock);

	return PTR_ERR_OR_ZERO(vdev->reflck);
}

static void vfio_pci_reflck_release(struct kref *kref)
{
	struct vfio_pci_reflck *reflck = container_of(kref,
						      struct vfio_pci_reflck,
						      kref);

	kfree(reflck);
	mutex_unlock(&reflck_lock);
}

static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
{
	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
}

static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = container_of(device, struct vfio_pci_device, vdev);

	/* Fault if the device is not unused */
	if (vdev->refcnt) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = vdev;
	return 0;
}

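/*
 * Walker callback: take a reference on each vfio-pci bound device in the
 * bus/slot reset group and try to zap its mappings and acquire its
 * vma_lock, backing off with -EBUSY rather than blocking on contention.
 */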
static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = container_of(device, struct vfio_pci_device, vdev);

	/*
	 * Locking multiple devices is prone to deadlock; run away and
	 * unwind if we hit contention.
	 */
	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = vdev;
	return 0;
}

/*
 * If a bus or slot reset is available for the provided device and:
 *  - All of the devices affected by that bus or slot reset are unused
 *    (!refcnt)
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset. Callers are required
 * to hold vdev->reflck->lock, protecting the bus/slot reset group from
 * concurrent opens. A vfio_device reference is acquired for each device
 * to prevent unbinds during the reset operation.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport. Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
	bool slot = false;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					  &i, slot) || !i)
		return;

	devs.max_index = i;
	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
	if (!devs.devices)
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_get_unused_devs,
					  &devs, slot))
		goto put_devs;

	/* Does at least one need a reset? */
	for (i = 0; i < devs.cur_index; i++) {
		tmp = devs.devices[i];
		if (tmp->needs_reset) {
			ret = pci_reset_bus(vdev->pdev);
			break;
		}
	}

put_devs:
	for (i = 0; i < devs.cur_index; i++) {
		tmp = devs.devices[i];

		/*
		 * If reset was successful, affected devices no longer need
		 * a reset and we should return all the collateral devices
		 * to low power. If not successful, we either didn't reset
		 * the bus or timed out waiting for it, so let's not touch
		 * the power state.
		 */
		if (!ret) {
			tmp->needs_reset = false;

			if (tmp != vdev && !disable_idle_d3)
				vfio_pci_set_power_state(tmp, PCI_D3hot);
		}

		vfio_device_put(&tmp->vdev);
	}

	kfree(devs.devices);
}

static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_uninit_perm_bits();
}

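/*
 * Parse the ids module parameter: each comma-separated entry supplies up
 * to six colon-separated hex fields (vendor, device, subvendor, subdevice,
 * class, class_mask), of which at least vendor and device are required,
 * and is added to vfio-pci as a dynamic PCI ID.
 */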
static void __init vfio_pci_fill_ids(void)
{
	char *p, *id;
	int rc;

	/* No ids passed via the module parameter */
	if (ids[0] == '\0')
		return;

	/* Add each id specified in the module parameter */
	p = ids;
	while ((id = strsep(&p, ","))) {
		unsigned int vendor, device, subvendor = PCI_ANY_ID,
			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
		int fields;

		if (!strlen(id))
			continue;

		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
				&vendor, &device, &subvendor, &subdevice,
				&class, &class_mask);

		if (fields < 2) {
			pr_warn("invalid id string \"%s\"\n", id);
			continue;
		}

		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
				   subvendor, subdevice, class, class_mask, 0);
		if (rc)
			pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
				vendor, device, subvendor, subdevice,
				class, class_mask, rc);
		else
			pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
				vendor, device, subvendor, subdevice,
				class, class_mask);
	}
}

static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	vfio_pci_fill_ids();

	if (disable_denylist)
		pr_warn("device denylist disabled.\n");

	return 0;

out_driver:
	vfio_pci_uninit_perm_bits();
	return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);