1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 4 * Author: Alex Williamson <alex.williamson@redhat.com> 5 * 6 * Derived from original vfio: 7 * Copyright 2010 Cisco Systems, Inc. All rights reserved. 8 * Author: Tom Lyon, pugs@cisco.com 9 */ 10 11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 13 #include <linux/device.h> 14 #include <linux/eventfd.h> 15 #include <linux/file.h> 16 #include <linux/interrupt.h> 17 #include <linux/iommu.h> 18 #include <linux/module.h> 19 #include <linux/mutex.h> 20 #include <linux/notifier.h> 21 #include <linux/pci.h> 22 #include <linux/pm_runtime.h> 23 #include <linux/slab.h> 24 #include <linux/types.h> 25 #include <linux/uaccess.h> 26 #include <linux/vfio.h> 27 #include <linux/vgaarb.h> 28 #include <linux/nospec.h> 29 #include <linux/sched/mm.h> 30 31 #include "vfio_pci_private.h" 32 33 #define DRIVER_VERSION "0.2" 34 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 35 #define DRIVER_DESC "VFIO PCI - User Level meta-driver" 36 37 static char ids[1024] __initdata; 38 module_param_string(ids, ids, sizeof(ids), 0); 39 MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified"); 40 41 static bool nointxmask; 42 module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); 43 MODULE_PARM_DESC(nointxmask, 44 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); 45 46 #ifdef CONFIG_VFIO_PCI_VGA 47 static bool disable_vga; 48 module_param(disable_vga, bool, S_IRUGO); 49 MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci"); 50 #endif 51 52 static bool disable_idle_d3; 53 module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); 54 MODULE_PARM_DESC(disable_idle_d3, 55 "Disable using the PCI D3 low power state for idle, unused devices"); 56 57 static bool enable_sriov; 58 #ifdef CONFIG_PCI_IOV 59 module_param(enable_sriov, bool, 0644); 60 MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); 61 #endif 62 63 static inline bool vfio_vga_disabled(void) 64 { 65 #ifdef CONFIG_VFIO_PCI_VGA 66 return disable_vga; 67 #else 68 return true; 69 #endif 70 } 71 72 /* 73 * Our VGA arbiter participation is limited since we don't know anything 74 * about the device itself. However, if the device is the only VGA device 75 * downstream of a bridge and VFIO VGA support is disabled, then we can 76 * safely return legacy VGA IO and memory as not decoded since the user 77 * has no way to get to it and routing can be disabled externally at the 78 * bridge. 
79 */ 80 static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) 81 { 82 struct vfio_pci_device *vdev = opaque; 83 struct pci_dev *tmp = NULL, *pdev = vdev->pdev; 84 unsigned char max_busnr; 85 unsigned int decodes; 86 87 if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) 88 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 89 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 90 91 max_busnr = pci_bus_max_busnr(pdev->bus); 92 decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 93 94 while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { 95 if (tmp == pdev || 96 pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || 97 pci_is_root_bus(tmp->bus)) 98 continue; 99 100 if (tmp->bus->number >= pdev->bus->number && 101 tmp->bus->number <= max_busnr) { 102 pci_dev_put(tmp); 103 decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 104 break; 105 } 106 } 107 108 return decodes; 109 } 110 111 static inline bool vfio_pci_is_vga(struct pci_dev *pdev) 112 { 113 return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; 114 } 115 116 static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) 117 { 118 struct resource *res; 119 int i; 120 struct vfio_pci_dummy_resource *dummy_res; 121 122 INIT_LIST_HEAD(&vdev->dummy_resources_list); 123 124 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 125 int bar = i + PCI_STD_RESOURCES; 126 127 res = &vdev->pdev->resource[bar]; 128 129 if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) 130 goto no_mmap; 131 132 if (!(res->flags & IORESOURCE_MEM)) 133 goto no_mmap; 134 135 /* 136 * The PCI core shouldn't set up a resource with a 137 * type but zero size. But there may be bugs that 138 * cause us to do that. 139 */ 140 if (!resource_size(res)) 141 goto no_mmap; 142 143 if (resource_size(res) >= PAGE_SIZE) { 144 vdev->bar_mmap_supported[bar] = true; 145 continue; 146 } 147 148 if (!(res->start & ~PAGE_MASK)) { 149 /* 150 * Add a dummy resource to reserve the remainder 151 * of the exclusive page in case that hot-add 152 * device's bar is assigned into it. 153 */ 154 dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); 155 if (dummy_res == NULL) 156 goto no_mmap; 157 158 dummy_res->resource.name = "vfio sub-page reserved"; 159 dummy_res->resource.start = res->end + 1; 160 dummy_res->resource.end = res->start + PAGE_SIZE - 1; 161 dummy_res->resource.flags = res->flags; 162 if (request_resource(res->parent, 163 &dummy_res->resource)) { 164 kfree(dummy_res); 165 goto no_mmap; 166 } 167 dummy_res->index = bar; 168 list_add(&dummy_res->res_next, 169 &vdev->dummy_resources_list); 170 vdev->bar_mmap_supported[bar] = true; 171 continue; 172 } 173 /* 174 * Here we don't handle the case when the BAR is not page 175 * aligned because we can't expect the BAR will be 176 * assigned into the same location in a page in guest 177 * when we passthrough the BAR. And it's hard to access 178 * this BAR in userspace because we have no way to get 179 * the BAR's location in a page. 180 */ 181 no_mmap: 182 vdev->bar_mmap_supported[bar] = false; 183 } 184 } 185 186 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); 187 static void vfio_pci_disable(struct vfio_pci_device *vdev); 188 static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data); 189 190 /* 191 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND 192 * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. 
193 * If a device implements the former but not the latter we would typically 194 * expect broken_intx_masking be set and require an exclusive interrupt. 195 * However since we do have control of the device's ability to assert INTx, 196 * we can instead pretend that the device does not implement INTx, virtualizing 197 * the pin register to report zero and maintaining DisINTx set on the host. 198 */ 199 static bool vfio_pci_nointx(struct pci_dev *pdev) 200 { 201 switch (pdev->vendor) { 202 case PCI_VENDOR_ID_INTEL: 203 switch (pdev->device) { 204 /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ 205 case 0x1572: 206 case 0x1574: 207 case 0x1580 ... 0x1581: 208 case 0x1583 ... 0x158b: 209 case 0x37d0 ... 0x37d2: 210 return true; 211 default: 212 return false; 213 } 214 } 215 216 return false; 217 } 218 219 static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev) 220 { 221 struct pci_dev *pdev = vdev->pdev; 222 u16 pmcsr; 223 224 if (!pdev->pm_cap) 225 return; 226 227 pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); 228 229 vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); 230 } 231 232 /* 233 * pci_set_power_state() wrapper handling devices which perform a soft reset on 234 * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, 235 * restore when returned to D0. Saved separately from pci_saved_state for use 236 * by PM capability emulation and separately from pci_dev internal saved state 237 * to avoid it being overwritten and consumed around other resets. 238 */ 239 int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) 240 { 241 struct pci_dev *pdev = vdev->pdev; 242 bool needs_restore = false, needs_save = false; 243 int ret; 244 245 if (vdev->needs_pm_restore) { 246 if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { 247 pci_save_state(pdev); 248 needs_save = true; 249 } 250 251 if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) 252 needs_restore = true; 253 } 254 255 ret = pci_set_power_state(pdev, state); 256 257 if (!ret) { 258 /* D3 might be unsupported via quirk, skip unless in D3 */ 259 if (needs_save && pdev->current_state >= PCI_D3hot) { 260 vdev->pm_save = pci_store_saved_state(pdev); 261 } else if (needs_restore) { 262 pci_load_and_free_saved_state(pdev, &vdev->pm_save); 263 pci_restore_state(pdev); 264 } 265 } 266 267 return ret; 268 } 269 270 static int vfio_pci_enable(struct vfio_pci_device *vdev) 271 { 272 struct pci_dev *pdev = vdev->pdev; 273 int ret; 274 u16 cmd; 275 u8 msix_pos; 276 277 vfio_pci_set_power_state(vdev, PCI_D0); 278 279 /* Don't allow our initial saved state to include busmaster */ 280 pci_clear_master(pdev); 281 282 ret = pci_enable_device(pdev); 283 if (ret) 284 return ret; 285 286 /* If reset fails because of the device lock, fail this path entirely */ 287 ret = pci_try_reset_function(pdev); 288 if (ret == -EAGAIN) { 289 pci_disable_device(pdev); 290 return ret; 291 } 292 293 vdev->reset_works = !ret; 294 pci_save_state(pdev); 295 vdev->pci_saved_state = pci_store_saved_state(pdev); 296 if (!vdev->pci_saved_state) 297 pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); 298 299 if (likely(!nointxmask)) { 300 if (vfio_pci_nointx(pdev)) { 301 pci_info(pdev, "Masking broken INTx support\n"); 302 vdev->nointx = true; 303 pci_intx(pdev, 0); 304 } else 305 vdev->pci_2_3 = pci_intx_mask_supported(pdev); 306 } 307 308 pci_read_config_word(pdev, PCI_COMMAND, &cmd); 309 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 310 cmd &= 
~PCI_COMMAND_INTX_DISABLE; 311 pci_write_config_word(pdev, PCI_COMMAND, cmd); 312 } 313 314 ret = vfio_config_init(vdev); 315 if (ret) { 316 kfree(vdev->pci_saved_state); 317 vdev->pci_saved_state = NULL; 318 pci_disable_device(pdev); 319 return ret; 320 } 321 322 msix_pos = pdev->msix_cap; 323 if (msix_pos) { 324 u16 flags; 325 u32 table; 326 327 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 328 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 329 330 vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 331 vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 332 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 333 } else 334 vdev->msix_bar = 0xFF; 335 336 if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 337 vdev->has_vga = true; 338 339 340 if (vfio_pci_is_vga(pdev) && 341 pdev->vendor == PCI_VENDOR_ID_INTEL && 342 IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { 343 ret = vfio_pci_igd_init(vdev); 344 if (ret) { 345 pci_warn(pdev, "Failed to setup Intel IGD regions\n"); 346 goto disable_exit; 347 } 348 } 349 350 if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && 351 IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { 352 ret = vfio_pci_nvdia_v100_nvlink2_init(vdev); 353 if (ret && ret != -ENODEV) { 354 pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); 355 goto disable_exit; 356 } 357 } 358 359 if (pdev->vendor == PCI_VENDOR_ID_IBM && 360 IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { 361 ret = vfio_pci_ibm_npu2_init(vdev); 362 if (ret && ret != -ENODEV) { 363 pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); 364 goto disable_exit; 365 } 366 } 367 368 vfio_pci_probe_mmaps(vdev); 369 370 return 0; 371 372 disable_exit: 373 vfio_pci_disable(vdev); 374 return ret; 375 } 376 377 static void vfio_pci_disable(struct vfio_pci_device *vdev) 378 { 379 struct pci_dev *pdev = vdev->pdev; 380 struct vfio_pci_dummy_resource *dummy_res, *tmp; 381 struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; 382 int i, bar; 383 384 /* Stop the device from further DMA */ 385 pci_clear_master(pdev); 386 387 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 388 VFIO_IRQ_SET_ACTION_TRIGGER, 389 vdev->irq_type, 0, 0, NULL); 390 391 /* Device closed, don't need mutex here */ 392 list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, 393 &vdev->ioeventfds_list, next) { 394 vfio_virqfd_disable(&ioeventfd->virqfd); 395 list_del(&ioeventfd->next); 396 kfree(ioeventfd); 397 } 398 vdev->ioeventfds_nr = 0; 399 400 vdev->virq_disabled = false; 401 402 for (i = 0; i < vdev->num_regions; i++) 403 vdev->region[i].ops->release(vdev, &vdev->region[i]); 404 405 vdev->num_regions = 0; 406 kfree(vdev->region); 407 vdev->region = NULL; /* don't krealloc a freed pointer */ 408 409 vfio_config_free(vdev); 410 411 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 412 bar = i + PCI_STD_RESOURCES; 413 if (!vdev->barmap[bar]) 414 continue; 415 pci_iounmap(pdev, vdev->barmap[bar]); 416 pci_release_selected_regions(pdev, 1 << bar); 417 vdev->barmap[bar] = NULL; 418 } 419 420 list_for_each_entry_safe(dummy_res, tmp, 421 &vdev->dummy_resources_list, res_next) { 422 list_del(&dummy_res->res_next); 423 release_resource(&dummy_res->resource); 424 kfree(dummy_res); 425 } 426 427 vdev->needs_reset = true; 428 429 /* 430 * If we have saved state, restore it. If we can reset the device, 431 * even better. Resetting with current state seems better than 432 * nothing, but saving and restoring current state without reset 433 * is just busy work. 
434 */ 435 if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 436 pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); 437 438 if (!vdev->reset_works) 439 goto out; 440 441 pci_save_state(pdev); 442 } 443 444 /* 445 * Disable INTx and MSI, presumably to avoid spurious interrupts 446 * during reset. Stolen from pci_reset_function() 447 */ 448 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 449 450 /* 451 * Try to get the locks ourselves to prevent a deadlock. The 452 * success of this is dependent on being able to lock the device, 453 * which is not always possible. 454 * We can not use the "try" reset interface here, which will 455 * overwrite the previously restored configuration information. 456 */ 457 if (vdev->reset_works && pci_cfg_access_trylock(pdev)) { 458 if (device_trylock(&pdev->dev)) { 459 if (!__pci_reset_function_locked(pdev)) 460 vdev->needs_reset = false; 461 device_unlock(&pdev->dev); 462 } 463 pci_cfg_access_unlock(pdev); 464 } 465 466 pci_restore_state(pdev); 467 out: 468 pci_disable_device(pdev); 469 470 vfio_pci_try_bus_reset(vdev); 471 472 if (!disable_idle_d3) 473 vfio_pci_set_power_state(vdev, PCI_D3hot); 474 } 475 476 static struct pci_driver vfio_pci_driver; 477 478 static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev, 479 struct vfio_device **pf_dev) 480 { 481 struct pci_dev *physfn = pci_physfn(vdev->pdev); 482 483 if (!vdev->pdev->is_virtfn) 484 return NULL; 485 486 *pf_dev = vfio_device_get_from_dev(&physfn->dev); 487 if (!*pf_dev) 488 return NULL; 489 490 if (pci_dev_driver(physfn) != &vfio_pci_driver) { 491 vfio_device_put(*pf_dev); 492 return NULL; 493 } 494 495 return vfio_device_data(*pf_dev); 496 } 497 498 static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) 499 { 500 struct vfio_device *pf_dev; 501 struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); 502 503 if (!pf_vdev) 504 return; 505 506 mutex_lock(&pf_vdev->vf_token->lock); 507 pf_vdev->vf_token->users += val; 508 WARN_ON(pf_vdev->vf_token->users < 0); 509 mutex_unlock(&pf_vdev->vf_token->lock); 510 511 vfio_device_put(pf_dev); 512 } 513 514 static void vfio_pci_release(void *device_data) 515 { 516 struct vfio_pci_device *vdev = device_data; 517 518 mutex_lock(&vdev->reflck->lock); 519 520 if (!(--vdev->refcnt)) { 521 vfio_pci_vf_token_user_add(vdev, -1); 522 vfio_spapr_pci_eeh_release(vdev->pdev); 523 vfio_pci_disable(vdev); 524 if (vdev->err_trigger) 525 eventfd_ctx_put(vdev->err_trigger); 526 if (vdev->req_trigger) 527 eventfd_ctx_put(vdev->req_trigger); 528 } 529 530 mutex_unlock(&vdev->reflck->lock); 531 532 module_put(THIS_MODULE); 533 } 534 535 static int vfio_pci_open(void *device_data) 536 { 537 struct vfio_pci_device *vdev = device_data; 538 int ret = 0; 539 540 if (!try_module_get(THIS_MODULE)) 541 return -ENODEV; 542 543 mutex_lock(&vdev->reflck->lock); 544 545 if (!vdev->refcnt) { 546 ret = vfio_pci_enable(vdev); 547 if (ret) 548 goto error; 549 550 vfio_spapr_pci_eeh_open(vdev->pdev); 551 vfio_pci_vf_token_user_add(vdev, 1); 552 } 553 vdev->refcnt++; 554 error: 555 mutex_unlock(&vdev->reflck->lock); 556 if (ret) 557 module_put(THIS_MODULE); 558 return ret; 559 } 560 561 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) 562 { 563 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 564 u8 pin; 565 566 if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 567 vdev->nointx || vdev->pdev->is_virtfn) 568 return 0; 569 570 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 571 572 
return pin ? 1 : 0; 573 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 574 u8 pos; 575 u16 flags; 576 577 pos = vdev->pdev->msi_cap; 578 if (pos) { 579 pci_read_config_word(vdev->pdev, 580 pos + PCI_MSI_FLAGS, &flags); 581 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 582 } 583 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 584 u8 pos; 585 u16 flags; 586 587 pos = vdev->pdev->msix_cap; 588 if (pos) { 589 pci_read_config_word(vdev->pdev, 590 pos + PCI_MSIX_FLAGS, &flags); 591 592 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 593 } 594 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 595 if (pci_is_pcie(vdev->pdev)) 596 return 1; 597 } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 598 return 1; 599 } 600 601 return 0; 602 } 603 604 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 605 { 606 (*(int *)data)++; 607 return 0; 608 } 609 610 struct vfio_pci_fill_info { 611 int max; 612 int cur; 613 struct vfio_pci_dependent_device *devices; 614 }; 615 616 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 617 { 618 struct vfio_pci_fill_info *fill = data; 619 struct iommu_group *iommu_group; 620 621 if (fill->cur == fill->max) 622 return -EAGAIN; /* Something changed, try again */ 623 624 iommu_group = iommu_group_get(&pdev->dev); 625 if (!iommu_group) 626 return -EPERM; /* Cannot reset non-isolated devices */ 627 628 fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 629 fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 630 fill->devices[fill->cur].bus = pdev->bus->number; 631 fill->devices[fill->cur].devfn = pdev->devfn; 632 fill->cur++; 633 iommu_group_put(iommu_group); 634 return 0; 635 } 636 637 struct vfio_pci_group_entry { 638 struct vfio_group *group; 639 int id; 640 }; 641 642 struct vfio_pci_group_info { 643 int count; 644 struct vfio_pci_group_entry *groups; 645 }; 646 647 static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) 648 { 649 struct vfio_pci_group_info *info = data; 650 struct iommu_group *group; 651 int id, i; 652 653 group = iommu_group_get(&pdev->dev); 654 if (!group) 655 return -EPERM; 656 657 id = iommu_group_id(group); 658 659 for (i = 0; i < info->count; i++) 660 if (info->groups[i].id == id) 661 break; 662 663 iommu_group_put(group); 664 665 return (i == info->count) ? 
-EINVAL : 0; 666 } 667 668 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 669 { 670 for (; pdev; pdev = pdev->bus->self) 671 if (pdev->bus == slot->bus) 672 return (pdev->slot == slot); 673 return false; 674 } 675 676 struct vfio_pci_walk_info { 677 int (*fn)(struct pci_dev *, void *data); 678 void *data; 679 struct pci_dev *pdev; 680 bool slot; 681 int ret; 682 }; 683 684 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 685 { 686 struct vfio_pci_walk_info *walk = data; 687 688 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 689 walk->ret = walk->fn(pdev, walk->data); 690 691 return walk->ret; 692 } 693 694 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 695 int (*fn)(struct pci_dev *, 696 void *data), void *data, 697 bool slot) 698 { 699 struct vfio_pci_walk_info walk = { 700 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 701 }; 702 703 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 704 705 return walk.ret; 706 } 707 708 static int msix_mmappable_cap(struct vfio_pci_device *vdev, 709 struct vfio_info_cap *caps) 710 { 711 struct vfio_info_cap_header header = { 712 .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, 713 .version = 1 714 }; 715 716 return vfio_info_add_capability(caps, &header, sizeof(header)); 717 } 718 719 int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 720 unsigned int type, unsigned int subtype, 721 const struct vfio_pci_regops *ops, 722 size_t size, u32 flags, void *data) 723 { 724 struct vfio_pci_region *region; 725 726 region = krealloc(vdev->region, 727 (vdev->num_regions + 1) * sizeof(*region), 728 GFP_KERNEL); 729 if (!region) 730 return -ENOMEM; 731 732 vdev->region = region; 733 vdev->region[vdev->num_regions].type = type; 734 vdev->region[vdev->num_regions].subtype = subtype; 735 vdev->region[vdev->num_regions].ops = ops; 736 vdev->region[vdev->num_regions].size = size; 737 vdev->region[vdev->num_regions].flags = flags; 738 vdev->region[vdev->num_regions].data = data; 739 740 vdev->num_regions++; 741 742 return 0; 743 } 744 745 struct vfio_devices { 746 struct vfio_device **devices; 747 int cur_index; 748 int max_index; 749 }; 750 751 static long vfio_pci_ioctl(void *device_data, 752 unsigned int cmd, unsigned long arg) 753 { 754 struct vfio_pci_device *vdev = device_data; 755 unsigned long minsz; 756 757 if (cmd == VFIO_DEVICE_GET_INFO) { 758 struct vfio_device_info info; 759 760 minsz = offsetofend(struct vfio_device_info, num_irqs); 761 762 if (copy_from_user(&info, (void __user *)arg, minsz)) 763 return -EFAULT; 764 765 if (info.argsz < minsz) 766 return -EINVAL; 767 768 info.flags = VFIO_DEVICE_FLAGS_PCI; 769 770 if (vdev->reset_works) 771 info.flags |= VFIO_DEVICE_FLAGS_RESET; 772 773 info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 774 info.num_irqs = VFIO_PCI_NUM_IRQS; 775 776 return copy_to_user((void __user *)arg, &info, minsz) ? 
777 -EFAULT : 0; 778 779 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 780 struct pci_dev *pdev = vdev->pdev; 781 struct vfio_region_info info; 782 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 783 int i, ret; 784 785 minsz = offsetofend(struct vfio_region_info, offset); 786 787 if (copy_from_user(&info, (void __user *)arg, minsz)) 788 return -EFAULT; 789 790 if (info.argsz < minsz) 791 return -EINVAL; 792 793 switch (info.index) { 794 case VFIO_PCI_CONFIG_REGION_INDEX: 795 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 796 info.size = pdev->cfg_size; 797 info.flags = VFIO_REGION_INFO_FLAG_READ | 798 VFIO_REGION_INFO_FLAG_WRITE; 799 break; 800 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 801 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 802 info.size = pci_resource_len(pdev, info.index); 803 if (!info.size) { 804 info.flags = 0; 805 break; 806 } 807 808 info.flags = VFIO_REGION_INFO_FLAG_READ | 809 VFIO_REGION_INFO_FLAG_WRITE; 810 if (vdev->bar_mmap_supported[info.index]) { 811 info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 812 if (info.index == vdev->msix_bar) { 813 ret = msix_mmappable_cap(vdev, &caps); 814 if (ret) 815 return ret; 816 } 817 } 818 819 break; 820 case VFIO_PCI_ROM_REGION_INDEX: 821 { 822 void __iomem *io; 823 size_t size; 824 u16 cmd; 825 826 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 827 info.flags = 0; 828 829 /* Report the BAR size, not the ROM size */ 830 info.size = pci_resource_len(pdev, info.index); 831 if (!info.size) { 832 /* Shadow ROMs appear as PCI option ROMs */ 833 if (pdev->resource[PCI_ROM_RESOURCE].flags & 834 IORESOURCE_ROM_SHADOW) 835 info.size = 0x20000; 836 else 837 break; 838 } 839 840 /* 841 * Is it really there? Enable memory decode for 842 * implicit access in pci_map_rom(). 
843 */ 844 cmd = vfio_pci_memory_lock_and_enable(vdev); 845 io = pci_map_rom(pdev, &size); 846 if (io) { 847 info.flags = VFIO_REGION_INFO_FLAG_READ; 848 pci_unmap_rom(pdev, io); 849 } else { 850 info.size = 0; 851 } 852 vfio_pci_memory_unlock_and_restore(vdev, cmd); 853 854 break; 855 } 856 case VFIO_PCI_VGA_REGION_INDEX: 857 if (!vdev->has_vga) 858 return -EINVAL; 859 860 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 861 info.size = 0xc0000; 862 info.flags = VFIO_REGION_INFO_FLAG_READ | 863 VFIO_REGION_INFO_FLAG_WRITE; 864 865 break; 866 default: 867 { 868 struct vfio_region_info_cap_type cap_type = { 869 .header.id = VFIO_REGION_INFO_CAP_TYPE, 870 .header.version = 1 }; 871 872 if (info.index >= 873 VFIO_PCI_NUM_REGIONS + vdev->num_regions) 874 return -EINVAL; 875 info.index = array_index_nospec(info.index, 876 VFIO_PCI_NUM_REGIONS + 877 vdev->num_regions); 878 879 i = info.index - VFIO_PCI_NUM_REGIONS; 880 881 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 882 info.size = vdev->region[i].size; 883 info.flags = vdev->region[i].flags; 884 885 cap_type.type = vdev->region[i].type; 886 cap_type.subtype = vdev->region[i].subtype; 887 888 ret = vfio_info_add_capability(&caps, &cap_type.header, 889 sizeof(cap_type)); 890 if (ret) 891 return ret; 892 893 if (vdev->region[i].ops->add_capability) { 894 ret = vdev->region[i].ops->add_capability(vdev, 895 &vdev->region[i], &caps); 896 if (ret) 897 return ret; 898 } 899 } 900 } 901 902 if (caps.size) { 903 info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 904 if (info.argsz < sizeof(info) + caps.size) { 905 info.argsz = sizeof(info) + caps.size; 906 info.cap_offset = 0; 907 } else { 908 vfio_info_cap_shift(&caps, sizeof(info)); 909 if (copy_to_user((void __user *)arg + 910 sizeof(info), caps.buf, 911 caps.size)) { 912 kfree(caps.buf); 913 return -EFAULT; 914 } 915 info.cap_offset = sizeof(info); 916 } 917 918 kfree(caps.buf); 919 } 920 921 return copy_to_user((void __user *)arg, &info, minsz) ? 922 -EFAULT : 0; 923 924 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 925 struct vfio_irq_info info; 926 927 minsz = offsetofend(struct vfio_irq_info, count); 928 929 if (copy_from_user(&info, (void __user *)arg, minsz)) 930 return -EFAULT; 931 932 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 933 return -EINVAL; 934 935 switch (info.index) { 936 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 937 case VFIO_PCI_REQ_IRQ_INDEX: 938 break; 939 case VFIO_PCI_ERR_IRQ_INDEX: 940 if (pci_is_pcie(vdev->pdev)) 941 break; 942 /* fall through */ 943 default: 944 return -EINVAL; 945 } 946 947 info.flags = VFIO_IRQ_INFO_EVENTFD; 948 949 info.count = vfio_pci_get_irq_count(vdev, info.index); 950 951 if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 952 info.flags |= (VFIO_IRQ_INFO_MASKABLE | 953 VFIO_IRQ_INFO_AUTOMASKED); 954 else 955 info.flags |= VFIO_IRQ_INFO_NORESIZE; 956 957 return copy_to_user((void __user *)arg, &info, minsz) ? 
958 -EFAULT : 0; 959 960 } else if (cmd == VFIO_DEVICE_SET_IRQS) { 961 struct vfio_irq_set hdr; 962 u8 *data = NULL; 963 int max, ret = 0; 964 size_t data_size = 0; 965 966 minsz = offsetofend(struct vfio_irq_set, count); 967 968 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 969 return -EFAULT; 970 971 max = vfio_pci_get_irq_count(vdev, hdr.index); 972 973 ret = vfio_set_irqs_validate_and_prepare(&hdr, max, 974 VFIO_PCI_NUM_IRQS, &data_size); 975 if (ret) 976 return ret; 977 978 if (data_size) { 979 data = memdup_user((void __user *)(arg + minsz), 980 data_size); 981 if (IS_ERR(data)) 982 return PTR_ERR(data); 983 } 984 985 mutex_lock(&vdev->igate); 986 987 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 988 hdr.start, hdr.count, data); 989 990 mutex_unlock(&vdev->igate); 991 kfree(data); 992 993 return ret; 994 995 } else if (cmd == VFIO_DEVICE_RESET) { 996 int ret; 997 998 if (!vdev->reset_works) 999 return -EINVAL; 1000 1001 vfio_pci_zap_and_down_write_memory_lock(vdev); 1002 ret = pci_try_reset_function(vdev->pdev); 1003 up_write(&vdev->memory_lock); 1004 1005 return ret; 1006 1007 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 1008 struct vfio_pci_hot_reset_info hdr; 1009 struct vfio_pci_fill_info fill = { 0 }; 1010 struct vfio_pci_dependent_device *devices = NULL; 1011 bool slot = false; 1012 int ret = 0; 1013 1014 minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 1015 1016 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 1017 return -EFAULT; 1018 1019 if (hdr.argsz < minsz) 1020 return -EINVAL; 1021 1022 hdr.flags = 0; 1023 1024 /* Can we do a slot or bus reset or neither? */ 1025 if (!pci_probe_reset_slot(vdev->pdev->slot)) 1026 slot = true; 1027 else if (pci_probe_reset_bus(vdev->pdev->bus)) 1028 return -ENODEV; 1029 1030 /* How many devices are affected? */ 1031 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1032 vfio_pci_count_devs, 1033 &fill.max, slot); 1034 if (ret) 1035 return ret; 1036 1037 WARN_ON(!fill.max); /* Should always be at least one */ 1038 1039 /* 1040 * If there's enough space, fill it now, otherwise return 1041 * -ENOSPC and the number of devices affected. 1042 */ 1043 if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 1044 ret = -ENOSPC; 1045 hdr.count = fill.max; 1046 goto reset_info_exit; 1047 } 1048 1049 devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 1050 if (!devices) 1051 return -ENOMEM; 1052 1053 fill.devices = devices; 1054 1055 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1056 vfio_pci_fill_devs, 1057 &fill, slot); 1058 1059 /* 1060 * If a device was removed between counting and filling, 1061 * we may come up short of fill.max. If a device was 1062 * added, we'll have a return of -EAGAIN above. 
1063 */ 1064 if (!ret) 1065 hdr.count = fill.cur; 1066 1067 reset_info_exit: 1068 if (copy_to_user((void __user *)arg, &hdr, minsz)) 1069 ret = -EFAULT; 1070 1071 if (!ret) { 1072 if (copy_to_user((void __user *)(arg + minsz), devices, 1073 hdr.count * sizeof(*devices))) 1074 ret = -EFAULT; 1075 } 1076 1077 kfree(devices); 1078 return ret; 1079 1080 } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 1081 struct vfio_pci_hot_reset hdr; 1082 int32_t *group_fds; 1083 struct vfio_pci_group_entry *groups; 1084 struct vfio_pci_group_info info; 1085 struct vfio_devices devs = { .cur_index = 0 }; 1086 bool slot = false; 1087 int i, group_idx, mem_idx = 0, count = 0, ret = 0; 1088 1089 minsz = offsetofend(struct vfio_pci_hot_reset, count); 1090 1091 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 1092 return -EFAULT; 1093 1094 if (hdr.argsz < minsz || hdr.flags) 1095 return -EINVAL; 1096 1097 /* Can we do a slot or bus reset or neither? */ 1098 if (!pci_probe_reset_slot(vdev->pdev->slot)) 1099 slot = true; 1100 else if (pci_probe_reset_bus(vdev->pdev->bus)) 1101 return -ENODEV; 1102 1103 /* 1104 * We can't let userspace give us an arbitrarily large 1105 * buffer to copy, so verify how many we think there 1106 * could be. Note groups can have multiple devices so 1107 * one group per device is the max. 1108 */ 1109 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1110 vfio_pci_count_devs, 1111 &count, slot); 1112 if (ret) 1113 return ret; 1114 1115 /* Somewhere between 1 and count is OK */ 1116 if (!hdr.count || hdr.count > count) 1117 return -EINVAL; 1118 1119 group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 1120 groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 1121 if (!group_fds || !groups) { 1122 kfree(group_fds); 1123 kfree(groups); 1124 return -ENOMEM; 1125 } 1126 1127 if (copy_from_user(group_fds, (void __user *)(arg + minsz), 1128 hdr.count * sizeof(*group_fds))) { 1129 kfree(group_fds); 1130 kfree(groups); 1131 return -EFAULT; 1132 } 1133 1134 /* 1135 * For each group_fd, get the group through the vfio external 1136 * user interface and store the group and iommu ID. This 1137 * ensures the group is held across the reset. 1138 */ 1139 for (group_idx = 0; group_idx < hdr.count; group_idx++) { 1140 struct vfio_group *group; 1141 struct fd f = fdget(group_fds[group_idx]); 1142 if (!f.file) { 1143 ret = -EBADF; 1144 break; 1145 } 1146 1147 group = vfio_group_get_external_user(f.file); 1148 fdput(f); 1149 if (IS_ERR(group)) { 1150 ret = PTR_ERR(group); 1151 break; 1152 } 1153 1154 groups[group_idx].group = group; 1155 groups[group_idx].id = 1156 vfio_external_user_iommu_id(group); 1157 } 1158 1159 kfree(group_fds); 1160 1161 /* release reference to groups on error */ 1162 if (ret) 1163 goto hot_reset_release; 1164 1165 info.count = hdr.count; 1166 info.groups = groups; 1167 1168 /* 1169 * Test whether all the affected devices are contained 1170 * by the set of groups provided by the user. 1171 */ 1172 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1173 vfio_pci_validate_devs, 1174 &info, slot); 1175 if (ret) 1176 goto hot_reset_release; 1177 1178 devs.max_index = count; 1179 devs.devices = kcalloc(count, sizeof(struct vfio_device *), 1180 GFP_KERNEL); 1181 if (!devs.devices) { 1182 ret = -ENOMEM; 1183 goto hot_reset_release; 1184 } 1185 1186 /* 1187 * We need to get memory_lock for each device, but devices 1188 * can share mmap_lock, therefore we need to zap and hold 1189 * the vma_lock for each device, and only then get each 1190 * memory_lock. 
1191 */ 1192 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1193 vfio_pci_try_zap_and_vma_lock_cb, 1194 &devs, slot); 1195 if (ret) 1196 goto hot_reset_release; 1197 1198 for (; mem_idx < devs.cur_index; mem_idx++) { 1199 struct vfio_pci_device *tmp; 1200 1201 tmp = vfio_device_data(devs.devices[mem_idx]); 1202 1203 ret = down_write_trylock(&tmp->memory_lock); 1204 if (!ret) { 1205 ret = -EBUSY; 1206 goto hot_reset_release; 1207 } 1208 mutex_unlock(&tmp->vma_lock); 1209 } 1210 1211 /* User has access, do the reset */ 1212 ret = pci_reset_bus(vdev->pdev); 1213 1214 hot_reset_release: 1215 for (i = 0; i < devs.cur_index; i++) { 1216 struct vfio_device *device; 1217 struct vfio_pci_device *tmp; 1218 1219 device = devs.devices[i]; 1220 tmp = vfio_device_data(device); 1221 1222 if (i < mem_idx) 1223 up_write(&tmp->memory_lock); 1224 else 1225 mutex_unlock(&tmp->vma_lock); 1226 vfio_device_put(device); 1227 } 1228 kfree(devs.devices); 1229 1230 for (group_idx--; group_idx >= 0; group_idx--) 1231 vfio_group_put_external_user(groups[group_idx].group); 1232 1233 kfree(groups); 1234 return ret; 1235 } else if (cmd == VFIO_DEVICE_IOEVENTFD) { 1236 struct vfio_device_ioeventfd ioeventfd; 1237 int count; 1238 1239 minsz = offsetofend(struct vfio_device_ioeventfd, fd); 1240 1241 if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) 1242 return -EFAULT; 1243 1244 if (ioeventfd.argsz < minsz) 1245 return -EINVAL; 1246 1247 if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) 1248 return -EINVAL; 1249 1250 count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; 1251 1252 if (hweight8(count) != 1 || ioeventfd.fd < -1) 1253 return -EINVAL; 1254 1255 return vfio_pci_ioeventfd(vdev, ioeventfd.offset, 1256 ioeventfd.data, count, ioeventfd.fd); 1257 } else if (cmd == VFIO_DEVICE_FEATURE) { 1258 struct vfio_device_feature feature; 1259 uuid_t uuid; 1260 1261 minsz = offsetofend(struct vfio_device_feature, flags); 1262 1263 if (copy_from_user(&feature, (void __user *)arg, minsz)) 1264 return -EFAULT; 1265 1266 if (feature.argsz < minsz) 1267 return -EINVAL; 1268 1269 /* Check unknown flags */ 1270 if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | 1271 VFIO_DEVICE_FEATURE_SET | 1272 VFIO_DEVICE_FEATURE_GET | 1273 VFIO_DEVICE_FEATURE_PROBE)) 1274 return -EINVAL; 1275 1276 /* GET & SET are mutually exclusive except with PROBE */ 1277 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1278 (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1279 (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1280 return -EINVAL; 1281 1282 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1283 case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: 1284 if (!vdev->vf_token) 1285 return -ENOTTY; 1286 1287 /* 1288 * We do not support GET of the VF Token UUID as this 1289 * could expose the token of the previous device user. 
1290 */ 1291 if (feature.flags & VFIO_DEVICE_FEATURE_GET) 1292 return -EINVAL; 1293 1294 if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) 1295 return 0; 1296 1297 /* Don't SET unless told to do so */ 1298 if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) 1299 return -EINVAL; 1300 1301 if (feature.argsz < minsz + sizeof(uuid)) 1302 return -EINVAL; 1303 1304 if (copy_from_user(&uuid, (void __user *)(arg + minsz), 1305 sizeof(uuid))) 1306 return -EFAULT; 1307 1308 mutex_lock(&vdev->vf_token->lock); 1309 uuid_copy(&vdev->vf_token->uuid, &uuid); 1310 mutex_unlock(&vdev->vf_token->lock); 1311 1312 return 0; 1313 default: 1314 return -ENOTTY; 1315 } 1316 } 1317 1318 return -ENOTTY; 1319 } 1320 1321 static ssize_t vfio_pci_rw(void *device_data, char __user *buf, 1322 size_t count, loff_t *ppos, bool iswrite) 1323 { 1324 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1325 struct vfio_pci_device *vdev = device_data; 1326 1327 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1328 return -EINVAL; 1329 1330 switch (index) { 1331 case VFIO_PCI_CONFIG_REGION_INDEX: 1332 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 1333 1334 case VFIO_PCI_ROM_REGION_INDEX: 1335 if (iswrite) 1336 return -EINVAL; 1337 return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 1338 1339 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 1340 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 1341 1342 case VFIO_PCI_VGA_REGION_INDEX: 1343 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 1344 default: 1345 index -= VFIO_PCI_NUM_REGIONS; 1346 return vdev->region[index].ops->rw(vdev, buf, 1347 count, ppos, iswrite); 1348 } 1349 1350 return -EINVAL; 1351 } 1352 1353 static ssize_t vfio_pci_read(void *device_data, char __user *buf, 1354 size_t count, loff_t *ppos) 1355 { 1356 if (!count) 1357 return 0; 1358 1359 return vfio_pci_rw(device_data, buf, count, ppos, false); 1360 } 1361 1362 static ssize_t vfio_pci_write(void *device_data, const char __user *buf, 1363 size_t count, loff_t *ppos) 1364 { 1365 if (!count) 1366 return 0; 1367 1368 return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); 1369 } 1370 1371 /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ 1372 static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) 1373 { 1374 struct vfio_pci_mmap_vma *mmap_vma, *tmp; 1375 1376 /* 1377 * Lock ordering: 1378 * vma_lock is nested under mmap_lock for vm_ops callback paths. 1379 * The memory_lock semaphore is used by both code paths calling 1380 * into this function to zap vmas and the vm_ops.fault callback 1381 * to protect the memory enable state of the device. 1382 * 1383 * When zapping vmas we need to maintain the mmap_lock => vma_lock 1384 * ordering, which requires using vma_lock to walk vma_list to 1385 * acquire an mm, then dropping vma_lock to get the mmap_lock and 1386 * reacquiring vma_lock. This logic is derived from similar 1387 * requirements in uverbs_user_mmap_disassociate(). 1388 * 1389 * mmap_lock must always be the top-level lock when it is taken. 1390 * Therefore we can only hold the memory_lock write lock when 1391 * vma_list is empty, as we'd need to take mmap_lock to clear 1392 * entries. vma_list can only be guaranteed empty when holding 1393 * vma_lock, thus memory_lock is nested under vma_lock. 1394 * 1395 * This enables the vm_ops.fault callback to acquire vma_lock, 1396 * followed by memory_lock read lock, while already holding 1397 * mmap_lock without risk of deadlock. 
1398 */ 1399 while (1) { 1400 struct mm_struct *mm = NULL; 1401 1402 if (try) { 1403 if (!mutex_trylock(&vdev->vma_lock)) 1404 return 0; 1405 } else { 1406 mutex_lock(&vdev->vma_lock); 1407 } 1408 while (!list_empty(&vdev->vma_list)) { 1409 mmap_vma = list_first_entry(&vdev->vma_list, 1410 struct vfio_pci_mmap_vma, 1411 vma_next); 1412 mm = mmap_vma->vma->vm_mm; 1413 if (mmget_not_zero(mm)) 1414 break; 1415 1416 list_del(&mmap_vma->vma_next); 1417 kfree(mmap_vma); 1418 mm = NULL; 1419 } 1420 if (!mm) 1421 return 1; 1422 mutex_unlock(&vdev->vma_lock); 1423 1424 if (try) { 1425 if (!mmap_read_trylock(mm)) { 1426 mmput(mm); 1427 return 0; 1428 } 1429 } else { 1430 mmap_read_lock(mm); 1431 } 1432 if (mmget_still_valid(mm)) { 1433 if (try) { 1434 if (!mutex_trylock(&vdev->vma_lock)) { 1435 mmap_read_unlock(mm); 1436 mmput(mm); 1437 return 0; 1438 } 1439 } else { 1440 mutex_lock(&vdev->vma_lock); 1441 } 1442 list_for_each_entry_safe(mmap_vma, tmp, 1443 &vdev->vma_list, vma_next) { 1444 struct vm_area_struct *vma = mmap_vma->vma; 1445 1446 if (vma->vm_mm != mm) 1447 continue; 1448 1449 list_del(&mmap_vma->vma_next); 1450 kfree(mmap_vma); 1451 1452 zap_vma_ptes(vma, vma->vm_start, 1453 vma->vm_end - vma->vm_start); 1454 } 1455 mutex_unlock(&vdev->vma_lock); 1456 } 1457 mmap_read_unlock(mm); 1458 mmput(mm); 1459 } 1460 } 1461 1462 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev) 1463 { 1464 vfio_pci_zap_and_vma_lock(vdev, false); 1465 down_write(&vdev->memory_lock); 1466 mutex_unlock(&vdev->vma_lock); 1467 } 1468 1469 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) 1470 { 1471 u16 cmd; 1472 1473 down_write(&vdev->memory_lock); 1474 pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); 1475 if (!(cmd & PCI_COMMAND_MEMORY)) 1476 pci_write_config_word(vdev->pdev, PCI_COMMAND, 1477 cmd | PCI_COMMAND_MEMORY); 1478 1479 return cmd; 1480 } 1481 1482 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd) 1483 { 1484 pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); 1485 up_write(&vdev->memory_lock); 1486 } 1487 1488 /* Caller holds vma_lock */ 1489 static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, 1490 struct vm_area_struct *vma) 1491 { 1492 struct vfio_pci_mmap_vma *mmap_vma; 1493 1494 mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); 1495 if (!mmap_vma) 1496 return -ENOMEM; 1497 1498 mmap_vma->vma = vma; 1499 list_add(&mmap_vma->vma_next, &vdev->vma_list); 1500 1501 return 0; 1502 } 1503 1504 /* 1505 * Zap mmaps on open so that we can fault them in on access and therefore 1506 * our vma_list only tracks mappings accessed since last zap. 
1507 */ 1508 static void vfio_pci_mmap_open(struct vm_area_struct *vma) 1509 { 1510 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1511 } 1512 1513 static void vfio_pci_mmap_close(struct vm_area_struct *vma) 1514 { 1515 struct vfio_pci_device *vdev = vma->vm_private_data; 1516 struct vfio_pci_mmap_vma *mmap_vma; 1517 1518 mutex_lock(&vdev->vma_lock); 1519 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1520 if (mmap_vma->vma == vma) { 1521 list_del(&mmap_vma->vma_next); 1522 kfree(mmap_vma); 1523 break; 1524 } 1525 } 1526 mutex_unlock(&vdev->vma_lock); 1527 } 1528 1529 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) 1530 { 1531 struct vm_area_struct *vma = vmf->vma; 1532 struct vfio_pci_device *vdev = vma->vm_private_data; 1533 vm_fault_t ret = VM_FAULT_NOPAGE; 1534 1535 mutex_lock(&vdev->vma_lock); 1536 down_read(&vdev->memory_lock); 1537 1538 if (!__vfio_pci_memory_enabled(vdev)) { 1539 ret = VM_FAULT_SIGBUS; 1540 mutex_unlock(&vdev->vma_lock); 1541 goto up_out; 1542 } 1543 1544 if (__vfio_pci_add_vma(vdev, vma)) { 1545 ret = VM_FAULT_OOM; 1546 mutex_unlock(&vdev->vma_lock); 1547 goto up_out; 1548 } 1549 1550 mutex_unlock(&vdev->vma_lock); 1551 1552 if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 1553 vma->vm_end - vma->vm_start, vma->vm_page_prot)) 1554 ret = VM_FAULT_SIGBUS; 1555 1556 up_out: 1557 up_read(&vdev->memory_lock); 1558 return ret; 1559 } 1560 1561 static const struct vm_operations_struct vfio_pci_mmap_ops = { 1562 .open = vfio_pci_mmap_open, 1563 .close = vfio_pci_mmap_close, 1564 .fault = vfio_pci_mmap_fault, 1565 }; 1566 1567 static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) 1568 { 1569 struct vfio_pci_device *vdev = device_data; 1570 struct pci_dev *pdev = vdev->pdev; 1571 unsigned int index; 1572 u64 phys_len, req_len, pgoff, req_start; 1573 int ret; 1574 1575 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 1576 1577 if (vma->vm_end < vma->vm_start) 1578 return -EINVAL; 1579 if ((vma->vm_flags & VM_SHARED) == 0) 1580 return -EINVAL; 1581 if (index >= VFIO_PCI_NUM_REGIONS) { 1582 int regnum = index - VFIO_PCI_NUM_REGIONS; 1583 struct vfio_pci_region *region = vdev->region + regnum; 1584 1585 if (region && region->ops && region->ops->mmap && 1586 (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) 1587 return region->ops->mmap(vdev, region, vma); 1588 return -EINVAL; 1589 } 1590 if (index >= VFIO_PCI_ROM_REGION_INDEX) 1591 return -EINVAL; 1592 if (!vdev->bar_mmap_supported[index]) 1593 return -EINVAL; 1594 1595 phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); 1596 req_len = vma->vm_end - vma->vm_start; 1597 pgoff = vma->vm_pgoff & 1598 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 1599 req_start = pgoff << PAGE_SHIFT; 1600 1601 if (req_start + req_len > phys_len) 1602 return -EINVAL; 1603 1604 /* 1605 * Even though we don't make use of the barmap for the mmap, 1606 * we need to request the region and the barmap tracks that. 
1607 */ 1608 if (!vdev->barmap[index]) { 1609 ret = pci_request_selected_regions(pdev, 1610 1 << index, "vfio-pci"); 1611 if (ret) 1612 return ret; 1613 1614 vdev->barmap[index] = pci_iomap(pdev, index, 0); 1615 if (!vdev->barmap[index]) { 1616 pci_release_selected_regions(pdev, 1 << index); 1617 return -ENOMEM; 1618 } 1619 } 1620 1621 vma->vm_private_data = vdev; 1622 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1623 vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 1624 1625 /* 1626 * See remap_pfn_range(), called from vfio_pci_fault() but we can't 1627 * change vm_flags within the fault handler. Set them now. 1628 */ 1629 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1630 vma->vm_ops = &vfio_pci_mmap_ops; 1631 1632 return 0; 1633 } 1634 1635 static void vfio_pci_request(void *device_data, unsigned int count) 1636 { 1637 struct vfio_pci_device *vdev = device_data; 1638 struct pci_dev *pdev = vdev->pdev; 1639 1640 mutex_lock(&vdev->igate); 1641 1642 if (vdev->req_trigger) { 1643 if (!(count % 10)) 1644 pci_notice_ratelimited(pdev, 1645 "Relaying device request to user (#%u)\n", 1646 count); 1647 eventfd_signal(vdev->req_trigger, 1); 1648 } else if (count == 0) { 1649 pci_warn(pdev, 1650 "No device request channel registered, blocked until released by user\n"); 1651 } 1652 1653 mutex_unlock(&vdev->igate); 1654 } 1655 1656 static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, 1657 bool vf_token, uuid_t *uuid) 1658 { 1659 /* 1660 * There's always some degree of trust or collaboration between SR-IOV 1661 * PF and VFs, even if just that the PF hosts the SR-IOV capability and 1662 * can disrupt VFs with a reset, but often the PF has more explicit 1663 * access to deny service to the VF or access data passed through the 1664 * VF. We therefore require an opt-in via a shared VF token (UUID) to 1665 * represent this trust. This both prevents that a VF driver might 1666 * assume the PF driver is a trusted, in-kernel driver, and also that 1667 * a PF driver might be replaced with a rogue driver, unknown to in-use 1668 * VF drivers. 1669 * 1670 * Therefore when presented with a VF, if the PF is a vfio device and 1671 * it is bound to the vfio-pci driver, the user needs to provide a VF 1672 * token to access the device, in the form of appending a vf_token to 1673 * the device name, for example: 1674 * 1675 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" 1676 * 1677 * When presented with a PF which has VFs in use, the user must also 1678 * provide the current VF token to prove collaboration with existing 1679 * VF users. If VFs are not in use, the VF token provided for the PF 1680 * device will act to set the VF token. 1681 * 1682 * If the VF token is provided but unused, an error is generated. 
1683 */ 1684 if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) 1685 return 0; /* No VF token provided or required */ 1686 1687 if (vdev->pdev->is_virtfn) { 1688 struct vfio_device *pf_dev; 1689 struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); 1690 bool match; 1691 1692 if (!pf_vdev) { 1693 if (!vf_token) 1694 return 0; /* PF is not vfio-pci, no VF token */ 1695 1696 pci_info_ratelimited(vdev->pdev, 1697 "VF token incorrectly provided, PF not bound to vfio-pci\n"); 1698 return -EINVAL; 1699 } 1700 1701 if (!vf_token) { 1702 vfio_device_put(pf_dev); 1703 pci_info_ratelimited(vdev->pdev, 1704 "VF token required to access device\n"); 1705 return -EACCES; 1706 } 1707 1708 mutex_lock(&pf_vdev->vf_token->lock); 1709 match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); 1710 mutex_unlock(&pf_vdev->vf_token->lock); 1711 1712 vfio_device_put(pf_dev); 1713 1714 if (!match) { 1715 pci_info_ratelimited(vdev->pdev, 1716 "Incorrect VF token provided for device\n"); 1717 return -EACCES; 1718 } 1719 } else if (vdev->vf_token) { 1720 mutex_lock(&vdev->vf_token->lock); 1721 if (vdev->vf_token->users) { 1722 if (!vf_token) { 1723 mutex_unlock(&vdev->vf_token->lock); 1724 pci_info_ratelimited(vdev->pdev, 1725 "VF token required to access device\n"); 1726 return -EACCES; 1727 } 1728 1729 if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { 1730 mutex_unlock(&vdev->vf_token->lock); 1731 pci_info_ratelimited(vdev->pdev, 1732 "Incorrect VF token provided for device\n"); 1733 return -EACCES; 1734 } 1735 } else if (vf_token) { 1736 uuid_copy(&vdev->vf_token->uuid, uuid); 1737 } 1738 1739 mutex_unlock(&vdev->vf_token->lock); 1740 } else if (vf_token) { 1741 pci_info_ratelimited(vdev->pdev, 1742 "VF token incorrectly provided, not a PF or VF\n"); 1743 return -EINVAL; 1744 } 1745 1746 return 0; 1747 } 1748 1749 #define VF_TOKEN_ARG "vf_token=" 1750 1751 static int vfio_pci_match(void *device_data, char *buf) 1752 { 1753 struct vfio_pci_device *vdev = device_data; 1754 bool vf_token = false; 1755 uuid_t uuid; 1756 int ret; 1757 1758 if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) 1759 return 0; /* No match */ 1760 1761 if (strlen(buf) > strlen(pci_name(vdev->pdev))) { 1762 buf += strlen(pci_name(vdev->pdev)); 1763 1764 if (*buf != ' ') 1765 return 0; /* No match: non-whitespace after name */ 1766 1767 while (*buf) { 1768 if (*buf == ' ') { 1769 buf++; 1770 continue; 1771 } 1772 1773 if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, 1774 strlen(VF_TOKEN_ARG))) { 1775 buf += strlen(VF_TOKEN_ARG); 1776 1777 if (strlen(buf) < UUID_STRING_LEN) 1778 return -EINVAL; 1779 1780 ret = uuid_parse(buf, &uuid); 1781 if (ret) 1782 return ret; 1783 1784 vf_token = true; 1785 buf += UUID_STRING_LEN; 1786 } else { 1787 /* Unknown/duplicate option */ 1788 return -EINVAL; 1789 } 1790 } 1791 } 1792 1793 ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); 1794 if (ret) 1795 return ret; 1796 1797 return 1; /* Match */ 1798 } 1799 1800 static const struct vfio_device_ops vfio_pci_ops = { 1801 .name = "vfio-pci", 1802 .open = vfio_pci_open, 1803 .release = vfio_pci_release, 1804 .ioctl = vfio_pci_ioctl, 1805 .read = vfio_pci_read, 1806 .write = vfio_pci_write, 1807 .mmap = vfio_pci_mmap, 1808 .request = vfio_pci_request, 1809 .match = vfio_pci_match, 1810 }; 1811 1812 static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); 1813 static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); 1814 static struct pci_driver vfio_pci_driver; 1815 1816 static int vfio_pci_bus_notifier(struct 
notifier_block *nb, 1817 unsigned long action, void *data) 1818 { 1819 struct vfio_pci_device *vdev = container_of(nb, 1820 struct vfio_pci_device, nb); 1821 struct device *dev = data; 1822 struct pci_dev *pdev = to_pci_dev(dev); 1823 struct pci_dev *physfn = pci_physfn(pdev); 1824 1825 if (action == BUS_NOTIFY_ADD_DEVICE && 1826 pdev->is_virtfn && physfn == vdev->pdev) { 1827 pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", 1828 pci_name(pdev)); 1829 pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 1830 vfio_pci_ops.name); 1831 } else if (action == BUS_NOTIFY_BOUND_DRIVER && 1832 pdev->is_virtfn && physfn == vdev->pdev) { 1833 struct pci_driver *drv = pci_dev_driver(pdev); 1834 1835 if (drv && drv != &vfio_pci_driver) 1836 pci_warn(vdev->pdev, 1837 "VF %s bound to driver %s while PF bound to vfio-pci\n", 1838 pci_name(pdev), drv->name); 1839 } 1840 1841 return 0; 1842 } 1843 1844 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 1845 { 1846 struct vfio_pci_device *vdev; 1847 struct iommu_group *group; 1848 int ret; 1849 1850 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1851 return -EINVAL; 1852 1853 /* 1854 * Prevent binding to PFs with VFs enabled, the VFs might be in use 1855 * by the host or other users. We cannot capture the VFs if they 1856 * already exist, nor can we track VF users. Disabling SR-IOV here 1857 * would initiate removing the VFs, which would unbind the driver, 1858 * which is prone to blocking if that VF is also in use by vfio-pci. 1859 * Just reject these PFs and let the user sort it out. 1860 */ 1861 if (pci_num_vf(pdev)) { 1862 pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); 1863 return -EBUSY; 1864 } 1865 1866 group = vfio_iommu_group_get(&pdev->dev); 1867 if (!group) 1868 return -EINVAL; 1869 1870 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); 1871 if (!vdev) { 1872 ret = -ENOMEM; 1873 goto out_group_put; 1874 } 1875 1876 vdev->pdev = pdev; 1877 vdev->irq_type = VFIO_PCI_NUM_IRQS; 1878 mutex_init(&vdev->igate); 1879 spin_lock_init(&vdev->irqlock); 1880 mutex_init(&vdev->ioeventfds_lock); 1881 INIT_LIST_HEAD(&vdev->ioeventfds_list); 1882 mutex_init(&vdev->vma_lock); 1883 INIT_LIST_HEAD(&vdev->vma_list); 1884 init_rwsem(&vdev->memory_lock); 1885 1886 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); 1887 if (ret) 1888 goto out_free; 1889 1890 ret = vfio_pci_reflck_attach(vdev); 1891 if (ret) 1892 goto out_del_group_dev; 1893 1894 if (pdev->is_physfn) { 1895 vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); 1896 if (!vdev->vf_token) { 1897 ret = -ENOMEM; 1898 goto out_reflck; 1899 } 1900 1901 mutex_init(&vdev->vf_token->lock); 1902 uuid_gen(&vdev->vf_token->uuid); 1903 1904 vdev->nb.notifier_call = vfio_pci_bus_notifier; 1905 ret = bus_register_notifier(&pci_bus_type, &vdev->nb); 1906 if (ret) 1907 goto out_vf_token; 1908 } 1909 1910 if (vfio_pci_is_vga(pdev)) { 1911 vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); 1912 vga_set_legacy_decoding(pdev, 1913 vfio_pci_set_vga_decode(vdev, false)); 1914 } 1915 1916 vfio_pci_probe_power_state(vdev); 1917 1918 if (!disable_idle_d3) { 1919 /* 1920 * pci-core sets the device power state to an unknown value at 1921 * bootup and after being removed from a driver. The only 1922 * transition it allows from this unknown state is to D0, which 1923 * typically happens when a driver calls pci_enable_device(). 1924 * We're not ready to enable the device yet, but we do want to 1925 * be able to get to D3. 
Therefore first do a D0 transition 1926 * before going to D3. 1927 */ 1928 vfio_pci_set_power_state(vdev, PCI_D0); 1929 vfio_pci_set_power_state(vdev, PCI_D3hot); 1930 } 1931 1932 return ret; 1933 1934 out_vf_token: 1935 kfree(vdev->vf_token); 1936 out_reflck: 1937 vfio_pci_reflck_put(vdev->reflck); 1938 out_del_group_dev: 1939 vfio_del_group_dev(&pdev->dev); 1940 out_free: 1941 kfree(vdev); 1942 out_group_put: 1943 vfio_iommu_group_put(group, &pdev->dev); 1944 return ret; 1945 } 1946 1947 static void vfio_pci_remove(struct pci_dev *pdev) 1948 { 1949 struct vfio_pci_device *vdev; 1950 1951 pci_disable_sriov(pdev); 1952 1953 vdev = vfio_del_group_dev(&pdev->dev); 1954 if (!vdev) 1955 return; 1956 1957 if (vdev->vf_token) { 1958 WARN_ON(vdev->vf_token->users); 1959 mutex_destroy(&vdev->vf_token->lock); 1960 kfree(vdev->vf_token); 1961 } 1962 1963 if (vdev->nb.notifier_call) 1964 bus_unregister_notifier(&pci_bus_type, &vdev->nb); 1965 1966 vfio_pci_reflck_put(vdev->reflck); 1967 1968 vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); 1969 kfree(vdev->region); 1970 mutex_destroy(&vdev->ioeventfds_lock); 1971 1972 if (!disable_idle_d3) 1973 vfio_pci_set_power_state(vdev, PCI_D0); 1974 1975 kfree(vdev->pm_save); 1976 kfree(vdev); 1977 1978 if (vfio_pci_is_vga(pdev)) { 1979 vga_client_register(pdev, NULL, NULL, NULL); 1980 vga_set_legacy_decoding(pdev, 1981 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 1982 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM); 1983 } 1984 } 1985 1986 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, 1987 pci_channel_state_t state) 1988 { 1989 struct vfio_pci_device *vdev; 1990 struct vfio_device *device; 1991 1992 device = vfio_device_get_from_dev(&pdev->dev); 1993 if (device == NULL) 1994 return PCI_ERS_RESULT_DISCONNECT; 1995 1996 vdev = vfio_device_data(device); 1997 if (vdev == NULL) { 1998 vfio_device_put(device); 1999 return PCI_ERS_RESULT_DISCONNECT; 2000 } 2001 2002 mutex_lock(&vdev->igate); 2003 2004 if (vdev->err_trigger) 2005 eventfd_signal(vdev->err_trigger, 1); 2006 2007 mutex_unlock(&vdev->igate); 2008 2009 vfio_device_put(device); 2010 2011 return PCI_ERS_RESULT_CAN_RECOVER; 2012 } 2013 2014 static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) 2015 { 2016 struct vfio_pci_device *vdev; 2017 struct vfio_device *device; 2018 int ret = 0; 2019 2020 might_sleep(); 2021 2022 if (!enable_sriov) 2023 return -ENOENT; 2024 2025 device = vfio_device_get_from_dev(&pdev->dev); 2026 if (!device) 2027 return -ENODEV; 2028 2029 vdev = vfio_device_data(device); 2030 if (!vdev) { 2031 vfio_device_put(device); 2032 return -ENODEV; 2033 } 2034 2035 if (nr_virtfn == 0) 2036 pci_disable_sriov(pdev); 2037 else 2038 ret = pci_enable_sriov(pdev, nr_virtfn); 2039 2040 vfio_device_put(device); 2041 2042 return ret < 0 ? 

static const struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name			= "vfio-pci",
	.id_table		= NULL, /* only dynamic ids */
	.probe			= vfio_pci_probe,
	.remove			= vfio_pci_remove,
	.sriov_configure	= vfio_pci_sriov_configure,
	.err_handler		= &vfio_err_handlers,
};
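
/*
 * reflck: a reference-counted lock shared by all vfio-pci devices in the
 * same bus/slot reset group.  The first device to attach allocates it;
 * later devices on the same bus or slot find and take a reference on the
 * existing one via vfio_pci_reflck_find().  Holding reflck->lock
 * serializes opens against the bus/slot reset attempted in
 * vfio_pci_try_bus_reset() below.
 */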

static DEFINE_MUTEX(reflck_lock);

static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
{
	struct vfio_pci_reflck *reflck;

	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
	if (!reflck)
		return ERR_PTR(-ENOMEM);

	kref_init(&reflck->kref);
	mutex_init(&reflck->lock);

	return reflck;
}

static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
{
	kref_get(&reflck->kref);
}

static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_reflck **preflck = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return 0;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return 0;
	}

	vdev = vfio_device_data(device);

	if (vdev->reflck) {
		vfio_pci_reflck_get(vdev->reflck);
		*preflck = vdev->reflck;
		vfio_device_put(device);
		return 1;
	}

	vfio_device_put(device);
	return 0;
}

static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
{
	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);

	mutex_lock(&reflck_lock);

	if (pci_is_root_bus(vdev->pdev->bus) ||
	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
					  &vdev->reflck, slot) <= 0)
		vdev->reflck = vfio_pci_reflck_alloc();

	mutex_unlock(&reflck_lock);

	return PTR_ERR_OR_ZERO(vdev->reflck);
}

static void vfio_pci_reflck_release(struct kref *kref)
{
	struct vfio_pci_reflck *reflck = container_of(kref,
						      struct vfio_pci_reflck,
						      kref);

	kfree(reflck);
	mutex_unlock(&reflck_lock);
}

static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
{
	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
}

static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = vfio_device_data(device);

	/* Fault if the device is not unused */
	if (vdev->refcnt) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = device;
	return 0;
}
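
/*
 * Same walk as vfio_pci_get_unused_devs(), but instead of requiring each
 * affected device to be unused, it tries to zap and lock the device's
 * mappings via vfio_pci_zap_and_vma_lock(vdev, true) and backs off with
 * -EBUSY on contention rather than risking deadlock.
 */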

static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = vfio_device_data(device);

	/*
	 * Locking multiple devices is prone to deadlock, runaway and
	 * unwind if we hit contention.
	 */
	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = device;
	return 0;
}

/*
 * If a bus or slot reset is available for the provided device and:
 *  - All of the devices affected by that bus or slot reset are unused
 *    (!refcnt)
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset.  Callers are required
 * to hold vdev->reflck->lock, protecting the bus/slot reset group from
 * concurrent opens.  A vfio_device reference is acquired for each device
 * to prevent unbinds during the reset operation.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport.  Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
	bool slot = false;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					  &i, slot) || !i)
		return;

	devs.max_index = i;
	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
	if (!devs.devices)
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_get_unused_devs,
					  &devs, slot))
		goto put_devs;

	/* Does at least one need a reset? */
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
		if (tmp->needs_reset) {
			ret = pci_reset_bus(vdev->pdev);
			break;
		}
	}

put_devs:
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);

		/*
		 * If reset was successful, affected devices no longer need
		 * a reset and we should return all the collateral devices
		 * to low power.  If not successful, we either didn't reset
		 * the bus or timed out waiting for it, so let's not touch
		 * the power state.
		 */
		if (!ret) {
			tmp->needs_reset = false;

			if (tmp != vdev && !disable_idle_d3)
				vfio_pci_set_power_state(tmp, PCI_D3hot);
		}

		vfio_device_put(devs.devices[i]);
	}

	kfree(devs.devices);
}

static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_uninit_perm_bits();
}

static void __init vfio_pci_fill_ids(void)
{
	char *p, *id;
	int rc;

	/* no ids passed actually */
	if (ids[0] == '\0')
		return;

	/* add ids specified in the module parameter */
	p = ids;
	while ((id = strsep(&p, ","))) {
		unsigned int vendor, device, subvendor = PCI_ANY_ID,
			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
		int fields;

		if (!strlen(id))
			continue;

		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
				&vendor, &device, &subvendor, &subdevice,
				&class, &class_mask);

		if (fields < 2) {
			pr_warn("invalid id string \"%s\"\n", id);
			continue;
		}

		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
				   subvendor, subdevice, class, class_mask, 0);
		if (rc)
			pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
				vendor, device, subvendor, subdevice,
				class, class_mask, rc);
		else
			pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
				vendor, device, subvendor, subdevice,
				class, class_mask);
	}
}

static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	vfio_pci_fill_ids();

	return 0;

out_driver:
	vfio_pci_uninit_perm_bits();
	return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
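
/*
 * Usage sketch (the IDs below are placeholders, not recommendations):
 * devices can be claimed at module load time in the format parsed by
 * vfio_pci_fill_ids() above, e.g.
 *
 *   # modprobe vfio-pci ids=10de:1b80,10de:10f0
 *
 * which is broadly equivalent to writing each vendor/device pair to the
 * driver's new_id sysfs interface after loading, since both paths end up
 * in pci_add_dynid().
 */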