// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/aperture.h>
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>

#include <linux/vfio_pci_core.h>

#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"core driver for VFIO based PCI devices"

static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;

/* List of PF's that vfio_pci_core_sriov_configure() has been called on */
static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
static LIST_HEAD(vfio_pci_sriov_pfs);

static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}

/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself.  However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
{
	struct pci_dev *tmp = NULL;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}

static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
{
	struct resource *res;
	int i;
	struct vfio_pci_dummy_resource *dummy_res;

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		int bar = i + PCI_STD_RESOURCES;

		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;

		if (!(res->flags & IORESOURCE_MEM))
			goto no_mmap;

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size.  But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))
			goto no_mmap;

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the otherwise exclusive page, in case a
			 * hot-added device's BAR is later assigned into it.
			 */
			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
			if (dummy_res == NULL)
				goto no_mmap;

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
						&dummy_res->resource)) {
				kfree(dummy_res);
				goto no_mmap;
			}
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
					&vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}
		/*
		 * We don't handle the case where the BAR is not page
		 * aligned because we can't expect the BAR to be
		 * assigned to the same location within a page in the
		 * guest when we pass it through.  It is also hard to
		 * access such a BAR from userspace because we have no
		 * way to learn the BAR's offset within the page.
		 */
no_mmap:
		vdev->bar_mmap_supported[bar] = false;
	}
}

struct vfio_pci_group_info;
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
				      struct vfio_pci_group_info *groups);

/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking to be set and require an exclusive interrupt.
 * However, since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1572:
		case 0x1574:
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:
		/* X550 */
		case 0x1563:
			return true;
		default:
			return false;
		}
	}

	return false;
}

static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	u16 pmcsr;

	if (!pdev->pm_cap)
		return;

	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);

	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
}

/*
 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
 * restore when returned to D0.  Saved separately from pci_saved_state for use
 * by PM capability emulation and separately from pci_dev internal saved state
 * to avoid it being overwritten and consumed around other resets.
 */
int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
{
	struct pci_dev *pdev = vdev->pdev;
	bool needs_restore = false, needs_save = false;
	int ret;

	/* Prevent changing power state for PFs with VFs enabled */
	if (pci_num_vf(pdev) && state > PCI_D0)
		return -EBUSY;

	if (vdev->needs_pm_restore) {
		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
			pci_save_state(pdev);
			needs_save = true;
		}

		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
			needs_restore = true;
	}

	ret = pci_set_power_state(pdev, state);

	if (!ret) {
		/* D3 might be unsupported via quirk, skip unless in D3 */
		if (needs_save && pdev->current_state >= PCI_D3hot) {
			/*
			 * The current PCI state will be saved locally in
			 * 'pm_save' during the D3hot transition.  When the
			 * device state is changed to D0 again with the current
			 * function, then pci_load_and_free_saved_state() will
			 * restore the state and will free the memory pointed
			 * to by 'pm_save'.  There are a few cases where the
			 * PCI power state can be changed to D0 without the
			 * involvement of the driver.  For these cases, free
			 * the earlier allocated memory first before
			 * overwriting 'pm_save' to prevent a memory leak.
			 */
			kfree(vdev->pm_save);
			vdev->pm_save = pci_store_saved_state(pdev);
		} else if (needs_restore) {
			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
			pci_restore_state(pdev);
		}
	}

	return ret;
}

/*
 * The dev_pm_ops needs to be provided to make pci-driver runtime PM work,
 * so use a structure without any callbacks.
 *
 * The pci-driver core runtime PM routines always save the device state
 * before going into a suspended state.  If the device is going into a low
 * power state with only the runtime PM ops, then no explicit handling is
 * needed for devices which have NoSoftRst-.
271 */ 272 static const struct dev_pm_ops vfio_pci_core_pm_ops = { }; 273 274 int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) 275 { 276 struct pci_dev *pdev = vdev->pdev; 277 int ret; 278 u16 cmd; 279 u8 msix_pos; 280 281 if (!disable_idle_d3) { 282 ret = pm_runtime_resume_and_get(&pdev->dev); 283 if (ret < 0) 284 return ret; 285 } 286 287 /* Don't allow our initial saved state to include busmaster */ 288 pci_clear_master(pdev); 289 290 ret = pci_enable_device(pdev); 291 if (ret) 292 goto out_power; 293 294 /* If reset fails because of the device lock, fail this path entirely */ 295 ret = pci_try_reset_function(pdev); 296 if (ret == -EAGAIN) 297 goto out_disable_device; 298 299 vdev->reset_works = !ret; 300 pci_save_state(pdev); 301 vdev->pci_saved_state = pci_store_saved_state(pdev); 302 if (!vdev->pci_saved_state) 303 pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); 304 305 if (likely(!nointxmask)) { 306 if (vfio_pci_nointx(pdev)) { 307 pci_info(pdev, "Masking broken INTx support\n"); 308 vdev->nointx = true; 309 pci_intx(pdev, 0); 310 } else 311 vdev->pci_2_3 = pci_intx_mask_supported(pdev); 312 } 313 314 pci_read_config_word(pdev, PCI_COMMAND, &cmd); 315 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 316 cmd &= ~PCI_COMMAND_INTX_DISABLE; 317 pci_write_config_word(pdev, PCI_COMMAND, cmd); 318 } 319 320 ret = vfio_config_init(vdev); 321 if (ret) 322 goto out_free_state; 323 324 msix_pos = pdev->msix_cap; 325 if (msix_pos) { 326 u16 flags; 327 u32 table; 328 329 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 330 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 331 332 vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 333 vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 334 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 335 } else 336 vdev->msix_bar = 0xFF; 337 338 if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 339 vdev->has_vga = true; 340 341 342 return 0; 343 344 out_free_state: 345 kfree(vdev->pci_saved_state); 346 vdev->pci_saved_state = NULL; 347 out_disable_device: 348 pci_disable_device(pdev); 349 out_power: 350 if (!disable_idle_d3) 351 pm_runtime_put(&pdev->dev); 352 return ret; 353 } 354 EXPORT_SYMBOL_GPL(vfio_pci_core_enable); 355 356 void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) 357 { 358 struct pci_dev *pdev = vdev->pdev; 359 struct vfio_pci_dummy_resource *dummy_res, *tmp; 360 struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; 361 int i, bar; 362 363 /* For needs_reset */ 364 lockdep_assert_held(&vdev->vdev.dev_set->lock); 365 366 /* 367 * This function can be invoked while the power state is non-D0. 368 * This function calls __pci_reset_function_locked() which internally 369 * can use pci_pm_reset() for the function reset. pci_pm_reset() will 370 * fail if the power state is non-D0. Also, for the devices which 371 * have NoSoftRst-, the reset function can cause the PCI config space 372 * reset without restoring the original state (saved locally in 373 * 'vdev->pm_save'). 
 */
	vfio_pci_set_power_state(vdev, PCI_D0);

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	/* Device closed, don't need mutex here */
	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
				 &vdev->ioeventfds_list, next) {
		vfio_virqfd_disable(&ioeventfd->virqfd);
		list_del(&ioeventfd->next);
		kfree(ioeventfd);
	}
	vdev->ioeventfds_nr = 0;

	vdev->virq_disabled = false;

	for (i = 0; i < vdev->num_regions; i++)
		vdev->region[i].ops->release(vdev, &vdev->region[i]);

	vdev->num_regions = 0;
	kfree(vdev->region);
	vdev->region = NULL; /* don't krealloc a freed pointer */

	vfio_config_free(vdev);

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		bar = i + PCI_STD_RESOURCES;
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	list_for_each_entry_safe(dummy_res, tmp,
				 &vdev->dummy_resources_list, res_next) {
		list_del(&dummy_res->res_next);
		release_resource(&dummy_res->resource);
		kfree(dummy_res);
	}

	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function().
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to get the locks ourselves to prevent a deadlock.  The
	 * success of this is dependent on being able to lock the device,
	 * which is not always possible.
	 * We cannot use the "try" reset interface here, which will
	 * overwrite the previously restored configuration information.
449 */ 450 if (vdev->reset_works && pci_dev_trylock(pdev)) { 451 if (!__pci_reset_function_locked(pdev)) 452 vdev->needs_reset = false; 453 pci_dev_unlock(pdev); 454 } 455 456 pci_restore_state(pdev); 457 out: 458 pci_disable_device(pdev); 459 460 vfio_pci_dev_set_try_reset(vdev->vdev.dev_set); 461 462 /* Put the pm-runtime usage counter acquired during enable */ 463 if (!disable_idle_d3) 464 pm_runtime_put(&pdev->dev); 465 } 466 EXPORT_SYMBOL_GPL(vfio_pci_core_disable); 467 468 void vfio_pci_core_close_device(struct vfio_device *core_vdev) 469 { 470 struct vfio_pci_core_device *vdev = 471 container_of(core_vdev, struct vfio_pci_core_device, vdev); 472 473 if (vdev->sriov_pf_core_dev) { 474 mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); 475 WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users); 476 vdev->sriov_pf_core_dev->vf_token->users--; 477 mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); 478 } 479 vfio_spapr_pci_eeh_release(vdev->pdev); 480 vfio_pci_core_disable(vdev); 481 482 mutex_lock(&vdev->igate); 483 if (vdev->err_trigger) { 484 eventfd_ctx_put(vdev->err_trigger); 485 vdev->err_trigger = NULL; 486 } 487 if (vdev->req_trigger) { 488 eventfd_ctx_put(vdev->req_trigger); 489 vdev->req_trigger = NULL; 490 } 491 mutex_unlock(&vdev->igate); 492 } 493 EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); 494 495 void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) 496 { 497 vfio_pci_probe_mmaps(vdev); 498 vfio_spapr_pci_eeh_open(vdev->pdev); 499 500 if (vdev->sriov_pf_core_dev) { 501 mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); 502 vdev->sriov_pf_core_dev->vf_token->users++; 503 mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); 504 } 505 } 506 EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); 507 508 static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) 509 { 510 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 511 u8 pin; 512 513 if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 514 vdev->nointx || vdev->pdev->is_virtfn) 515 return 0; 516 517 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 518 519 return pin ? 
1 : 0; 520 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 521 u8 pos; 522 u16 flags; 523 524 pos = vdev->pdev->msi_cap; 525 if (pos) { 526 pci_read_config_word(vdev->pdev, 527 pos + PCI_MSI_FLAGS, &flags); 528 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 529 } 530 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 531 u8 pos; 532 u16 flags; 533 534 pos = vdev->pdev->msix_cap; 535 if (pos) { 536 pci_read_config_word(vdev->pdev, 537 pos + PCI_MSIX_FLAGS, &flags); 538 539 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 540 } 541 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 542 if (pci_is_pcie(vdev->pdev)) 543 return 1; 544 } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 545 return 1; 546 } 547 548 return 0; 549 } 550 551 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 552 { 553 (*(int *)data)++; 554 return 0; 555 } 556 557 struct vfio_pci_fill_info { 558 int max; 559 int cur; 560 struct vfio_pci_dependent_device *devices; 561 }; 562 563 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 564 { 565 struct vfio_pci_fill_info *fill = data; 566 struct iommu_group *iommu_group; 567 568 if (fill->cur == fill->max) 569 return -EAGAIN; /* Something changed, try again */ 570 571 iommu_group = iommu_group_get(&pdev->dev); 572 if (!iommu_group) 573 return -EPERM; /* Cannot reset non-isolated devices */ 574 575 fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 576 fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 577 fill->devices[fill->cur].bus = pdev->bus->number; 578 fill->devices[fill->cur].devfn = pdev->devfn; 579 fill->cur++; 580 iommu_group_put(iommu_group); 581 return 0; 582 } 583 584 struct vfio_pci_group_info { 585 int count; 586 struct file **files; 587 }; 588 589 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 590 { 591 for (; pdev; pdev = pdev->bus->self) 592 if (pdev->bus == slot->bus) 593 return (pdev->slot == slot); 594 return false; 595 } 596 597 struct vfio_pci_walk_info { 598 int (*fn)(struct pci_dev *pdev, void *data); 599 void *data; 600 struct pci_dev *pdev; 601 bool slot; 602 int ret; 603 }; 604 605 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 606 { 607 struct vfio_pci_walk_info *walk = data; 608 609 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 610 walk->ret = walk->fn(pdev, walk->data); 611 612 return walk->ret; 613 } 614 615 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 616 int (*fn)(struct pci_dev *, 617 void *data), void *data, 618 bool slot) 619 { 620 struct vfio_pci_walk_info walk = { 621 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 622 }; 623 624 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 625 626 return walk.ret; 627 } 628 629 static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, 630 struct vfio_info_cap *caps) 631 { 632 struct vfio_info_cap_header header = { 633 .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, 634 .version = 1 635 }; 636 637 return vfio_info_add_capability(caps, &header, sizeof(header)); 638 } 639 640 int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, 641 unsigned int type, unsigned int subtype, 642 const struct vfio_pci_regops *ops, 643 size_t size, u32 flags, void *data) 644 { 645 struct vfio_pci_region *region; 646 647 region = krealloc(vdev->region, 648 (vdev->num_regions + 1) * sizeof(*region), 649 GFP_KERNEL); 650 if (!region) 651 return -ENOMEM; 652 653 vdev->region = region; 654 vdev->region[vdev->num_regions].type = type; 655 
vdev->region[vdev->num_regions].subtype = subtype; 656 vdev->region[vdev->num_regions].ops = ops; 657 vdev->region[vdev->num_regions].size = size; 658 vdev->region[vdev->num_regions].flags = flags; 659 vdev->region[vdev->num_regions].data = data; 660 661 vdev->num_regions++; 662 663 return 0; 664 } 665 EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); 666 667 long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, 668 unsigned long arg) 669 { 670 struct vfio_pci_core_device *vdev = 671 container_of(core_vdev, struct vfio_pci_core_device, vdev); 672 unsigned long minsz; 673 674 if (cmd == VFIO_DEVICE_GET_INFO) { 675 struct vfio_device_info info; 676 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 677 unsigned long capsz; 678 int ret; 679 680 minsz = offsetofend(struct vfio_device_info, num_irqs); 681 682 /* For backward compatibility, cannot require this */ 683 capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); 684 685 if (copy_from_user(&info, (void __user *)arg, minsz)) 686 return -EFAULT; 687 688 if (info.argsz < minsz) 689 return -EINVAL; 690 691 if (info.argsz >= capsz) { 692 minsz = capsz; 693 info.cap_offset = 0; 694 } 695 696 info.flags = VFIO_DEVICE_FLAGS_PCI; 697 698 if (vdev->reset_works) 699 info.flags |= VFIO_DEVICE_FLAGS_RESET; 700 701 info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 702 info.num_irqs = VFIO_PCI_NUM_IRQS; 703 704 ret = vfio_pci_info_zdev_add_caps(vdev, &caps); 705 if (ret && ret != -ENODEV) { 706 pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); 707 return ret; 708 } 709 710 if (caps.size) { 711 info.flags |= VFIO_DEVICE_FLAGS_CAPS; 712 if (info.argsz < sizeof(info) + caps.size) { 713 info.argsz = sizeof(info) + caps.size; 714 } else { 715 vfio_info_cap_shift(&caps, sizeof(info)); 716 if (copy_to_user((void __user *)arg + 717 sizeof(info), caps.buf, 718 caps.size)) { 719 kfree(caps.buf); 720 return -EFAULT; 721 } 722 info.cap_offset = sizeof(info); 723 } 724 725 kfree(caps.buf); 726 } 727 728 return copy_to_user((void __user *)arg, &info, minsz) ? 729 -EFAULT : 0; 730 731 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 732 struct pci_dev *pdev = vdev->pdev; 733 struct vfio_region_info info; 734 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 735 int i, ret; 736 737 minsz = offsetofend(struct vfio_region_info, offset); 738 739 if (copy_from_user(&info, (void __user *)arg, minsz)) 740 return -EFAULT; 741 742 if (info.argsz < minsz) 743 return -EINVAL; 744 745 switch (info.index) { 746 case VFIO_PCI_CONFIG_REGION_INDEX: 747 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 748 info.size = pdev->cfg_size; 749 info.flags = VFIO_REGION_INFO_FLAG_READ | 750 VFIO_REGION_INFO_FLAG_WRITE; 751 break; 752 case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: 753 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 754 info.size = pci_resource_len(pdev, info.index); 755 if (!info.size) { 756 info.flags = 0; 757 break; 758 } 759 760 info.flags = VFIO_REGION_INFO_FLAG_READ | 761 VFIO_REGION_INFO_FLAG_WRITE; 762 if (vdev->bar_mmap_supported[info.index]) { 763 info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 764 if (info.index == vdev->msix_bar) { 765 ret = msix_mmappable_cap(vdev, &caps); 766 if (ret) 767 return ret; 768 } 769 } 770 771 break; 772 case VFIO_PCI_ROM_REGION_INDEX: 773 { 774 void __iomem *io; 775 size_t size; 776 u16 cmd; 777 778 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 779 info.flags = 0; 780 781 /* Report the BAR size, not the ROM size */ 782 info.size = pci_resource_len(pdev, info.index); 783 if (!info.size) { 784 /* Shadow ROMs appear as PCI option ROMs */ 785 if (pdev->resource[PCI_ROM_RESOURCE].flags & 786 IORESOURCE_ROM_SHADOW) 787 info.size = 0x20000; 788 else 789 break; 790 } 791 792 /* 793 * Is it really there? Enable memory decode for 794 * implicit access in pci_map_rom(). 795 */ 796 cmd = vfio_pci_memory_lock_and_enable(vdev); 797 io = pci_map_rom(pdev, &size); 798 if (io) { 799 info.flags = VFIO_REGION_INFO_FLAG_READ; 800 pci_unmap_rom(pdev, io); 801 } else { 802 info.size = 0; 803 } 804 vfio_pci_memory_unlock_and_restore(vdev, cmd); 805 806 break; 807 } 808 case VFIO_PCI_VGA_REGION_INDEX: 809 if (!vdev->has_vga) 810 return -EINVAL; 811 812 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 813 info.size = 0xc0000; 814 info.flags = VFIO_REGION_INFO_FLAG_READ | 815 VFIO_REGION_INFO_FLAG_WRITE; 816 817 break; 818 default: 819 { 820 struct vfio_region_info_cap_type cap_type = { 821 .header.id = VFIO_REGION_INFO_CAP_TYPE, 822 .header.version = 1 }; 823 824 if (info.index >= 825 VFIO_PCI_NUM_REGIONS + vdev->num_regions) 826 return -EINVAL; 827 info.index = array_index_nospec(info.index, 828 VFIO_PCI_NUM_REGIONS + 829 vdev->num_regions); 830 831 i = info.index - VFIO_PCI_NUM_REGIONS; 832 833 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 834 info.size = vdev->region[i].size; 835 info.flags = vdev->region[i].flags; 836 837 cap_type.type = vdev->region[i].type; 838 cap_type.subtype = vdev->region[i].subtype; 839 840 ret = vfio_info_add_capability(&caps, &cap_type.header, 841 sizeof(cap_type)); 842 if (ret) 843 return ret; 844 845 if (vdev->region[i].ops->add_capability) { 846 ret = vdev->region[i].ops->add_capability(vdev, 847 &vdev->region[i], &caps); 848 if (ret) 849 return ret; 850 } 851 } 852 } 853 854 if (caps.size) { 855 info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 856 if (info.argsz < sizeof(info) + caps.size) { 857 info.argsz = sizeof(info) + caps.size; 858 info.cap_offset = 0; 859 } else { 860 vfio_info_cap_shift(&caps, sizeof(info)); 861 if (copy_to_user((void __user *)arg + 862 sizeof(info), caps.buf, 863 caps.size)) { 864 kfree(caps.buf); 865 return -EFAULT; 866 } 867 info.cap_offset = sizeof(info); 868 } 869 870 kfree(caps.buf); 871 } 872 873 return copy_to_user((void __user *)arg, &info, minsz) ? 874 -EFAULT : 0; 875 876 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 877 struct vfio_irq_info info; 878 879 minsz = offsetofend(struct vfio_irq_info, count); 880 881 if (copy_from_user(&info, (void __user *)arg, minsz)) 882 return -EFAULT; 883 884 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 885 return -EINVAL; 886 887 switch (info.index) { 888 case VFIO_PCI_INTX_IRQ_INDEX ... 
VFIO_PCI_MSIX_IRQ_INDEX: 889 case VFIO_PCI_REQ_IRQ_INDEX: 890 break; 891 case VFIO_PCI_ERR_IRQ_INDEX: 892 if (pci_is_pcie(vdev->pdev)) 893 break; 894 fallthrough; 895 default: 896 return -EINVAL; 897 } 898 899 info.flags = VFIO_IRQ_INFO_EVENTFD; 900 901 info.count = vfio_pci_get_irq_count(vdev, info.index); 902 903 if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 904 info.flags |= (VFIO_IRQ_INFO_MASKABLE | 905 VFIO_IRQ_INFO_AUTOMASKED); 906 else 907 info.flags |= VFIO_IRQ_INFO_NORESIZE; 908 909 return copy_to_user((void __user *)arg, &info, minsz) ? 910 -EFAULT : 0; 911 912 } else if (cmd == VFIO_DEVICE_SET_IRQS) { 913 struct vfio_irq_set hdr; 914 u8 *data = NULL; 915 int max, ret = 0; 916 size_t data_size = 0; 917 918 minsz = offsetofend(struct vfio_irq_set, count); 919 920 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 921 return -EFAULT; 922 923 max = vfio_pci_get_irq_count(vdev, hdr.index); 924 925 ret = vfio_set_irqs_validate_and_prepare(&hdr, max, 926 VFIO_PCI_NUM_IRQS, &data_size); 927 if (ret) 928 return ret; 929 930 if (data_size) { 931 data = memdup_user((void __user *)(arg + minsz), 932 data_size); 933 if (IS_ERR(data)) 934 return PTR_ERR(data); 935 } 936 937 mutex_lock(&vdev->igate); 938 939 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 940 hdr.start, hdr.count, data); 941 942 mutex_unlock(&vdev->igate); 943 kfree(data); 944 945 return ret; 946 947 } else if (cmd == VFIO_DEVICE_RESET) { 948 int ret; 949 950 if (!vdev->reset_works) 951 return -EINVAL; 952 953 vfio_pci_zap_and_down_write_memory_lock(vdev); 954 955 /* 956 * This function can be invoked while the power state is non-D0. 957 * If pci_try_reset_function() has been called while the power 958 * state is non-D0, then pci_try_reset_function() will 959 * internally set the power state to D0 without vfio driver 960 * involvement. For the devices which have NoSoftRst-, the 961 * reset function can cause the PCI config space reset without 962 * restoring the original state (saved locally in 963 * 'vdev->pm_save'). 964 */ 965 vfio_pci_set_power_state(vdev, PCI_D0); 966 967 ret = pci_try_reset_function(vdev->pdev); 968 up_write(&vdev->memory_lock); 969 970 return ret; 971 972 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 973 struct vfio_pci_hot_reset_info hdr; 974 struct vfio_pci_fill_info fill = { 0 }; 975 struct vfio_pci_dependent_device *devices = NULL; 976 bool slot = false; 977 int ret = 0; 978 979 minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 980 981 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 982 return -EFAULT; 983 984 if (hdr.argsz < minsz) 985 return -EINVAL; 986 987 hdr.flags = 0; 988 989 /* Can we do a slot or bus reset or neither? */ 990 if (!pci_probe_reset_slot(vdev->pdev->slot)) 991 slot = true; 992 else if (pci_probe_reset_bus(vdev->pdev->bus)) 993 return -ENODEV; 994 995 /* How many devices are affected? */ 996 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 997 vfio_pci_count_devs, 998 &fill.max, slot); 999 if (ret) 1000 return ret; 1001 1002 WARN_ON(!fill.max); /* Should always be at least one */ 1003 1004 /* 1005 * If there's enough space, fill it now, otherwise return 1006 * -ENOSPC and the number of devices affected. 
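		 * Userspace can then retry the ioctl with a suitably enlarged buffer.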
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max.  If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		int32_t *group_fds;
		struct file **files;
		struct vfio_pci_group_info info;
		bool slot = false;
		int file_idx, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.flags)
			return -EINVAL;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be.  Note that groups can have multiple devices,
		 * so one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &count, slot);
		if (ret)
			return ret;

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)
			return -EINVAL;

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL);
		if (!group_fds || !files) {
			kfree(group_fds);
			kfree(files);
			return -ENOMEM;
		}

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {
			kfree(group_fds);
			kfree(files);
			return -EFAULT;
		}

		/*
		 * For each group_fd, get the group file through the vfio
		 * external user interface and hold a reference on it.  This
		 * ensures the group is held across the reset.
1102 */ 1103 for (file_idx = 0; file_idx < hdr.count; file_idx++) { 1104 struct file *file = fget(group_fds[file_idx]); 1105 1106 if (!file) { 1107 ret = -EBADF; 1108 break; 1109 } 1110 1111 /* Ensure the FD is a vfio group FD.*/ 1112 if (!vfio_file_iommu_group(file)) { 1113 fput(file); 1114 ret = -EINVAL; 1115 break; 1116 } 1117 1118 files[file_idx] = file; 1119 } 1120 1121 kfree(group_fds); 1122 1123 /* release reference to groups on error */ 1124 if (ret) 1125 goto hot_reset_release; 1126 1127 info.count = hdr.count; 1128 info.files = files; 1129 1130 ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); 1131 1132 hot_reset_release: 1133 for (file_idx--; file_idx >= 0; file_idx--) 1134 fput(files[file_idx]); 1135 1136 kfree(files); 1137 return ret; 1138 } else if (cmd == VFIO_DEVICE_IOEVENTFD) { 1139 struct vfio_device_ioeventfd ioeventfd; 1140 int count; 1141 1142 minsz = offsetofend(struct vfio_device_ioeventfd, fd); 1143 1144 if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) 1145 return -EFAULT; 1146 1147 if (ioeventfd.argsz < minsz) 1148 return -EINVAL; 1149 1150 if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) 1151 return -EINVAL; 1152 1153 count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; 1154 1155 if (hweight8(count) != 1 || ioeventfd.fd < -1) 1156 return -EINVAL; 1157 1158 return vfio_pci_ioeventfd(vdev, ioeventfd.offset, 1159 ioeventfd.data, count, ioeventfd.fd); 1160 } 1161 return -ENOTTY; 1162 } 1163 EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); 1164 1165 static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, 1166 void __user *arg, size_t argsz) 1167 { 1168 struct vfio_pci_core_device *vdev = 1169 container_of(device, struct vfio_pci_core_device, vdev); 1170 uuid_t uuid; 1171 int ret; 1172 1173 if (!vdev->vf_token) 1174 return -ENOTTY; 1175 /* 1176 * We do not support GET of the VF Token UUID as this could 1177 * expose the token of the previous device user. 1178 */ 1179 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 1180 sizeof(uuid)); 1181 if (ret != 1) 1182 return ret; 1183 1184 if (copy_from_user(&uuid, arg, sizeof(uuid))) 1185 return -EFAULT; 1186 1187 mutex_lock(&vdev->vf_token->lock); 1188 uuid_copy(&vdev->vf_token->uuid, &uuid); 1189 mutex_unlock(&vdev->vf_token->lock); 1190 return 0; 1191 } 1192 1193 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, 1194 void __user *arg, size_t argsz) 1195 { 1196 switch (flags & VFIO_DEVICE_FEATURE_MASK) { 1197 case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: 1198 return vfio_pci_core_feature_token(device, flags, arg, argsz); 1199 default: 1200 return -ENOTTY; 1201 } 1202 } 1203 EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature); 1204 1205 static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, 1206 size_t count, loff_t *ppos, bool iswrite) 1207 { 1208 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1209 1210 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1211 return -EINVAL; 1212 1213 switch (index) { 1214 case VFIO_PCI_CONFIG_REGION_INDEX: 1215 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 1216 1217 case VFIO_PCI_ROM_REGION_INDEX: 1218 if (iswrite) 1219 return -EINVAL; 1220 return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 1221 1222 case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: 1223 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 1224 1225 case VFIO_PCI_VGA_REGION_INDEX: 1226 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 1227 default: 1228 index -= VFIO_PCI_NUM_REGIONS; 1229 return vdev->region[index].ops->rw(vdev, buf, 1230 count, ppos, iswrite); 1231 } 1232 1233 return -EINVAL; 1234 } 1235 1236 ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, 1237 size_t count, loff_t *ppos) 1238 { 1239 struct vfio_pci_core_device *vdev = 1240 container_of(core_vdev, struct vfio_pci_core_device, vdev); 1241 1242 if (!count) 1243 return 0; 1244 1245 return vfio_pci_rw(vdev, buf, count, ppos, false); 1246 } 1247 EXPORT_SYMBOL_GPL(vfio_pci_core_read); 1248 1249 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, 1250 size_t count, loff_t *ppos) 1251 { 1252 struct vfio_pci_core_device *vdev = 1253 container_of(core_vdev, struct vfio_pci_core_device, vdev); 1254 1255 if (!count) 1256 return 0; 1257 1258 return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); 1259 } 1260 EXPORT_SYMBOL_GPL(vfio_pci_core_write); 1261 1262 /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ 1263 static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) 1264 { 1265 struct vfio_pci_mmap_vma *mmap_vma, *tmp; 1266 1267 /* 1268 * Lock ordering: 1269 * vma_lock is nested under mmap_lock for vm_ops callback paths. 1270 * The memory_lock semaphore is used by both code paths calling 1271 * into this function to zap vmas and the vm_ops.fault callback 1272 * to protect the memory enable state of the device. 1273 * 1274 * When zapping vmas we need to maintain the mmap_lock => vma_lock 1275 * ordering, which requires using vma_lock to walk vma_list to 1276 * acquire an mm, then dropping vma_lock to get the mmap_lock and 1277 * reacquiring vma_lock. This logic is derived from similar 1278 * requirements in uverbs_user_mmap_disassociate(). 1279 * 1280 * mmap_lock must always be the top-level lock when it is taken. 1281 * Therefore we can only hold the memory_lock write lock when 1282 * vma_list is empty, as we'd need to take mmap_lock to clear 1283 * entries. vma_list can only be guaranteed empty when holding 1284 * vma_lock, thus memory_lock is nested under vma_lock. 1285 * 1286 * This enables the vm_ops.fault callback to acquire vma_lock, 1287 * followed by memory_lock read lock, while already holding 1288 * mmap_lock without risk of deadlock. 
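	 * In short, the resulting lock order is mmap_lock => vma_lock => memory_lock.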
1289 */ 1290 while (1) { 1291 struct mm_struct *mm = NULL; 1292 1293 if (try) { 1294 if (!mutex_trylock(&vdev->vma_lock)) 1295 return 0; 1296 } else { 1297 mutex_lock(&vdev->vma_lock); 1298 } 1299 while (!list_empty(&vdev->vma_list)) { 1300 mmap_vma = list_first_entry(&vdev->vma_list, 1301 struct vfio_pci_mmap_vma, 1302 vma_next); 1303 mm = mmap_vma->vma->vm_mm; 1304 if (mmget_not_zero(mm)) 1305 break; 1306 1307 list_del(&mmap_vma->vma_next); 1308 kfree(mmap_vma); 1309 mm = NULL; 1310 } 1311 if (!mm) 1312 return 1; 1313 mutex_unlock(&vdev->vma_lock); 1314 1315 if (try) { 1316 if (!mmap_read_trylock(mm)) { 1317 mmput(mm); 1318 return 0; 1319 } 1320 } else { 1321 mmap_read_lock(mm); 1322 } 1323 if (try) { 1324 if (!mutex_trylock(&vdev->vma_lock)) { 1325 mmap_read_unlock(mm); 1326 mmput(mm); 1327 return 0; 1328 } 1329 } else { 1330 mutex_lock(&vdev->vma_lock); 1331 } 1332 list_for_each_entry_safe(mmap_vma, tmp, 1333 &vdev->vma_list, vma_next) { 1334 struct vm_area_struct *vma = mmap_vma->vma; 1335 1336 if (vma->vm_mm != mm) 1337 continue; 1338 1339 list_del(&mmap_vma->vma_next); 1340 kfree(mmap_vma); 1341 1342 zap_vma_ptes(vma, vma->vm_start, 1343 vma->vm_end - vma->vm_start); 1344 } 1345 mutex_unlock(&vdev->vma_lock); 1346 mmap_read_unlock(mm); 1347 mmput(mm); 1348 } 1349 } 1350 1351 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) 1352 { 1353 vfio_pci_zap_and_vma_lock(vdev, false); 1354 down_write(&vdev->memory_lock); 1355 mutex_unlock(&vdev->vma_lock); 1356 } 1357 1358 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) 1359 { 1360 u16 cmd; 1361 1362 down_write(&vdev->memory_lock); 1363 pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); 1364 if (!(cmd & PCI_COMMAND_MEMORY)) 1365 pci_write_config_word(vdev->pdev, PCI_COMMAND, 1366 cmd | PCI_COMMAND_MEMORY); 1367 1368 return cmd; 1369 } 1370 1371 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd) 1372 { 1373 pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); 1374 up_write(&vdev->memory_lock); 1375 } 1376 1377 /* Caller holds vma_lock */ 1378 static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, 1379 struct vm_area_struct *vma) 1380 { 1381 struct vfio_pci_mmap_vma *mmap_vma; 1382 1383 mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); 1384 if (!mmap_vma) 1385 return -ENOMEM; 1386 1387 mmap_vma->vma = vma; 1388 list_add(&mmap_vma->vma_next, &vdev->vma_list); 1389 1390 return 0; 1391 } 1392 1393 /* 1394 * Zap mmaps on open so that we can fault them in on access and therefore 1395 * our vma_list only tracks mappings accessed since last zap. 
1396 */ 1397 static void vfio_pci_mmap_open(struct vm_area_struct *vma) 1398 { 1399 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1400 } 1401 1402 static void vfio_pci_mmap_close(struct vm_area_struct *vma) 1403 { 1404 struct vfio_pci_core_device *vdev = vma->vm_private_data; 1405 struct vfio_pci_mmap_vma *mmap_vma; 1406 1407 mutex_lock(&vdev->vma_lock); 1408 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1409 if (mmap_vma->vma == vma) { 1410 list_del(&mmap_vma->vma_next); 1411 kfree(mmap_vma); 1412 break; 1413 } 1414 } 1415 mutex_unlock(&vdev->vma_lock); 1416 } 1417 1418 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) 1419 { 1420 struct vm_area_struct *vma = vmf->vma; 1421 struct vfio_pci_core_device *vdev = vma->vm_private_data; 1422 struct vfio_pci_mmap_vma *mmap_vma; 1423 vm_fault_t ret = VM_FAULT_NOPAGE; 1424 1425 mutex_lock(&vdev->vma_lock); 1426 down_read(&vdev->memory_lock); 1427 1428 if (!__vfio_pci_memory_enabled(vdev)) { 1429 ret = VM_FAULT_SIGBUS; 1430 goto up_out; 1431 } 1432 1433 /* 1434 * We populate the whole vma on fault, so we need to test whether 1435 * the vma has already been mapped, such as for concurrent faults 1436 * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if 1437 * we ask it to fill the same range again. 1438 */ 1439 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1440 if (mmap_vma->vma == vma) 1441 goto up_out; 1442 } 1443 1444 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 1445 vma->vm_end - vma->vm_start, 1446 vma->vm_page_prot)) { 1447 ret = VM_FAULT_SIGBUS; 1448 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1449 goto up_out; 1450 } 1451 1452 if (__vfio_pci_add_vma(vdev, vma)) { 1453 ret = VM_FAULT_OOM; 1454 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1455 } 1456 1457 up_out: 1458 up_read(&vdev->memory_lock); 1459 mutex_unlock(&vdev->vma_lock); 1460 return ret; 1461 } 1462 1463 static const struct vm_operations_struct vfio_pci_mmap_ops = { 1464 .open = vfio_pci_mmap_open, 1465 .close = vfio_pci_mmap_close, 1466 .fault = vfio_pci_mmap_fault, 1467 }; 1468 1469 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) 1470 { 1471 struct vfio_pci_core_device *vdev = 1472 container_of(core_vdev, struct vfio_pci_core_device, vdev); 1473 struct pci_dev *pdev = vdev->pdev; 1474 unsigned int index; 1475 u64 phys_len, req_len, pgoff, req_start; 1476 int ret; 1477 1478 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 1479 1480 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1481 return -EINVAL; 1482 if (vma->vm_end < vma->vm_start) 1483 return -EINVAL; 1484 if ((vma->vm_flags & VM_SHARED) == 0) 1485 return -EINVAL; 1486 if (index >= VFIO_PCI_NUM_REGIONS) { 1487 int regnum = index - VFIO_PCI_NUM_REGIONS; 1488 struct vfio_pci_region *region = vdev->region + regnum; 1489 1490 if (region->ops && region->ops->mmap && 1491 (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) 1492 return region->ops->mmap(vdev, region, vma); 1493 return -EINVAL; 1494 } 1495 if (index >= VFIO_PCI_ROM_REGION_INDEX) 1496 return -EINVAL; 1497 if (!vdev->bar_mmap_supported[index]) 1498 return -EINVAL; 1499 1500 phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); 1501 req_len = vma->vm_end - vma->vm_start; 1502 pgoff = vma->vm_pgoff & 1503 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 1504 req_start = pgoff << PAGE_SHIFT; 1505 1506 if (req_start + req_len > phys_len) 1507 return -EINVAL; 1508 1509 /* 1510 * Even though we don't make use of 
the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);
			return -ENOMEM;
		}
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	/*
	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
	 * change vm_flags within the fault handler.  Set them now.
	 */
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &vfio_pci_mmap_ops;

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);

void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
			pci_notice_ratelimited(pdev,
				"Relaying device request to user (#%u)\n",
				count);
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
		pci_warn(pdev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);

static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
				      bool vf_token, uuid_t *uuid)
{
	/*
	 * There's always some degree of trust or collaboration between SR-IOV
	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
	 * can disrupt VFs with a reset, but often the PF has more explicit
	 * access to deny service to the VF or access data passed through the
	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
	 * represent this trust.  This both prevents a VF driver from assuming
	 * the PF driver is a trusted, in-kernel driver, and protects against
	 * a PF driver being replaced with a rogue driver unknown to in-use
	 * VF drivers.
	 *
	 * Therefore when presented with a VF, if the PF is a vfio device and
	 * it is bound to the vfio-pci driver, the user needs to provide a VF
	 * token to access the device, in the form of appending a vf_token to
	 * the device name, for example:
	 *
	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
	 *
	 * When presented with a PF which has VFs in use, the user must also
	 * provide the current VF token to prove collaboration with existing
	 * VF users.  If VFs are not in use, the VF token provided for the PF
	 * device will act to set the VF token.
	 *
	 * If the VF token is provided but unused, an error is generated.
1591 */ 1592 if (vdev->pdev->is_virtfn) { 1593 struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev; 1594 bool match; 1595 1596 if (!pf_vdev) { 1597 if (!vf_token) 1598 return 0; /* PF is not vfio-pci, no VF token */ 1599 1600 pci_info_ratelimited(vdev->pdev, 1601 "VF token incorrectly provided, PF not bound to vfio-pci\n"); 1602 return -EINVAL; 1603 } 1604 1605 if (!vf_token) { 1606 pci_info_ratelimited(vdev->pdev, 1607 "VF token required to access device\n"); 1608 return -EACCES; 1609 } 1610 1611 mutex_lock(&pf_vdev->vf_token->lock); 1612 match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); 1613 mutex_unlock(&pf_vdev->vf_token->lock); 1614 1615 if (!match) { 1616 pci_info_ratelimited(vdev->pdev, 1617 "Incorrect VF token provided for device\n"); 1618 return -EACCES; 1619 } 1620 } else if (vdev->vf_token) { 1621 mutex_lock(&vdev->vf_token->lock); 1622 if (vdev->vf_token->users) { 1623 if (!vf_token) { 1624 mutex_unlock(&vdev->vf_token->lock); 1625 pci_info_ratelimited(vdev->pdev, 1626 "VF token required to access device\n"); 1627 return -EACCES; 1628 } 1629 1630 if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { 1631 mutex_unlock(&vdev->vf_token->lock); 1632 pci_info_ratelimited(vdev->pdev, 1633 "Incorrect VF token provided for device\n"); 1634 return -EACCES; 1635 } 1636 } else if (vf_token) { 1637 uuid_copy(&vdev->vf_token->uuid, uuid); 1638 } 1639 1640 mutex_unlock(&vdev->vf_token->lock); 1641 } else if (vf_token) { 1642 pci_info_ratelimited(vdev->pdev, 1643 "VF token incorrectly provided, not a PF or VF\n"); 1644 return -EINVAL; 1645 } 1646 1647 return 0; 1648 } 1649 1650 #define VF_TOKEN_ARG "vf_token=" 1651 1652 int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) 1653 { 1654 struct vfio_pci_core_device *vdev = 1655 container_of(core_vdev, struct vfio_pci_core_device, vdev); 1656 bool vf_token = false; 1657 uuid_t uuid; 1658 int ret; 1659 1660 if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) 1661 return 0; /* No match */ 1662 1663 if (strlen(buf) > strlen(pci_name(vdev->pdev))) { 1664 buf += strlen(pci_name(vdev->pdev)); 1665 1666 if (*buf != ' ') 1667 return 0; /* No match: non-whitespace after name */ 1668 1669 while (*buf) { 1670 if (*buf == ' ') { 1671 buf++; 1672 continue; 1673 } 1674 1675 if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, 1676 strlen(VF_TOKEN_ARG))) { 1677 buf += strlen(VF_TOKEN_ARG); 1678 1679 if (strlen(buf) < UUID_STRING_LEN) 1680 return -EINVAL; 1681 1682 ret = uuid_parse(buf, &uuid); 1683 if (ret) 1684 return ret; 1685 1686 vf_token = true; 1687 buf += UUID_STRING_LEN; 1688 } else { 1689 /* Unknown/duplicate option */ 1690 return -EINVAL; 1691 } 1692 } 1693 } 1694 1695 ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); 1696 if (ret) 1697 return ret; 1698 1699 return 1; /* Match */ 1700 } 1701 EXPORT_SYMBOL_GPL(vfio_pci_core_match); 1702 1703 static int vfio_pci_bus_notifier(struct notifier_block *nb, 1704 unsigned long action, void *data) 1705 { 1706 struct vfio_pci_core_device *vdev = container_of(nb, 1707 struct vfio_pci_core_device, nb); 1708 struct device *dev = data; 1709 struct pci_dev *pdev = to_pci_dev(dev); 1710 struct pci_dev *physfn = pci_physfn(pdev); 1711 1712 if (action == BUS_NOTIFY_ADD_DEVICE && 1713 pdev->is_virtfn && physfn == vdev->pdev) { 1714 pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", 1715 pci_name(pdev)); 1716 pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 1717 vdev->vdev.ops->name); 1718 } else if (action == BUS_NOTIFY_BOUND_DRIVER && 1719 pdev->is_virtfn && physfn == 
vdev->pdev) { 1720 struct pci_driver *drv = pci_dev_driver(pdev); 1721 1722 if (drv && drv != pci_dev_driver(vdev->pdev)) 1723 pci_warn(vdev->pdev, 1724 "VF %s bound to driver %s while PF bound to driver %s\n", 1725 pci_name(pdev), drv->name, 1726 pci_dev_driver(vdev->pdev)->name); 1727 } 1728 1729 return 0; 1730 } 1731 1732 static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev) 1733 { 1734 struct pci_dev *pdev = vdev->pdev; 1735 struct vfio_pci_core_device *cur; 1736 struct pci_dev *physfn; 1737 int ret; 1738 1739 if (pdev->is_virtfn) { 1740 /* 1741 * If this VF was created by our vfio_pci_core_sriov_configure() 1742 * then we can find the PF vfio_pci_core_device now, and due to 1743 * the locking in pci_disable_sriov() it cannot change until 1744 * this VF device driver is removed. 1745 */ 1746 physfn = pci_physfn(vdev->pdev); 1747 mutex_lock(&vfio_pci_sriov_pfs_mutex); 1748 list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) { 1749 if (cur->pdev == physfn) { 1750 vdev->sriov_pf_core_dev = cur; 1751 break; 1752 } 1753 } 1754 mutex_unlock(&vfio_pci_sriov_pfs_mutex); 1755 return 0; 1756 } 1757 1758 /* Not a SRIOV PF */ 1759 if (!pdev->is_physfn) 1760 return 0; 1761 1762 vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); 1763 if (!vdev->vf_token) 1764 return -ENOMEM; 1765 1766 mutex_init(&vdev->vf_token->lock); 1767 uuid_gen(&vdev->vf_token->uuid); 1768 1769 vdev->nb.notifier_call = vfio_pci_bus_notifier; 1770 ret = bus_register_notifier(&pci_bus_type, &vdev->nb); 1771 if (ret) { 1772 kfree(vdev->vf_token); 1773 return ret; 1774 } 1775 return 0; 1776 } 1777 1778 static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev) 1779 { 1780 if (!vdev->vf_token) 1781 return; 1782 1783 bus_unregister_notifier(&pci_bus_type, &vdev->nb); 1784 WARN_ON(vdev->vf_token->users); 1785 mutex_destroy(&vdev->vf_token->lock); 1786 kfree(vdev->vf_token); 1787 } 1788 1789 static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev) 1790 { 1791 struct pci_dev *pdev = vdev->pdev; 1792 int ret; 1793 1794 if (!vfio_pci_is_vga(pdev)) 1795 return 0; 1796 1797 ret = aperture_remove_conflicting_pci_devices(pdev, vdev->vdev.ops->name); 1798 if (ret) 1799 return ret; 1800 1801 ret = vga_client_register(pdev, vfio_pci_set_decode); 1802 if (ret) 1803 return ret; 1804 vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false)); 1805 return 0; 1806 } 1807 1808 static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) 1809 { 1810 struct pci_dev *pdev = vdev->pdev; 1811 1812 if (!vfio_pci_is_vga(pdev)) 1813 return; 1814 vga_client_unregister(pdev); 1815 vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 1816 VGA_RSRC_LEGACY_IO | 1817 VGA_RSRC_LEGACY_MEM); 1818 } 1819 1820 void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, 1821 struct pci_dev *pdev, 1822 const struct vfio_device_ops *vfio_pci_ops) 1823 { 1824 vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); 1825 vdev->pdev = pdev; 1826 vdev->irq_type = VFIO_PCI_NUM_IRQS; 1827 mutex_init(&vdev->igate); 1828 spin_lock_init(&vdev->irqlock); 1829 mutex_init(&vdev->ioeventfds_lock); 1830 INIT_LIST_HEAD(&vdev->dummy_resources_list); 1831 INIT_LIST_HEAD(&vdev->ioeventfds_list); 1832 mutex_init(&vdev->vma_lock); 1833 INIT_LIST_HEAD(&vdev->vma_list); 1834 INIT_LIST_HEAD(&vdev->sriov_pfs_item); 1835 init_rwsem(&vdev->memory_lock); 1836 } 1837 EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); 1838 1839 void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) 1840 { 1841 
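	/* Tear down state set up by vfio_pci_core_init_device() and free per-device allocations */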
mutex_destroy(&vdev->igate); 1842 mutex_destroy(&vdev->ioeventfds_lock); 1843 mutex_destroy(&vdev->vma_lock); 1844 vfio_uninit_group_dev(&vdev->vdev); 1845 kfree(vdev->region); 1846 kfree(vdev->pm_save); 1847 } 1848 EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); 1849 1850 int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) 1851 { 1852 struct pci_dev *pdev = vdev->pdev; 1853 struct device *dev = &pdev->dev; 1854 int ret; 1855 1856 /* Drivers must set the vfio_pci_core_device to their drvdata */ 1857 if (WARN_ON(vdev != dev_get_drvdata(dev))) 1858 return -EINVAL; 1859 1860 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1861 return -EINVAL; 1862 1863 /* 1864 * Prevent binding to PFs with VFs enabled, the VFs might be in use 1865 * by the host or other users. We cannot capture the VFs if they 1866 * already exist, nor can we track VF users. Disabling SR-IOV here 1867 * would initiate removing the VFs, which would unbind the driver, 1868 * which is prone to blocking if that VF is also in use by vfio-pci. 1869 * Just reject these PFs and let the user sort it out. 1870 */ 1871 if (pci_num_vf(pdev)) { 1872 pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); 1873 return -EBUSY; 1874 } 1875 1876 if (pci_is_root_bus(pdev->bus)) { 1877 ret = vfio_assign_device_set(&vdev->vdev, vdev); 1878 } else if (!pci_probe_reset_slot(pdev->slot)) { 1879 ret = vfio_assign_device_set(&vdev->vdev, pdev->slot); 1880 } else { 1881 /* 1882 * If there is no slot reset support for this device, the whole 1883 * bus needs to be grouped together to support bus-wide resets. 1884 */ 1885 ret = vfio_assign_device_set(&vdev->vdev, pdev->bus); 1886 } 1887 1888 if (ret) 1889 return ret; 1890 ret = vfio_pci_vf_init(vdev); 1891 if (ret) 1892 return ret; 1893 ret = vfio_pci_vga_init(vdev); 1894 if (ret) 1895 goto out_vf; 1896 1897 vfio_pci_probe_power_state(vdev); 1898 1899 /* 1900 * pci-core sets the device power state to an unknown value at 1901 * bootup and after being removed from a driver. The only 1902 * transition it allows from this unknown state is to D0, which 1903 * typically happens when a driver calls pci_enable_device(). 1904 * We're not ready to enable the device yet, but we do want to 1905 * be able to get to D3. Therefore first do a D0 transition 1906 * before enabling runtime PM. 
1907 */ 1908 vfio_pci_set_power_state(vdev, PCI_D0); 1909 1910 dev->driver->pm = &vfio_pci_core_pm_ops; 1911 pm_runtime_allow(dev); 1912 if (!disable_idle_d3) 1913 pm_runtime_put(dev); 1914 1915 ret = vfio_register_group_dev(&vdev->vdev); 1916 if (ret) 1917 goto out_power; 1918 return 0; 1919 1920 out_power: 1921 if (!disable_idle_d3) 1922 pm_runtime_get_noresume(dev); 1923 1924 pm_runtime_forbid(dev); 1925 out_vf: 1926 vfio_pci_vf_uninit(vdev); 1927 return ret; 1928 } 1929 EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); 1930 1931 void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) 1932 { 1933 vfio_pci_core_sriov_configure(vdev, 0); 1934 1935 vfio_unregister_group_dev(&vdev->vdev); 1936 1937 vfio_pci_vf_uninit(vdev); 1938 vfio_pci_vga_uninit(vdev); 1939 1940 if (!disable_idle_d3) 1941 pm_runtime_get_noresume(&vdev->pdev->dev); 1942 1943 pm_runtime_forbid(&vdev->pdev->dev); 1944 } 1945 EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); 1946 1947 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, 1948 pci_channel_state_t state) 1949 { 1950 struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); 1951 1952 mutex_lock(&vdev->igate); 1953 1954 if (vdev->err_trigger) 1955 eventfd_signal(vdev->err_trigger, 1); 1956 1957 mutex_unlock(&vdev->igate); 1958 1959 return PCI_ERS_RESULT_CAN_RECOVER; 1960 } 1961 EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected); 1962 1963 int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, 1964 int nr_virtfn) 1965 { 1966 struct pci_dev *pdev = vdev->pdev; 1967 int ret = 0; 1968 1969 device_lock_assert(&pdev->dev); 1970 1971 if (nr_virtfn) { 1972 mutex_lock(&vfio_pci_sriov_pfs_mutex); 1973 /* 1974 * The thread that adds the vdev to the list is the only thread 1975 * that gets to call pci_enable_sriov() and we will only allow 1976 * it to be called once without going through 1977 * pci_disable_sriov() 1978 */ 1979 if (!list_empty(&vdev->sriov_pfs_item)) { 1980 ret = -EINVAL; 1981 goto out_unlock; 1982 } 1983 list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs); 1984 mutex_unlock(&vfio_pci_sriov_pfs_mutex); 1985 1986 /* 1987 * The PF power state should always be higher than the VF power 1988 * state. The PF can be in low power state either with runtime 1989 * power management (when there is no user) or PCI_PM_CTRL 1990 * register write by the user. If PF is in the low power state, 1991 * then change the power state to D0 first before enabling 1992 * SR-IOV. Also, this function can be called at any time, and 1993 * userspace PCI_PM_CTRL write can race against this code path, 1994 * so protect the same with 'memory_lock'. 
		ret = pm_runtime_resume_and_get(&pdev->dev);
		if (ret)
			goto out_del;

		down_write(&vdev->memory_lock);
		vfio_pci_set_power_state(vdev, PCI_D0);
		ret = pci_enable_sriov(pdev, nr_virtfn);
		up_write(&vdev->memory_lock);
		if (ret) {
			pm_runtime_put(&pdev->dev);
			goto out_del;
		}
		return nr_virtfn;
	}

	if (pci_num_vf(pdev)) {
		pci_disable_sriov(pdev);
		pm_runtime_put(&pdev->dev);
	}

out_del:
	mutex_lock(&vfio_pci_sriov_pfs_mutex);
	list_del_init(&vdev->sriov_pfs_item);
out_unlock:
	mutex_unlock(&vfio_pci_sriov_pfs_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);

const struct pci_error_handlers vfio_pci_core_err_handlers = {
	.error_detected = vfio_pci_core_aer_err_detected,
};
EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);

static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
			       struct vfio_pci_group_info *groups)
{
	unsigned int i;

	for (i = 0; i < groups->count; i++)
		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
			return true;
	return false;
}

static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
{
	struct vfio_device_set *dev_set = data;
	struct vfio_device *cur;

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == &pdev->dev)
			return 0;
	return -EBUSY;
}

/*
 * vfio-core considers a group to be viable and will create a vfio_device even
 * if some devices are bound to drivers like pci-stub or pcieport. Here we
 * require all PCI devices to be inside our dev_set since that ensures they stay
 * put and that every driver controlling the device can co-ordinate with the
 * device reset.
 *
 * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
 * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
 */
static struct pci_dev *
vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
{
	struct pci_dev *pdev;

	lockdep_assert_held(&dev_set->lock);

	/*
	 * By definition all PCI devices in the dev_set share the same PCI
	 * reset, so any pci_dev will have the same outcomes for
	 * pci_probe_reset_*() and pci_reset_bus().
	 */
	pdev = list_first_entry(&dev_set->device_list,
				struct vfio_pci_core_device,
				vdev.dev_set_list)->pdev;

	/* pci_reset_bus() is supported */
	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
		return NULL;

	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
					  dev_set,
					  !pci_probe_reset_slot(pdev->slot)))
		return NULL;
	return pdev;
}

static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	int ret;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		ret = pm_runtime_resume_and_get(&cur->pdev->dev);
		if (ret)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
					     vdev.dev_set_list)
		pm_runtime_put(&cur->pdev->dev);

	return ret;
}

/*
 * We need to get memory_lock for each device, but devices can share mmap_lock;
 * therefore we need to zap and hold the vma_lock for each device, and only
 * then take each memory_lock.
 */
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
				      struct vfio_pci_group_info *groups)
{
	struct vfio_pci_core_device *cur_mem;
	struct vfio_pci_core_device *cur_vma;
	struct vfio_pci_core_device *cur;
	struct pci_dev *pdev;
	bool is_mem = true;
	int ret;

	mutex_lock(&dev_set->lock);
	cur_mem = list_first_entry(&dev_set->device_list,
				   struct vfio_pci_core_device,
				   vdev.dev_set_list);

	pdev = vfio_pci_dev_set_resettable(dev_set);
	if (!pdev) {
		ret = -EINVAL;
		goto err_unlock;
	}

	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
		/*
		 * Test whether all the affected devices are contained by the
		 * set of groups provided by the user.
		 */
		if (!vfio_dev_in_groups(cur_vma, groups)) {
			ret = -EINVAL;
			goto err_undo;
		}

		/*
		 * Locking multiple devices is prone to deadlock, so run away
		 * and unwind if we hit contention.
		 */
		if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
			ret = -EBUSY;
			goto err_undo;
		}
	}
	cur_vma = NULL;

	list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
		if (!down_write_trylock(&cur_mem->memory_lock)) {
			ret = -EBUSY;
			goto err_undo;
		}
		mutex_unlock(&cur_mem->vma_lock);
	}
	cur_mem = NULL;

	/*
	 * pci_reset_bus() will reset all the devices on the bus.
	 * The power state can be non-D0 for some of the devices on the bus.
	 * For these devices, pci_reset_bus() will internally set
	 * the power state to D0 without vfio driver involvement.
	 * For the devices which have NoSoftRst-, the reset function can
	 * cause the PCI config space reset without restoring the original
	 * state (saved locally in 'vdev->pm_save').
	 */
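	/*
	 * Moving each device to D0 through vfio_pci_set_power_state() lets
	 * the wrapper restore the state stashed in 'vdev->pm_save' for
	 * NoSoftRst- devices, which would not happen if pci_reset_bus()
	 * were left to change the power state on its own.
	 */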
	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
		vfio_pci_set_power_state(cur, PCI_D0);

	ret = pci_reset_bus(pdev);

err_undo:
	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		if (cur == cur_mem)
			is_mem = false;
		if (cur == cur_vma)
			break;
		if (is_mem)
			up_write(&cur->memory_lock);
		else
			mutex_unlock(&cur->vma_lock);
	}
err_unlock:
	mutex_unlock(&dev_set->lock);
	return ret;
}

static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	bool needs_reset = false;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		/* No VFIO device in the set can have an open device FD */
		if (cur->vdev.open_count)
			return false;
		needs_reset |= cur->needs_reset;
	}
	return needs_reset;
}

/*
 * If a bus or slot reset is available for the provided dev_set and:
 *  - All of the devices affected by that bus or slot reset are unused
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset.
 */
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	struct pci_dev *pdev;
	bool reset_done = false;

	if (!vfio_pci_dev_set_needs_reset(dev_set))
		return;

	pdev = vfio_pci_dev_set_resettable(dev_set);
	if (!pdev)
		return;

	/*
	 * Some of the devices on the bus can be in the runtime suspended
	 * state. Increment the usage count for all the devices in the
	 * dev_set before reset and decrement the same after reset.
	 */
	if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
		return;

	if (!pci_reset_bus(pdev))
		reset_done = true;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		if (reset_done)
			cur->needs_reset = false;

		if (!disable_idle_d3)
			pm_runtime_put(&cur->pdev->dev);
	}
}

void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
			      bool is_disable_idle_d3)
{
	nointxmask = is_nointxmask;
	disable_vga = is_disable_vga;
	disable_idle_d3 = is_disable_idle_d3;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);

static void vfio_pci_core_cleanup(void)
{
	vfio_pci_uninit_perm_bits();
}

static int __init vfio_pci_core_init(void)
{
	/* Allocate shared config space permission data used by all devices */
	return vfio_pci_init_perm_bits();
}

module_init(vfio_pci_core_init);
module_exit(vfio_pci_core_cleanup);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
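
/*
 * Usage sketch: a minimal vendor driver built on top of this core. This is
 * illustrative only and kept in a comment; my_probe, my_remove and
 * my_vfio_pci_ops are placeholder names, and the sketch assumes the
 * init/uninit helpers exported from this file alongside the
 * register/unregister calls above. The drvdata assignment is mandatory,
 * as checked by the WARN_ON() in vfio_pci_core_register_device().
 *
 *	static int my_probe(struct pci_dev *pdev,
 *			    const struct pci_device_id *id)
 *	{
 *		struct vfio_pci_core_device *vdev;
 *		int ret;
 *
 *		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 *		if (!vdev)
 *			return -ENOMEM;
 *		vfio_pci_core_init_device(vdev, pdev, &my_vfio_pci_ops);
 *
 *		dev_set_drvdata(&pdev->dev, vdev);
 *		ret = vfio_pci_core_register_device(vdev);
 *		if (ret)
 *			goto out_uninit;
 *		return 0;
 *
 *	out_uninit:
 *		vfio_pci_core_uninit_device(vdev);
 *		kfree(vdev);
 *		return ret;
 *	}
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
 *
 *		vfio_pci_core_unregister_device(vdev);
 *		vfio_pci_core_uninit_device(vdev);
 *		kfree(vdev);
 *	}
 */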