/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *	Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO PCI - User Level meta-driver"

static char ids[1024] __initdata;
module_param_string(ids, ids, sizeof(ids), 0);
MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified");

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");

#ifdef CONFIG_VFIO_PCI_VGA
static bool disable_vga;
module_param(disable_vga, bool, S_IRUGO);
MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci");
#endif

static bool disable_idle_d3;
module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_idle_d3,
		 "Disable using the PCI D3 low power state for idle, unused devices");

static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}

/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself. However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
{
	struct vfio_pci_device *vdev = opaque;
	struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}

static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
{
	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
}

static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
{
	struct resource *res;
	int bar;
	struct vfio_pci_dummy_resource *dummy_res;

	INIT_LIST_HEAD(&vdev->dummy_resources_list);

	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		res = vdev->pdev->resource + bar;

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;

		if (!(res->flags & IORESOURCE_MEM))
			goto no_mmap;

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size. But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))
			goto no_mmap;

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case that hot-add
			 * device's bar is assigned into it.
			 */
			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
			if (dummy_res == NULL)
				goto no_mmap;

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
					     &dummy_res->resource)) {
				kfree(dummy_res);
				goto no_mmap;
			}
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
				 &vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}
		/*
		 * Here we don't handle the case when the BAR is not page
		 * aligned because we can't expect the BAR will be
		 * assigned into the same location in a page in guest
		 * when we passthrough the BAR. And it's hard to access
		 * this BAR in userspace because we have no way to get
		 * the BAR's location in a page.
		 */
no_mmap:
		vdev->bar_mmap_supported[bar] = false;
	}
}

static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
static void vfio_pci_disable(struct vfio_pci_device *vdev);

/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking to be set and require an exclusive interrupt.
 * However since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1572:
		case 0x1574:
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:
			return true;
		default:
			return false;
		}
	}

	return false;
}

static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	pci_set_power_state(pdev, PCI_D0);

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* If reset fails because of the device lock, fail this path entirely */
	ret = pci_try_reset_function(pdev);
	if (ret == -EAGAIN) {
		pci_disable_device(pdev);
		return ret;
	}

	vdev->reset_works = !ret;
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));

	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
			dev_info(&pdev->dev, "Masking broken INTx support\n");
			vdev->nointx = true;
			pci_intx(pdev, 0);
		} else
			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
	}

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	ret = vfio_config_init(vdev);
	if (ret) {
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);
		return ret;
	}

	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
		vdev->has_vga = true;

	if (vfio_pci_is_vga(pdev) &&
	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
		ret = vfio_pci_igd_init(vdev);
		if (ret) {
			dev_warn(&vdev->pdev->dev,
				 "Failed to setup Intel IGD regions\n");
			goto disable_exit;
		}
	}

	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
		if (ret && ret != -ENODEV) {
			dev_warn(&vdev->pdev->dev,
				 "Failed to setup NVIDIA NV2 RAM region\n");
			goto disable_exit;
		}
	}

	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
		ret = vfio_pci_ibm_npu2_init(vdev);
		if (ret && ret != -ENODEV) {
			dev_warn(&vdev->pdev->dev,
				 "Failed to setup NVIDIA NV2 ATSD region\n");
			goto disable_exit;
		}
	}

	vfio_pci_probe_mmaps(vdev);

	return 0;

disable_exit:
	vfio_pci_disable(vdev);
	return ret;
}

static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_pci_dummy_resource *dummy_res, *tmp;
	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
	int i, bar;

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	/* Device closed, don't need mutex here */
	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
				 &vdev->ioeventfds_list, next) {
		vfio_virqfd_disable(&ioeventfd->virqfd);
		list_del(&ioeventfd->next);
		kfree(ioeventfd);
	}
	vdev->ioeventfds_nr = 0;

	vdev->virq_disabled = false;

	for (i = 0; i < vdev->num_regions; i++)
		vdev->region[i].ops->release(vdev, &vdev->region[i]);

	vdev->num_regions = 0;
	kfree(vdev->region);
	vdev->region = NULL; /* don't krealloc a freed pointer */

	vfio_config_free(vdev);

	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	list_for_each_entry_safe(dummy_res, tmp,
				 &vdev->dummy_resources_list, res_next) {
		list_del(&dummy_res->res_next);
		release_resource(&dummy_res->resource);
		kfree(dummy_res);
	}

	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it. If we can reset the device,
	 * even better. Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pr_info("%s: Couldn't reload %s saved state\n",
			__func__, dev_name(&pdev->dev));

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset. Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to reset the device. The success of this is dependent on
	 * being able to lock the device, which is not always possible.
	 */
	if (vdev->reset_works && !pci_try_reset_function(pdev))
		vdev->needs_reset = false;

	pci_restore_state(pdev);
out:
	pci_disable_device(pdev);

	vfio_pci_try_bus_reset(vdev);

	if (!disable_idle_d3)
		pci_set_power_state(pdev, PCI_D3hot);
}

static void vfio_pci_release(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	mutex_lock(&vdev->reflck->lock);

	if (!(--vdev->refcnt)) {
		vfio_spapr_pci_eeh_release(vdev->pdev);
		vfio_pci_disable(vdev);
	}

	mutex_unlock(&vdev->reflck->lock);

	module_put(THIS_MODULE);
}

static int vfio_pci_open(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&vdev->reflck->lock);

	if (!vdev->refcnt) {
		ret = vfio_pci_enable(vdev);
		if (ret)
			goto error;

		vfio_spapr_pci_eeh_open(vdev->pdev);
	}
	vdev->refcnt++;
error:
	mutex_unlock(&vdev->reflck->lock);
	if (ret)
		module_put(THIS_MODULE);
	return ret;
}

static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;

		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
		    vdev->nointx || vdev->pdev->is_virtfn)
			return 0;

		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);

		return pin ? 1 : 0;
	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
		if (pci_is_pcie(vdev->pdev))
			return 1;
	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
		return 1;
	}

	return 0;
}

static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}

struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};

static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;
	int id, i;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EPERM;

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)
			break;

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
}

static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}

static int msix_mmappable_cap(struct vfio_pci_device *vdev,
			      struct vfio_info_cap *caps)
{
	struct vfio_info_cap_header header = {
		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
		.version = 1
	};

	return vfio_info_add_capability(caps, &header, sizeof(header));
}

int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
				 unsigned int type, unsigned int subtype,
				 const struct vfio_pci_regops *ops,
				 size_t size, u32 flags, void *data)
{
	struct vfio_pci_region *region;

	region = krealloc(vdev->region,
			  (vdev->num_regions + 1) * sizeof(*region),
			  GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vdev->region = region;
	vdev->region[vdev->num_regions].type = type;
	vdev->region[vdev->num_regions].subtype = subtype;
	vdev->region[vdev->num_regions].ops = ops;
	vdev->region[vdev->num_regions].size = size;
	vdev->region[vdev->num_regions].flags = flags;
	vdev->region[vdev->num_regions].data = data;

	vdev->num_regions++;

	return 0;
}

static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		int i, ret;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (vdev->bar_mmap_supported[info.index]) {
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
				if (info.index == vdev->msix_bar) {
					ret = msix_mmappable_cap(vdev, &caps);
					if (ret)
						return ret;
				}
			}

			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				/* Shadow ROMs appear as PCI option ROMs */
				if (pdev->resource[PCI_ROM_RESOURCE].flags &
				    IORESOURCE_ROM_SHADOW)
					info.size = 0x20000;
				else
					break;
			}

			/* Is it really there? */
			io = pci_map_rom(pdev, &size);
			if (!io || !size) {
				info.size = 0;
				break;
			}
			pci_unmap_rom(pdev, io);

			info.flags = VFIO_REGION_INFO_FLAG_READ;
			break;
		}
		case VFIO_PCI_VGA_REGION_INDEX:
			if (!vdev->has_vga)
				return -EINVAL;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0xc0000;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;

			break;
		default:
		{
			struct vfio_region_info_cap_type cap_type = {
				.header.id = VFIO_REGION_INFO_CAP_TYPE,
				.header.version = 1 };

			if (info.index >=
			    VFIO_PCI_NUM_REGIONS + vdev->num_regions)
				return -EINVAL;
			info.index = array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vdev->num_regions);

			i = info.index - VFIO_PCI_NUM_REGIONS;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vdev->region[i].size;
			info.flags = vdev->region[i].flags;

			cap_type.type = vdev->region[i].type;
			cap_type.subtype = vdev->region[i].subtype;

			ret = vfio_info_add_capability(&caps, &cap_type.header,
						       sizeof(cap_type));
			if (ret)
				return ret;

			if (vdev->region[i].ops->add_capability) {
				ret = vdev->region[i].ops->add_capability(vdev,
						&vdev->region[i], &caps);
				if (ret)
					return ret;
			}
		}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						 sizeof(info), caps.buf,
						 caps.size)) {
					kfree(caps.buf);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
		case VFIO_PCI_REQ_IRQ_INDEX:
			break;
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))
				break;
			/* fall through */
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int max, ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		max = vfio_pci_get_irq_count(vdev, hdr.index);

		ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
		if (ret)
			return ret;

		if (data_size) {
			data = memdup_user((void __user *)(arg + minsz),
					   data_size);
			if (IS_ERR(data))
				return PTR_ERR(data);
		}

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);
		kfree(data);

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET) {
		return vdev->reset_works ?
			pci_try_reset_function(vdev->pdev) : -EINVAL;

	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;
		bool slot = false;
		int ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz)
			return -EINVAL;

		hdr.flags = 0;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &fill.max, slot);
		if (ret)
			return ret;

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max. If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		int32_t *group_fds;
		struct vfio_pci_group_entry *groups;
		struct vfio_pci_group_info info;
		bool slot = false;
		int i, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.flags)
			return -EINVAL;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be. Note groups can have multiple devices so
		 * one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &count, slot);
		if (ret)
			return ret;

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)
			return -EINVAL;

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
		if (!group_fds || !groups) {
			kfree(group_fds);
			kfree(groups);
			return -ENOMEM;
		}

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {
			kfree(group_fds);
			kfree(groups);
			return -EFAULT;
		}

		/*
		 * For each group_fd, get the group through the vfio external
		 * user interface and store the group and iommu ID. This
		 * ensures the group is held across the reset.
		 */
		for (i = 0; i < hdr.count; i++) {
			struct vfio_group *group;
			struct fd f = fdget(group_fds[i]);
			if (!f.file) {
				ret = -EBADF;
				break;
			}

			group = vfio_group_get_external_user(f.file);
			fdput(f);
			if (IS_ERR(group)) {
				ret = PTR_ERR(group);
				break;
			}

			groups[i].group = group;
			groups[i].id = vfio_external_user_iommu_id(group);
		}

		kfree(group_fds);

		/* release reference to groups on error */
		if (ret)
			goto hot_reset_release;

		info.count = hdr.count;
		info.groups = groups;

		/*
		 * Test whether all the affected devices are contained
		 * by the set of groups provided by the user.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_validate_devs,
						    &info, slot);
		if (!ret)
			/* User has access, do the reset */
			ret = pci_reset_bus(vdev->pdev);

hot_reset_release:
		for (i--; i >= 0; i--)
			vfio_group_put_external_user(groups[i].group);

		kfree(groups);
		return ret;
	} else if (cmd == VFIO_DEVICE_IOEVENTFD) {
		struct vfio_device_ioeventfd ioeventfd;
		int count;

		minsz = offsetofend(struct vfio_device_ioeventfd, fd);

		if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
			return -EFAULT;

		if (ioeventfd.argsz < minsz)
			return -EINVAL;

		if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
			return -EINVAL;

		count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;

		if (hweight8(count) != 1 || ioeventfd.fd < -1)
			return -EINVAL;

		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
					  ioeventfd.data, count, ioeventfd.fd);
	}

	return -ENOTTY;
}

static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			return -EINVAL;
		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
	default:
		index -= VFIO_PCI_NUM_REGIONS;
		return vdev->region[index].ops->rw(vdev, buf,
						   count, ppos, iswrite);
	}

	return -EINVAL;
}

static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, buf, count, ppos, false);
}

static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}

static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_NUM_REGIONS) {
		int regnum = index - VFIO_PCI_NUM_REGIONS;
		struct vfio_pci_region *region = vdev->region + regnum;

		if (region && region->ops && region->ops->mmap &&
		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
			return region->ops->mmap(vdev, region, vma);
		return -EINVAL;
	}
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!vdev->bar_mmap_supported[index])
		return -EINVAL;

	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
	req_len = vma->vm_end - vma->vm_start;
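	/*
	 * Editorial note: the vfio mmap offset encodes the region index in
	 * the upper bits of vm_pgoff (see the VFIO_PCI_OFFSET_TO_INDEX use
	 * above); the masked low bits below are the page offset within the
	 * selected BAR.
	 */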
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (req_start + req_len > phys_len)
		return -EINVAL;

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);
			return -ENOMEM;
		}
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}

static void vfio_pci_request(void *device_data, unsigned int count)
{
	struct vfio_pci_device *vdev = device_data;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
			dev_notice_ratelimited(&vdev->pdev->dev,
				"Relaying device request to user (#%u)\n",
				count);
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
		dev_warn(&vdev->pdev->dev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}

static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
	.request	= vfio_pci_request,
};

static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);

static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/*
	 * Prevent binding to PFs with VFs enabled, this too easily allows
	 * userspace instance with VFs and PFs from the same device, which
	 * cannot work. Disabling SR-IOV here would initiate removing the
	 * VFs, which would unbind the driver, which is prone to blocking
	 * if that VF is also in use by vfio-pci. Just reject these PFs
	 * and let the user sort it out.
	 */
	if (pci_num_vf(pdev)) {
		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
		return -EBUSY;
	}

	group = vfio_iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		vfio_iommu_group_put(group, &pdev->dev);
		return -ENOMEM;
	}

	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	mutex_init(&vdev->ioeventfds_lock);
	INIT_LIST_HEAD(&vdev->ioeventfds_list);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret) {
		vfio_iommu_group_put(group, &pdev->dev);
		kfree(vdev);
		return ret;
	}

	ret = vfio_pci_reflck_attach(vdev);
	if (ret) {
		vfio_del_group_dev(&pdev->dev);
		vfio_iommu_group_put(group, &pdev->dev);
		kfree(vdev);
		return ret;
	}

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
		vga_set_legacy_decoding(pdev,
					vfio_pci_set_vga_decode(vdev, false));
	}

	if (!disable_idle_d3) {
		/*
		 * pci-core sets the device power state to an unknown value at
		 * bootup and after being removed from a driver. The only
		 * transition it allows from this unknown state is to D0, which
		 * typically happens when a driver calls pci_enable_device().
		 * We're not ready to enable the device yet, but we do want to
		 * be able to get to D3. Therefore first do a D0 transition
		 * before going to D3.
		 */
		pci_set_power_state(pdev, PCI_D0);
		pci_set_power_state(pdev, PCI_D3hot);
	}

	return ret;
}

static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev;

	vdev = vfio_del_group_dev(&pdev->dev);
	if (!vdev)
		return;

	vfio_pci_reflck_put(vdev->reflck);

	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
	kfree(vdev->region);
	mutex_destroy(&vdev->ioeventfds_lock);
	kfree(vdev);

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, NULL, NULL, NULL);
		vga_set_legacy_decoding(pdev,
				VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
				VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
	}

	if (!disable_idle_d3)
		pci_set_power_state(pdev, PCI_D0);
}

static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = vfio_device_data(device);
	if (vdev == NULL) {
		vfio_device_put(device);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}

static const struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name		= "vfio-pci",
	.id_table	= NULL, /* only dynamic ids */
	.probe		= vfio_pci_probe,
	.remove		= vfio_pci_remove,
	.err_handler	= &vfio_err_handlers,
};

static DEFINE_MUTEX(reflck_lock);

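/*
 * Editorial note: a "reflck" is a reference-counted mutex shared by every
 * vfio-pci device that belongs to the same bus or slot reset group.
 * vfio_pci_reflck_attach() below either reuses the reflck of another
 * vfio-pci device in the group or allocates a new one; open/release and
 * the bus-reset path serialize on reflck->lock.
 */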
static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
{
	struct vfio_pci_reflck *reflck;

	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
	if (!reflck)
		return ERR_PTR(-ENOMEM);

	kref_init(&reflck->kref);
	mutex_init(&reflck->lock);

	return reflck;
}

static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
{
	kref_get(&reflck->kref);
}

static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_reflck **preflck = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return 0;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return 0;
	}

	vdev = vfio_device_data(device);

	if (vdev->reflck) {
		vfio_pci_reflck_get(vdev->reflck);
		*preflck = vdev->reflck;
		vfio_device_put(device);
		return 1;
	}

	vfio_device_put(device);
	return 0;
}

static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
{
	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);

	mutex_lock(&reflck_lock);

	if (pci_is_root_bus(vdev->pdev->bus) ||
	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
					  &vdev->reflck, slot) <= 0)
		vdev->reflck = vfio_pci_reflck_alloc();

	mutex_unlock(&reflck_lock);

	return PTR_ERR_OR_ZERO(vdev->reflck);
}

static void vfio_pci_reflck_release(struct kref *kref)
{
	struct vfio_pci_reflck *reflck = container_of(kref,
						      struct vfio_pci_reflck,
						      kref);

	kfree(reflck);
	mutex_unlock(&reflck_lock);
}

static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
{
	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
}

struct vfio_devices {
	struct vfio_device **devices;
	int cur_index;
	int max_index;
};

static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = vfio_device_data(device);

	/* Fault if the device is not unused */
	if (vdev->refcnt) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = device;
	return 0;
}

/*
 * If a bus or slot reset is available for the provided device and:
 *  - All of the devices affected by that bus or slot reset are unused
 *    (!refcnt)
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset. Callers are required
 * to hold vdev->reflck->lock, protecting the bus/slot reset group from
 * concurrent opens. A vfio_device reference is acquired for each device
 * to prevent unbinds during the reset operation.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport. Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
	bool slot = false;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					  &i, slot) || !i)
		return;

	devs.max_index = i;
	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
	if (!devs.devices)
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_get_unused_devs,
					  &devs, slot))
		goto put_devs;

	/* Does at least one need a reset? */
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
		if (tmp->needs_reset) {
			ret = pci_reset_bus(vdev->pdev);
			break;
		}
	}

put_devs:
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);

		/*
		 * If reset was successful, affected devices no longer need
		 * a reset and we should return all the collateral devices
		 * to low power. If not successful, we either didn't reset
		 * the bus or timed out waiting for it, so let's not touch
		 * the power state.
		 */
		if (!ret) {
			tmp->needs_reset = false;

			if (tmp != vdev && !disable_idle_d3)
				pci_set_power_state(tmp->pdev, PCI_D3hot);
		}

		vfio_device_put(devs.devices[i]);
	}

	kfree(devs.devices);
}

static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_uninit_perm_bits();
}

static void __init vfio_pci_fill_ids(void)
{
	char *p, *id;
	int rc;

	/* no ids passed actually */
	if (ids[0] == '\0')
		return;

	/* add ids specified in the module parameter */
	p = ids;
	while ((id = strsep(&p, ","))) {
		unsigned int vendor, device, subvendor = PCI_ANY_ID,
			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
		int fields;

		if (!strlen(id))
			continue;

		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
				&vendor, &device, &subvendor, &subdevice,
				&class, &class_mask);

		if (fields < 2) {
			pr_warn("invalid id string \"%s\"\n", id);
			continue;
		}

		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
				   subvendor, subdevice, class, class_mask, 0);
		if (rc)
			pr_warn("failed to add dynamic id [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x (%d)\n",
				vendor, device, subvendor, subdevice,
				class, class_mask, rc);
		else
			pr_info("add [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x\n",
				vendor, device, subvendor, subdevice,
				class, class_mask);
	}
}

static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	vfio_pci_fill_ids();

	return 0;

out_driver:
	vfio_pci_uninit_perm_bits();
	return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
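
/*
 * Editorial usage sketch (illustrative only; the PCI vendor/device ID and
 * bus address below are placeholders, not real hardware):
 *
 * Devices can be handed to this driver at module load time via the "ids"
 * parameter parsed by vfio_pci_fill_ids() above, e.g.
 *
 *	modprobe vfio-pci ids=1234:5678
 *
 * or at runtime through the standard PCI dynamic-ID and bind sysfs
 * interfaces, e.g.
 *
 *	echo 1234 5678 > /sys/bus/pci/drivers/vfio-pci/new_id
 *	echo 0000:01:00.0 > /sys/bus/pci/drivers/vfio-pci/bind
 *
 * The module-parameter path calls pci_add_dynid() for each entry; new_id
 * and bind are generic PCI-core sysfs interfaces and are not implemented
 * in this file.
 */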