1 /* 2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 3 * Author: Alex Williamson <alex.williamson@redhat.com> 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License version 2 as 7 * published by the Free Software Foundation. 8 * 9 * Derived from original vfio: 10 * Copyright 2010 Cisco Systems, Inc. All rights reserved. 11 * Author: Tom Lyon, pugs@cisco.com 12 */ 13 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 16 #include <linux/device.h> 17 #include <linux/eventfd.h> 18 #include <linux/file.h> 19 #include <linux/interrupt.h> 20 #include <linux/iommu.h> 21 #include <linux/module.h> 22 #include <linux/mutex.h> 23 #include <linux/notifier.h> 24 #include <linux/pci.h> 25 #include <linux/pm_runtime.h> 26 #include <linux/slab.h> 27 #include <linux/types.h> 28 #include <linux/uaccess.h> 29 #include <linux/vfio.h> 30 #include <linux/vgaarb.h> 31 32 #include "vfio_pci_private.h" 33 34 #define DRIVER_VERSION "0.2" 35 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 36 #define DRIVER_DESC "VFIO PCI - User Level meta-driver" 37 38 static char ids[1024] __initdata; 39 module_param_string(ids, ids, sizeof(ids), 0); 40 MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified"); 41 42 static bool nointxmask; 43 module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); 44 MODULE_PARM_DESC(nointxmask, 45 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); 46 47 #ifdef CONFIG_VFIO_PCI_VGA 48 static bool disable_vga; 49 module_param(disable_vga, bool, S_IRUGO); 50 MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci"); 51 #endif 52 53 static bool disable_idle_d3; 54 module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); 55 MODULE_PARM_DESC(disable_idle_d3, 56 "Disable using the PCI D3 low power state for idle, unused devices"); 57 58 static DEFINE_MUTEX(driver_lock); 59 60 static inline bool vfio_vga_disabled(void) 61 { 62 #ifdef CONFIG_VFIO_PCI_VGA 63 return disable_vga; 64 #else 65 return true; 66 #endif 67 } 68 69 /* 70 * Our VGA arbiter participation is limited since we don't know anything 71 * about the device itself. However, if the device is the only VGA device 72 * downstream of a bridge and VFIO VGA support is disabled, then we can 73 * safely return legacy VGA IO and memory as not decoded since the user 74 * has no way to get to it and routing can be disabled externally at the 75 * bridge. 76 */ 77 static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) 78 { 79 struct vfio_pci_device *vdev = opaque; 80 struct pci_dev *tmp = NULL, *pdev = vdev->pdev; 81 unsigned char max_busnr; 82 unsigned int decodes; 83 84 if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) 85 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 86 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 87 88 max_busnr = pci_bus_max_busnr(pdev->bus); 89 decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 90 91 while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { 92 if (tmp == pdev || 93 pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || 94 pci_is_root_bus(tmp->bus)) 95 continue; 96 97 if (tmp->bus->number >= pdev->bus->number && 98 tmp->bus->number <= max_busnr) { 99 pci_dev_put(tmp); 100 decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 101 break; 102 } 103 } 104 105 return decodes; 106 } 107 108 static inline bool vfio_pci_is_vga(struct pci_dev *pdev) 109 { 110 return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; 111 } 112 113 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); 114 115 static int vfio_pci_enable(struct vfio_pci_device *vdev) 116 { 117 struct pci_dev *pdev = vdev->pdev; 118 int ret; 119 u16 cmd; 120 u8 msix_pos; 121 122 pci_set_power_state(pdev, PCI_D0); 123 124 /* Don't allow our initial saved state to include busmaster */ 125 pci_clear_master(pdev); 126 127 ret = pci_enable_device(pdev); 128 if (ret) 129 return ret; 130 131 vdev->reset_works = (pci_reset_function(pdev) == 0); 132 pci_save_state(pdev); 133 vdev->pci_saved_state = pci_store_saved_state(pdev); 134 if (!vdev->pci_saved_state) 135 pr_debug("%s: Couldn't store %s saved state\n", 136 __func__, dev_name(&pdev->dev)); 137 138 ret = vfio_config_init(vdev); 139 if (ret) { 140 kfree(vdev->pci_saved_state); 141 vdev->pci_saved_state = NULL; 142 pci_disable_device(pdev); 143 return ret; 144 } 145 146 if (likely(!nointxmask)) 147 vdev->pci_2_3 = pci_intx_mask_supported(pdev); 148 149 pci_read_config_word(pdev, PCI_COMMAND, &cmd); 150 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 151 cmd &= ~PCI_COMMAND_INTX_DISABLE; 152 pci_write_config_word(pdev, PCI_COMMAND, cmd); 153 } 154 155 msix_pos = pdev->msix_cap; 156 if (msix_pos) { 157 u16 flags; 158 u32 table; 159 160 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 161 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 162 163 vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 164 vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 165 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 166 } else 167 vdev->msix_bar = 0xFF; 168 169 if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 170 vdev->has_vga = true; 171 172 return 0; 173 } 174 175 static void vfio_pci_disable(struct vfio_pci_device *vdev) 176 { 177 struct pci_dev *pdev = vdev->pdev; 178 int bar; 179 180 /* Stop the device from further DMA */ 181 pci_clear_master(pdev); 182 183 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 184 VFIO_IRQ_SET_ACTION_TRIGGER, 185 vdev->irq_type, 0, 0, NULL); 186 187 vdev->virq_disabled = false; 188 189 vfio_config_free(vdev); 190 191 for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { 192 if (!vdev->barmap[bar]) 193 continue; 194 pci_iounmap(pdev, vdev->barmap[bar]); 195 pci_release_selected_regions(pdev, 1 << bar); 196 vdev->barmap[bar] = NULL; 197 } 198 199 vdev->needs_reset = true; 200 201 /* 202 * If we have saved state, restore it. If we can reset the device, 203 * even better. Resetting with current state seems better than 204 * nothing, but saving and restoring current state without reset 205 * is just busy work. 206 */ 207 if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 208 pr_info("%s: Couldn't reload %s saved state\n", 209 __func__, dev_name(&pdev->dev)); 210 211 if (!vdev->reset_works) 212 goto out; 213 214 pci_save_state(pdev); 215 } 216 217 /* 218 * Disable INTx and MSI, presumably to avoid spurious interrupts 219 * during reset. Stolen from pci_reset_function() 220 */ 221 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 222 223 /* 224 * Try to reset the device. The success of this is dependent on 225 * being able to lock the device, which is not always possible. 226 */ 227 if (vdev->reset_works && !pci_try_reset_function(pdev)) 228 vdev->needs_reset = false; 229 230 pci_restore_state(pdev); 231 out: 232 pci_disable_device(pdev); 233 234 vfio_pci_try_bus_reset(vdev); 235 236 if (!disable_idle_d3) 237 pci_set_power_state(pdev, PCI_D3hot); 238 } 239 240 static void vfio_pci_release(void *device_data) 241 { 242 struct vfio_pci_device *vdev = device_data; 243 244 mutex_lock(&driver_lock); 245 246 if (!(--vdev->refcnt)) { 247 vfio_spapr_pci_eeh_release(vdev->pdev); 248 vfio_pci_disable(vdev); 249 } 250 251 mutex_unlock(&driver_lock); 252 253 module_put(THIS_MODULE); 254 } 255 256 static int vfio_pci_open(void *device_data) 257 { 258 struct vfio_pci_device *vdev = device_data; 259 int ret = 0; 260 261 if (!try_module_get(THIS_MODULE)) 262 return -ENODEV; 263 264 mutex_lock(&driver_lock); 265 266 if (!vdev->refcnt) { 267 ret = vfio_pci_enable(vdev); 268 if (ret) 269 goto error; 270 271 vfio_spapr_pci_eeh_open(vdev->pdev); 272 } 273 vdev->refcnt++; 274 error: 275 mutex_unlock(&driver_lock); 276 if (ret) 277 module_put(THIS_MODULE); 278 return ret; 279 } 280 281 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) 282 { 283 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 284 u8 pin; 285 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 286 if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin) 287 return 1; 288 289 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 290 u8 pos; 291 u16 flags; 292 293 pos = vdev->pdev->msi_cap; 294 if (pos) { 295 pci_read_config_word(vdev->pdev, 296 pos + PCI_MSI_FLAGS, &flags); 297 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 298 } 299 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 300 u8 pos; 301 u16 flags; 302 303 pos = vdev->pdev->msix_cap; 304 if (pos) { 305 pci_read_config_word(vdev->pdev, 306 pos + PCI_MSIX_FLAGS, &flags); 307 308 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 309 } 310 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 311 if (pci_is_pcie(vdev->pdev)) 312 return 1; 313 } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 314 return 1; 315 } 316 317 return 0; 318 } 319 320 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 321 { 322 (*(int *)data)++; 323 return 0; 324 } 325 326 struct vfio_pci_fill_info { 327 int max; 328 int cur; 329 struct vfio_pci_dependent_device *devices; 330 }; 331 332 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 333 { 334 struct vfio_pci_fill_info *fill = data; 335 struct iommu_group *iommu_group; 336 337 if (fill->cur == fill->max) 338 return -EAGAIN; /* Something changed, try again */ 339 340 iommu_group = iommu_group_get(&pdev->dev); 341 if (!iommu_group) 342 return -EPERM; /* Cannot reset non-isolated devices */ 343 344 fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 345 fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 346 fill->devices[fill->cur].bus = pdev->bus->number; 347 fill->devices[fill->cur].devfn = pdev->devfn; 348 fill->cur++; 349 iommu_group_put(iommu_group); 350 return 0; 351 } 352 353 struct vfio_pci_group_entry { 354 struct vfio_group *group; 355 int id; 356 }; 357 358 struct vfio_pci_group_info { 359 int count; 360 struct vfio_pci_group_entry *groups; 361 }; 362 363 static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) 364 { 365 struct vfio_pci_group_info *info = data; 366 struct iommu_group *group; 367 int id, i; 368 369 group = iommu_group_get(&pdev->dev); 370 if (!group) 371 return -EPERM; 372 373 id = iommu_group_id(group); 374 375 for (i = 0; i < info->count; i++) 376 if (info->groups[i].id == id) 377 break; 378 379 iommu_group_put(group); 380 381 return (i == info->count) ? -EINVAL : 0; 382 } 383 384 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 385 { 386 for (; pdev; pdev = pdev->bus->self) 387 if (pdev->bus == slot->bus) 388 return (pdev->slot == slot); 389 return false; 390 } 391 392 struct vfio_pci_walk_info { 393 int (*fn)(struct pci_dev *, void *data); 394 void *data; 395 struct pci_dev *pdev; 396 bool slot; 397 int ret; 398 }; 399 400 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 401 { 402 struct vfio_pci_walk_info *walk = data; 403 404 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 405 walk->ret = walk->fn(pdev, walk->data); 406 407 return walk->ret; 408 } 409 410 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 411 int (*fn)(struct pci_dev *, 412 void *data), void *data, 413 bool slot) 414 { 415 struct vfio_pci_walk_info walk = { 416 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 417 }; 418 419 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 420 421 return walk.ret; 422 } 423 424 static long vfio_pci_ioctl(void *device_data, 425 unsigned int cmd, unsigned long arg) 426 { 427 struct vfio_pci_device *vdev = device_data; 428 unsigned long minsz; 429 430 if (cmd == VFIO_DEVICE_GET_INFO) { 431 struct vfio_device_info info; 432 433 minsz = offsetofend(struct vfio_device_info, num_irqs); 434 435 if (copy_from_user(&info, (void __user *)arg, minsz)) 436 return -EFAULT; 437 438 if (info.argsz < minsz) 439 return -EINVAL; 440 441 info.flags = VFIO_DEVICE_FLAGS_PCI; 442 443 if (vdev->reset_works) 444 info.flags |= VFIO_DEVICE_FLAGS_RESET; 445 446 info.num_regions = VFIO_PCI_NUM_REGIONS; 447 info.num_irqs = VFIO_PCI_NUM_IRQS; 448 449 return copy_to_user((void __user *)arg, &info, minsz); 450 451 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 452 struct pci_dev *pdev = vdev->pdev; 453 struct vfio_region_info info; 454 455 minsz = offsetofend(struct vfio_region_info, offset); 456 457 if (copy_from_user(&info, (void __user *)arg, minsz)) 458 return -EFAULT; 459 460 if (info.argsz < minsz) 461 return -EINVAL; 462 463 switch (info.index) { 464 case VFIO_PCI_CONFIG_REGION_INDEX: 465 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 466 info.size = pdev->cfg_size; 467 info.flags = VFIO_REGION_INFO_FLAG_READ | 468 VFIO_REGION_INFO_FLAG_WRITE; 469 break; 470 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 471 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 472 info.size = pci_resource_len(pdev, info.index); 473 if (!info.size) { 474 info.flags = 0; 475 break; 476 } 477 478 info.flags = VFIO_REGION_INFO_FLAG_READ | 479 VFIO_REGION_INFO_FLAG_WRITE; 480 if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && 481 pci_resource_flags(pdev, info.index) & 482 IORESOURCE_MEM && info.size >= PAGE_SIZE) 483 info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 484 break; 485 case VFIO_PCI_ROM_REGION_INDEX: 486 { 487 void __iomem *io; 488 size_t size; 489 490 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 491 info.flags = 0; 492 493 /* Report the BAR size, not the ROM size */ 494 info.size = pci_resource_len(pdev, info.index); 495 if (!info.size) 496 break; 497 498 /* Is it really there? */ 499 io = pci_map_rom(pdev, &size); 500 if (!io || !size) { 501 info.size = 0; 502 break; 503 } 504 pci_unmap_rom(pdev, io); 505 506 info.flags = VFIO_REGION_INFO_FLAG_READ; 507 break; 508 } 509 case VFIO_PCI_VGA_REGION_INDEX: 510 if (!vdev->has_vga) 511 return -EINVAL; 512 513 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 514 info.size = 0xc0000; 515 info.flags = VFIO_REGION_INFO_FLAG_READ | 516 VFIO_REGION_INFO_FLAG_WRITE; 517 518 break; 519 default: 520 return -EINVAL; 521 } 522 523 return copy_to_user((void __user *)arg, &info, minsz); 524 525 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 526 struct vfio_irq_info info; 527 528 minsz = offsetofend(struct vfio_irq_info, count); 529 530 if (copy_from_user(&info, (void __user *)arg, minsz)) 531 return -EFAULT; 532 533 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 534 return -EINVAL; 535 536 switch (info.index) { 537 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 538 case VFIO_PCI_REQ_IRQ_INDEX: 539 break; 540 case VFIO_PCI_ERR_IRQ_INDEX: 541 if (pci_is_pcie(vdev->pdev)) 542 break; 543 /* pass thru to return error */ 544 default: 545 return -EINVAL; 546 } 547 548 info.flags = VFIO_IRQ_INFO_EVENTFD; 549 550 info.count = vfio_pci_get_irq_count(vdev, info.index); 551 552 if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 553 info.flags |= (VFIO_IRQ_INFO_MASKABLE | 554 VFIO_IRQ_INFO_AUTOMASKED); 555 else 556 info.flags |= VFIO_IRQ_INFO_NORESIZE; 557 558 return copy_to_user((void __user *)arg, &info, minsz); 559 560 } else if (cmd == VFIO_DEVICE_SET_IRQS) { 561 struct vfio_irq_set hdr; 562 u8 *data = NULL; 563 int ret = 0; 564 565 minsz = offsetofend(struct vfio_irq_set, count); 566 567 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 568 return -EFAULT; 569 570 if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || 571 hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | 572 VFIO_IRQ_SET_ACTION_TYPE_MASK)) 573 return -EINVAL; 574 575 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { 576 size_t size; 577 int max = vfio_pci_get_irq_count(vdev, hdr.index); 578 579 if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) 580 size = sizeof(uint8_t); 581 else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) 582 size = sizeof(int32_t); 583 else 584 return -EINVAL; 585 586 if (hdr.argsz - minsz < hdr.count * size || 587 hdr.start >= max || hdr.start + hdr.count > max) 588 return -EINVAL; 589 590 data = memdup_user((void __user *)(arg + minsz), 591 hdr.count * size); 592 if (IS_ERR(data)) 593 return PTR_ERR(data); 594 } 595 596 mutex_lock(&vdev->igate); 597 598 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 599 hdr.start, hdr.count, data); 600 601 mutex_unlock(&vdev->igate); 602 kfree(data); 603 604 return ret; 605 606 } else if (cmd == VFIO_DEVICE_RESET) { 607 return vdev->reset_works ? 608 pci_try_reset_function(vdev->pdev) : -EINVAL; 609 610 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 611 struct vfio_pci_hot_reset_info hdr; 612 struct vfio_pci_fill_info fill = { 0 }; 613 struct vfio_pci_dependent_device *devices = NULL; 614 bool slot = false; 615 int ret = 0; 616 617 minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 618 619 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 620 return -EFAULT; 621 622 if (hdr.argsz < minsz) 623 return -EINVAL; 624 625 hdr.flags = 0; 626 627 /* Can we do a slot or bus reset or neither? */ 628 if (!pci_probe_reset_slot(vdev->pdev->slot)) 629 slot = true; 630 else if (pci_probe_reset_bus(vdev->pdev->bus)) 631 return -ENODEV; 632 633 /* How many devices are affected? */ 634 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 635 vfio_pci_count_devs, 636 &fill.max, slot); 637 if (ret) 638 return ret; 639 640 WARN_ON(!fill.max); /* Should always be at least one */ 641 642 /* 643 * If there's enough space, fill it now, otherwise return 644 * -ENOSPC and the number of devices affected. 645 */ 646 if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 647 ret = -ENOSPC; 648 hdr.count = fill.max; 649 goto reset_info_exit; 650 } 651 652 devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 653 if (!devices) 654 return -ENOMEM; 655 656 fill.devices = devices; 657 658 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 659 vfio_pci_fill_devs, 660 &fill, slot); 661 662 /* 663 * If a device was removed between counting and filling, 664 * we may come up short of fill.max. If a device was 665 * added, we'll have a return of -EAGAIN above. 666 */ 667 if (!ret) 668 hdr.count = fill.cur; 669 670 reset_info_exit: 671 if (copy_to_user((void __user *)arg, &hdr, minsz)) 672 ret = -EFAULT; 673 674 if (!ret) { 675 if (copy_to_user((void __user *)(arg + minsz), devices, 676 hdr.count * sizeof(*devices))) 677 ret = -EFAULT; 678 } 679 680 kfree(devices); 681 return ret; 682 683 } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 684 struct vfio_pci_hot_reset hdr; 685 int32_t *group_fds; 686 struct vfio_pci_group_entry *groups; 687 struct vfio_pci_group_info info; 688 bool slot = false; 689 int i, count = 0, ret = 0; 690 691 minsz = offsetofend(struct vfio_pci_hot_reset, count); 692 693 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 694 return -EFAULT; 695 696 if (hdr.argsz < minsz || hdr.flags) 697 return -EINVAL; 698 699 /* Can we do a slot or bus reset or neither? */ 700 if (!pci_probe_reset_slot(vdev->pdev->slot)) 701 slot = true; 702 else if (pci_probe_reset_bus(vdev->pdev->bus)) 703 return -ENODEV; 704 705 /* 706 * We can't let userspace give us an arbitrarily large 707 * buffer to copy, so verify how many we think there 708 * could be. Note groups can have multiple devices so 709 * one group per device is the max. 710 */ 711 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 712 vfio_pci_count_devs, 713 &count, slot); 714 if (ret) 715 return ret; 716 717 /* Somewhere between 1 and count is OK */ 718 if (!hdr.count || hdr.count > count) 719 return -EINVAL; 720 721 group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 722 groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 723 if (!group_fds || !groups) { 724 kfree(group_fds); 725 kfree(groups); 726 return -ENOMEM; 727 } 728 729 if (copy_from_user(group_fds, (void __user *)(arg + minsz), 730 hdr.count * sizeof(*group_fds))) { 731 kfree(group_fds); 732 kfree(groups); 733 return -EFAULT; 734 } 735 736 /* 737 * For each group_fd, get the group through the vfio external 738 * user interface and store the group and iommu ID. This 739 * ensures the group is held across the reset. 740 */ 741 for (i = 0; i < hdr.count; i++) { 742 struct vfio_group *group; 743 struct fd f = fdget(group_fds[i]); 744 if (!f.file) { 745 ret = -EBADF; 746 break; 747 } 748 749 group = vfio_group_get_external_user(f.file); 750 fdput(f); 751 if (IS_ERR(group)) { 752 ret = PTR_ERR(group); 753 break; 754 } 755 756 groups[i].group = group; 757 groups[i].id = vfio_external_user_iommu_id(group); 758 } 759 760 kfree(group_fds); 761 762 /* release reference to groups on error */ 763 if (ret) 764 goto hot_reset_release; 765 766 info.count = hdr.count; 767 info.groups = groups; 768 769 /* 770 * Test whether all the affected devices are contained 771 * by the set of groups provided by the user. 772 */ 773 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 774 vfio_pci_validate_devs, 775 &info, slot); 776 if (!ret) 777 /* User has access, do the reset */ 778 ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : 779 pci_try_reset_bus(vdev->pdev->bus); 780 781 hot_reset_release: 782 for (i--; i >= 0; i--) 783 vfio_group_put_external_user(groups[i].group); 784 785 kfree(groups); 786 return ret; 787 } 788 789 return -ENOTTY; 790 } 791 792 static ssize_t vfio_pci_rw(void *device_data, char __user *buf, 793 size_t count, loff_t *ppos, bool iswrite) 794 { 795 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 796 struct vfio_pci_device *vdev = device_data; 797 798 if (index >= VFIO_PCI_NUM_REGIONS) 799 return -EINVAL; 800 801 switch (index) { 802 case VFIO_PCI_CONFIG_REGION_INDEX: 803 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 804 805 case VFIO_PCI_ROM_REGION_INDEX: 806 if (iswrite) 807 return -EINVAL; 808 return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 809 810 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 811 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 812 813 case VFIO_PCI_VGA_REGION_INDEX: 814 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 815 } 816 817 return -EINVAL; 818 } 819 820 static ssize_t vfio_pci_read(void *device_data, char __user *buf, 821 size_t count, loff_t *ppos) 822 { 823 if (!count) 824 return 0; 825 826 return vfio_pci_rw(device_data, buf, count, ppos, false); 827 } 828 829 static ssize_t vfio_pci_write(void *device_data, const char __user *buf, 830 size_t count, loff_t *ppos) 831 { 832 if (!count) 833 return 0; 834 835 return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); 836 } 837 838 static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) 839 { 840 struct vfio_pci_device *vdev = device_data; 841 struct pci_dev *pdev = vdev->pdev; 842 unsigned int index; 843 u64 phys_len, req_len, pgoff, req_start; 844 int ret; 845 846 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 847 848 if (vma->vm_end < vma->vm_start) 849 return -EINVAL; 850 if ((vma->vm_flags & VM_SHARED) == 0) 851 return -EINVAL; 852 if (index >= VFIO_PCI_ROM_REGION_INDEX) 853 return -EINVAL; 854 if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) 855 return -EINVAL; 856 857 phys_len = pci_resource_len(pdev, index); 858 req_len = vma->vm_end - vma->vm_start; 859 pgoff = vma->vm_pgoff & 860 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 861 req_start = pgoff << PAGE_SHIFT; 862 863 if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) 864 return -EINVAL; 865 866 if (index == vdev->msix_bar) { 867 /* 868 * Disallow mmaps overlapping the MSI-X table; users don't 869 * get to touch this directly. We could find somewhere 870 * else to map the overlap, but page granularity is only 871 * a recommendation, not a requirement, so the user needs 872 * to know which bits are real. Requiring them to mmap 873 * around the table makes that clear. 874 */ 875 876 /* If neither entirely above nor below, then it overlaps */ 877 if (!(req_start >= vdev->msix_offset + vdev->msix_size || 878 req_start + req_len <= vdev->msix_offset)) 879 return -EINVAL; 880 } 881 882 /* 883 * Even though we don't make use of the barmap for the mmap, 884 * we need to request the region and the barmap tracks that. 885 */ 886 if (!vdev->barmap[index]) { 887 ret = pci_request_selected_regions(pdev, 888 1 << index, "vfio-pci"); 889 if (ret) 890 return ret; 891 892 vdev->barmap[index] = pci_iomap(pdev, index, 0); 893 } 894 895 vma->vm_private_data = vdev; 896 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 897 vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 898 899 return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 900 req_len, vma->vm_page_prot); 901 } 902 903 static void vfio_pci_request(void *device_data, unsigned int count) 904 { 905 struct vfio_pci_device *vdev = device_data; 906 907 mutex_lock(&vdev->igate); 908 909 if (vdev->req_trigger) { 910 dev_dbg(&vdev->pdev->dev, "Requesting device from user\n"); 911 eventfd_signal(vdev->req_trigger, 1); 912 } 913 914 mutex_unlock(&vdev->igate); 915 } 916 917 static const struct vfio_device_ops vfio_pci_ops = { 918 .name = "vfio-pci", 919 .open = vfio_pci_open, 920 .release = vfio_pci_release, 921 .ioctl = vfio_pci_ioctl, 922 .read = vfio_pci_read, 923 .write = vfio_pci_write, 924 .mmap = vfio_pci_mmap, 925 .request = vfio_pci_request, 926 }; 927 928 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 929 { 930 struct vfio_pci_device *vdev; 931 struct iommu_group *group; 932 int ret; 933 934 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 935 return -EINVAL; 936 937 group = iommu_group_get(&pdev->dev); 938 if (!group) 939 return -EINVAL; 940 941 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); 942 if (!vdev) { 943 iommu_group_put(group); 944 return -ENOMEM; 945 } 946 947 vdev->pdev = pdev; 948 vdev->irq_type = VFIO_PCI_NUM_IRQS; 949 mutex_init(&vdev->igate); 950 spin_lock_init(&vdev->irqlock); 951 952 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); 953 if (ret) { 954 iommu_group_put(group); 955 kfree(vdev); 956 return ret; 957 } 958 959 if (vfio_pci_is_vga(pdev)) { 960 vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); 961 vga_set_legacy_decoding(pdev, 962 vfio_pci_set_vga_decode(vdev, false)); 963 } 964 965 if (!disable_idle_d3) { 966 /* 967 * pci-core sets the device power state to an unknown value at 968 * bootup and after being removed from a driver. The only 969 * transition it allows from this unknown state is to D0, which 970 * typically happens when a driver calls pci_enable_device(). 971 * We're not ready to enable the device yet, but we do want to 972 * be able to get to D3. Therefore first do a D0 transition 973 * before going to D3. 974 */ 975 pci_set_power_state(pdev, PCI_D0); 976 pci_set_power_state(pdev, PCI_D3hot); 977 } 978 979 return ret; 980 } 981 982 static void vfio_pci_remove(struct pci_dev *pdev) 983 { 984 struct vfio_pci_device *vdev; 985 986 vdev = vfio_del_group_dev(&pdev->dev); 987 if (!vdev) 988 return; 989 990 iommu_group_put(pdev->dev.iommu_group); 991 kfree(vdev); 992 993 if (vfio_pci_is_vga(pdev)) { 994 vga_client_register(pdev, NULL, NULL, NULL); 995 vga_set_legacy_decoding(pdev, 996 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 997 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM); 998 } 999 1000 if (!disable_idle_d3) 1001 pci_set_power_state(pdev, PCI_D0); 1002 } 1003 1004 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, 1005 pci_channel_state_t state) 1006 { 1007 struct vfio_pci_device *vdev; 1008 struct vfio_device *device; 1009 1010 device = vfio_device_get_from_dev(&pdev->dev); 1011 if (device == NULL) 1012 return PCI_ERS_RESULT_DISCONNECT; 1013 1014 vdev = vfio_device_data(device); 1015 if (vdev == NULL) { 1016 vfio_device_put(device); 1017 return PCI_ERS_RESULT_DISCONNECT; 1018 } 1019 1020 mutex_lock(&vdev->igate); 1021 1022 if (vdev->err_trigger) 1023 eventfd_signal(vdev->err_trigger, 1); 1024 1025 mutex_unlock(&vdev->igate); 1026 1027 vfio_device_put(device); 1028 1029 return PCI_ERS_RESULT_CAN_RECOVER; 1030 } 1031 1032 static struct pci_error_handlers vfio_err_handlers = { 1033 .error_detected = vfio_pci_aer_err_detected, 1034 }; 1035 1036 static struct pci_driver vfio_pci_driver = { 1037 .name = "vfio-pci", 1038 .id_table = NULL, /* only dynamic ids */ 1039 .probe = vfio_pci_probe, 1040 .remove = vfio_pci_remove, 1041 .err_handler = &vfio_err_handlers, 1042 }; 1043 1044 struct vfio_devices { 1045 struct vfio_device **devices; 1046 int cur_index; 1047 int max_index; 1048 }; 1049 1050 static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) 1051 { 1052 struct vfio_devices *devs = data; 1053 struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); 1054 1055 if (pci_drv != &vfio_pci_driver) 1056 return -EBUSY; 1057 1058 if (devs->cur_index == devs->max_index) 1059 return -ENOSPC; 1060 1061 devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev); 1062 if (!devs->devices[devs->cur_index]) 1063 return -EINVAL; 1064 1065 devs->cur_index++; 1066 return 0; 1067 } 1068 1069 /* 1070 * Attempt to do a bus/slot reset if there are devices affected by a reset for 1071 * this device that are needs_reset and all of the affected devices are unused 1072 * (!refcnt). Callers are required to hold driver_lock when calling this to 1073 * prevent device opens and concurrent bus reset attempts. We prevent device 1074 * unbinds by acquiring and holding a reference to the vfio_device. 1075 * 1076 * NB: vfio-core considers a group to be viable even if some devices are 1077 * bound to drivers like pci-stub or pcieport. Here we require all devices 1078 * to be bound to vfio_pci since that's the only way we can be sure they 1079 * stay put. 1080 */ 1081 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) 1082 { 1083 struct vfio_devices devs = { .cur_index = 0 }; 1084 int i = 0, ret = -EINVAL; 1085 bool needs_reset = false, slot = false; 1086 struct vfio_pci_device *tmp; 1087 1088 if (!pci_probe_reset_slot(vdev->pdev->slot)) 1089 slot = true; 1090 else if (pci_probe_reset_bus(vdev->pdev->bus)) 1091 return; 1092 1093 if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, 1094 &i, slot) || !i) 1095 return; 1096 1097 devs.max_index = i; 1098 devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL); 1099 if (!devs.devices) 1100 return; 1101 1102 if (vfio_pci_for_each_slot_or_bus(vdev->pdev, 1103 vfio_pci_get_devs, &devs, slot)) 1104 goto put_devs; 1105 1106 for (i = 0; i < devs.cur_index; i++) { 1107 tmp = vfio_device_data(devs.devices[i]); 1108 if (tmp->needs_reset) 1109 needs_reset = true; 1110 if (tmp->refcnt) 1111 goto put_devs; 1112 } 1113 1114 if (needs_reset) 1115 ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : 1116 pci_try_reset_bus(vdev->pdev->bus); 1117 1118 put_devs: 1119 for (i = 0; i < devs.cur_index; i++) { 1120 tmp = vfio_device_data(devs.devices[i]); 1121 if (!ret) 1122 tmp->needs_reset = false; 1123 1124 if (!tmp->refcnt && !disable_idle_d3) 1125 pci_set_power_state(tmp->pdev, PCI_D3hot); 1126 1127 vfio_device_put(devs.devices[i]); 1128 } 1129 1130 kfree(devs.devices); 1131 } 1132 1133 static void __exit vfio_pci_cleanup(void) 1134 { 1135 pci_unregister_driver(&vfio_pci_driver); 1136 vfio_pci_uninit_perm_bits(); 1137 } 1138 1139 static void __init vfio_pci_fill_ids(void) 1140 { 1141 char *p, *id; 1142 int rc; 1143 1144 /* no ids passed actually */ 1145 if (ids[0] == '\0') 1146 return; 1147 1148 /* add ids specified in the module parameter */ 1149 p = ids; 1150 while ((id = strsep(&p, ","))) { 1151 unsigned int vendor, device, subvendor = PCI_ANY_ID, 1152 subdevice = PCI_ANY_ID, class = 0, class_mask = 0; 1153 int fields; 1154 1155 if (!strlen(id)) 1156 continue; 1157 1158 fields = sscanf(id, "%x:%x:%x:%x:%x:%x", 1159 &vendor, &device, &subvendor, &subdevice, 1160 &class, &class_mask); 1161 1162 if (fields < 2) { 1163 pr_warn("invalid id string \"%s\"\n", id); 1164 continue; 1165 } 1166 1167 rc = pci_add_dynid(&vfio_pci_driver, vendor, device, 1168 subvendor, subdevice, class, class_mask, 0); 1169 if (rc) 1170 pr_warn("failed to add dynamic id [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x (%d)\n", 1171 vendor, device, subvendor, subdevice, 1172 class, class_mask, rc); 1173 else 1174 pr_info("add [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x\n", 1175 vendor, device, subvendor, subdevice, 1176 class, class_mask); 1177 } 1178 } 1179 1180 static int __init vfio_pci_init(void) 1181 { 1182 int ret; 1183 1184 /* Allocate shared config space permision data used by all devices */ 1185 ret = vfio_pci_init_perm_bits(); 1186 if (ret) 1187 return ret; 1188 1189 /* Register and scan for devices */ 1190 ret = pci_register_driver(&vfio_pci_driver); 1191 if (ret) 1192 goto out_driver; 1193 1194 vfio_pci_fill_ids(); 1195 1196 return 0; 1197 1198 out_driver: 1199 vfio_pci_uninit_perm_bits(); 1200 return ret; 1201 } 1202 1203 module_init(vfio_pci_init); 1204 module_exit(vfio_pci_cleanup); 1205 1206 MODULE_VERSION(DRIVER_VERSION); 1207 MODULE_LICENSE("GPL v2"); 1208 MODULE_AUTHOR(DRIVER_AUTHOR); 1209 MODULE_DESCRIPTION(DRIVER_DESC); 1210