1 /* 2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 3 * Author: Alex Williamson <alex.williamson@redhat.com> 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License version 2 as 7 * published by the Free Software Foundation. 8 * 9 * Derived from original vfio: 10 * Copyright 2010 Cisco Systems, Inc. All rights reserved. 11 * Author: Tom Lyon, pugs@cisco.com 12 */ 13 14 #include <linux/device.h> 15 #include <linux/eventfd.h> 16 #include <linux/file.h> 17 #include <linux/interrupt.h> 18 #include <linux/iommu.h> 19 #include <linux/module.h> 20 #include <linux/mutex.h> 21 #include <linux/notifier.h> 22 #include <linux/pci.h> 23 #include <linux/pm_runtime.h> 24 #include <linux/slab.h> 25 #include <linux/types.h> 26 #include <linux/uaccess.h> 27 #include <linux/vfio.h> 28 29 #include "vfio_pci_private.h" 30 31 #define DRIVER_VERSION "0.2" 32 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 33 #define DRIVER_DESC "VFIO PCI - User Level meta-driver" 34 35 static bool nointxmask; 36 module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); 37 MODULE_PARM_DESC(nointxmask, 38 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); 39 40 static int vfio_pci_enable(struct vfio_pci_device *vdev) 41 { 42 struct pci_dev *pdev = vdev->pdev; 43 int ret; 44 u16 cmd; 45 u8 msix_pos; 46 47 ret = pci_enable_device(pdev); 48 if (ret) 49 return ret; 50 51 vdev->reset_works = (pci_reset_function(pdev) == 0); 52 pci_save_state(pdev); 53 vdev->pci_saved_state = pci_store_saved_state(pdev); 54 if (!vdev->pci_saved_state) 55 pr_debug("%s: Couldn't store %s saved state\n", 56 __func__, dev_name(&pdev->dev)); 57 58 ret = vfio_config_init(vdev); 59 if (ret) { 60 kfree(vdev->pci_saved_state); 61 vdev->pci_saved_state = NULL; 62 pci_disable_device(pdev); 63 return ret; 64 } 65 66 if (likely(!nointxmask)) 67 vdev->pci_2_3 = pci_intx_mask_supported(pdev); 68 69 pci_read_config_word(pdev, PCI_COMMAND, &cmd); 70 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 71 cmd &= ~PCI_COMMAND_INTX_DISABLE; 72 pci_write_config_word(pdev, PCI_COMMAND, cmd); 73 } 74 75 msix_pos = pdev->msix_cap; 76 if (msix_pos) { 77 u16 flags; 78 u32 table; 79 80 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 81 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 82 83 vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 84 vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 85 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 86 } else 87 vdev->msix_bar = 0xFF; 88 89 #ifdef CONFIG_VFIO_PCI_VGA 90 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 91 vdev->has_vga = true; 92 #endif 93 94 return 0; 95 } 96 97 static void vfio_pci_disable(struct vfio_pci_device *vdev) 98 { 99 struct pci_dev *pdev = vdev->pdev; 100 int bar; 101 102 pci_disable_device(pdev); 103 104 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 105 VFIO_IRQ_SET_ACTION_TRIGGER, 106 vdev->irq_type, 0, 0, NULL); 107 108 vdev->virq_disabled = false; 109 110 vfio_config_free(vdev); 111 112 for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { 113 if (!vdev->barmap[bar]) 114 continue; 115 pci_iounmap(pdev, vdev->barmap[bar]); 116 pci_release_selected_regions(pdev, 1 << bar); 117 vdev->barmap[bar] = NULL; 118 } 119 120 /* 121 * If we have saved state, restore it. If we can reset the device, 122 * even better. Resetting with current state seems better than 123 * nothing, but saving and restoring current state without reset 124 * is just busy work. 125 */ 126 if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 127 pr_info("%s: Couldn't reload %s saved state\n", 128 __func__, dev_name(&pdev->dev)); 129 130 if (!vdev->reset_works) 131 return; 132 133 pci_save_state(pdev); 134 } 135 136 /* 137 * Disable INTx and MSI, presumably to avoid spurious interrupts 138 * during reset. Stolen from pci_reset_function() 139 */ 140 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 141 142 /* 143 * Try to reset the device. The success of this is dependent on 144 * being able to lock the device, which is not always possible. 145 */ 146 if (vdev->reset_works) { 147 int ret = pci_try_reset_function(pdev); 148 if (ret) 149 pr_warn("%s: Failed to reset device %s (%d)\n", 150 __func__, dev_name(&pdev->dev), ret); 151 } 152 153 pci_restore_state(pdev); 154 } 155 156 static void vfio_pci_release(void *device_data) 157 { 158 struct vfio_pci_device *vdev = device_data; 159 160 if (atomic_dec_and_test(&vdev->refcnt)) 161 vfio_pci_disable(vdev); 162 163 module_put(THIS_MODULE); 164 } 165 166 static int vfio_pci_open(void *device_data) 167 { 168 struct vfio_pci_device *vdev = device_data; 169 170 if (!try_module_get(THIS_MODULE)) 171 return -ENODEV; 172 173 if (atomic_inc_return(&vdev->refcnt) == 1) { 174 int ret = vfio_pci_enable(vdev); 175 if (ret) { 176 module_put(THIS_MODULE); 177 return ret; 178 } 179 } 180 181 return 0; 182 } 183 184 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) 185 { 186 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 187 u8 pin; 188 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 189 if (pin) 190 return 1; 191 192 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 193 u8 pos; 194 u16 flags; 195 196 pos = vdev->pdev->msi_cap; 197 if (pos) { 198 pci_read_config_word(vdev->pdev, 199 pos + PCI_MSI_FLAGS, &flags); 200 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 201 } 202 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 203 u8 pos; 204 u16 flags; 205 206 pos = vdev->pdev->msix_cap; 207 if (pos) { 208 pci_read_config_word(vdev->pdev, 209 pos + PCI_MSIX_FLAGS, &flags); 210 211 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 212 } 213 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) 214 if (pci_is_pcie(vdev->pdev)) 215 return 1; 216 217 return 0; 218 } 219 220 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 221 { 222 (*(int *)data)++; 223 return 0; 224 } 225 226 struct vfio_pci_fill_info { 227 int max; 228 int cur; 229 struct vfio_pci_dependent_device *devices; 230 }; 231 232 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 233 { 234 struct vfio_pci_fill_info *fill = data; 235 struct iommu_group *iommu_group; 236 237 if (fill->cur == fill->max) 238 return -EAGAIN; /* Something changed, try again */ 239 240 iommu_group = iommu_group_get(&pdev->dev); 241 if (!iommu_group) 242 return -EPERM; /* Cannot reset non-isolated devices */ 243 244 fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 245 fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 246 fill->devices[fill->cur].bus = pdev->bus->number; 247 fill->devices[fill->cur].devfn = pdev->devfn; 248 fill->cur++; 249 iommu_group_put(iommu_group); 250 return 0; 251 } 252 253 struct vfio_pci_group_entry { 254 struct vfio_group *group; 255 int id; 256 }; 257 258 struct vfio_pci_group_info { 259 int count; 260 struct vfio_pci_group_entry *groups; 261 }; 262 263 static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) 264 { 265 struct vfio_pci_group_info *info = data; 266 struct iommu_group *group; 267 int id, i; 268 269 group = iommu_group_get(&pdev->dev); 270 if (!group) 271 return -EPERM; 272 273 id = iommu_group_id(group); 274 275 for (i = 0; i < info->count; i++) 276 if (info->groups[i].id == id) 277 break; 278 279 iommu_group_put(group); 280 281 return (i == info->count) ? -EINVAL : 0; 282 } 283 284 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 285 { 286 for (; pdev; pdev = pdev->bus->self) 287 if (pdev->bus == slot->bus) 288 return (pdev->slot == slot); 289 return false; 290 } 291 292 struct vfio_pci_walk_info { 293 int (*fn)(struct pci_dev *, void *data); 294 void *data; 295 struct pci_dev *pdev; 296 bool slot; 297 int ret; 298 }; 299 300 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 301 { 302 struct vfio_pci_walk_info *walk = data; 303 304 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 305 walk->ret = walk->fn(pdev, walk->data); 306 307 return walk->ret; 308 } 309 310 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 311 int (*fn)(struct pci_dev *, 312 void *data), void *data, 313 bool slot) 314 { 315 struct vfio_pci_walk_info walk = { 316 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 317 }; 318 319 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 320 321 return walk.ret; 322 } 323 324 static long vfio_pci_ioctl(void *device_data, 325 unsigned int cmd, unsigned long arg) 326 { 327 struct vfio_pci_device *vdev = device_data; 328 unsigned long minsz; 329 330 if (cmd == VFIO_DEVICE_GET_INFO) { 331 struct vfio_device_info info; 332 333 minsz = offsetofend(struct vfio_device_info, num_irqs); 334 335 if (copy_from_user(&info, (void __user *)arg, minsz)) 336 return -EFAULT; 337 338 if (info.argsz < minsz) 339 return -EINVAL; 340 341 info.flags = VFIO_DEVICE_FLAGS_PCI; 342 343 if (vdev->reset_works) 344 info.flags |= VFIO_DEVICE_FLAGS_RESET; 345 346 info.num_regions = VFIO_PCI_NUM_REGIONS; 347 info.num_irqs = VFIO_PCI_NUM_IRQS; 348 349 return copy_to_user((void __user *)arg, &info, minsz); 350 351 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 352 struct pci_dev *pdev = vdev->pdev; 353 struct vfio_region_info info; 354 355 minsz = offsetofend(struct vfio_region_info, offset); 356 357 if (copy_from_user(&info, (void __user *)arg, minsz)) 358 return -EFAULT; 359 360 if (info.argsz < minsz) 361 return -EINVAL; 362 363 switch (info.index) { 364 case VFIO_PCI_CONFIG_REGION_INDEX: 365 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 366 info.size = pdev->cfg_size; 367 info.flags = VFIO_REGION_INFO_FLAG_READ | 368 VFIO_REGION_INFO_FLAG_WRITE; 369 break; 370 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 371 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 372 info.size = pci_resource_len(pdev, info.index); 373 if (!info.size) { 374 info.flags = 0; 375 break; 376 } 377 378 info.flags = VFIO_REGION_INFO_FLAG_READ | 379 VFIO_REGION_INFO_FLAG_WRITE; 380 if (pci_resource_flags(pdev, info.index) & 381 IORESOURCE_MEM && info.size >= PAGE_SIZE) 382 info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 383 break; 384 case VFIO_PCI_ROM_REGION_INDEX: 385 { 386 void __iomem *io; 387 size_t size; 388 389 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 390 info.flags = 0; 391 392 /* Report the BAR size, not the ROM size */ 393 info.size = pci_resource_len(pdev, info.index); 394 if (!info.size) 395 break; 396 397 /* Is it really there? */ 398 io = pci_map_rom(pdev, &size); 399 if (!io || !size) { 400 info.size = 0; 401 break; 402 } 403 pci_unmap_rom(pdev, io); 404 405 info.flags = VFIO_REGION_INFO_FLAG_READ; 406 break; 407 } 408 case VFIO_PCI_VGA_REGION_INDEX: 409 if (!vdev->has_vga) 410 return -EINVAL; 411 412 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 413 info.size = 0xc0000; 414 info.flags = VFIO_REGION_INFO_FLAG_READ | 415 VFIO_REGION_INFO_FLAG_WRITE; 416 417 break; 418 default: 419 return -EINVAL; 420 } 421 422 return copy_to_user((void __user *)arg, &info, minsz); 423 424 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 425 struct vfio_irq_info info; 426 427 minsz = offsetofend(struct vfio_irq_info, count); 428 429 if (copy_from_user(&info, (void __user *)arg, minsz)) 430 return -EFAULT; 431 432 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 433 return -EINVAL; 434 435 switch (info.index) { 436 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 437 break; 438 case VFIO_PCI_ERR_IRQ_INDEX: 439 if (pci_is_pcie(vdev->pdev)) 440 break; 441 /* pass thru to return error */ 442 default: 443 return -EINVAL; 444 } 445 446 info.flags = VFIO_IRQ_INFO_EVENTFD; 447 448 info.count = vfio_pci_get_irq_count(vdev, info.index); 449 450 if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 451 info.flags |= (VFIO_IRQ_INFO_MASKABLE | 452 VFIO_IRQ_INFO_AUTOMASKED); 453 else 454 info.flags |= VFIO_IRQ_INFO_NORESIZE; 455 456 return copy_to_user((void __user *)arg, &info, minsz); 457 458 } else if (cmd == VFIO_DEVICE_SET_IRQS) { 459 struct vfio_irq_set hdr; 460 u8 *data = NULL; 461 int ret = 0; 462 463 minsz = offsetofend(struct vfio_irq_set, count); 464 465 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 466 return -EFAULT; 467 468 if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || 469 hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | 470 VFIO_IRQ_SET_ACTION_TYPE_MASK)) 471 return -EINVAL; 472 473 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { 474 size_t size; 475 int max = vfio_pci_get_irq_count(vdev, hdr.index); 476 477 if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) 478 size = sizeof(uint8_t); 479 else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) 480 size = sizeof(int32_t); 481 else 482 return -EINVAL; 483 484 if (hdr.argsz - minsz < hdr.count * size || 485 hdr.start >= max || hdr.start + hdr.count > max) 486 return -EINVAL; 487 488 data = memdup_user((void __user *)(arg + minsz), 489 hdr.count * size); 490 if (IS_ERR(data)) 491 return PTR_ERR(data); 492 } 493 494 mutex_lock(&vdev->igate); 495 496 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 497 hdr.start, hdr.count, data); 498 499 mutex_unlock(&vdev->igate); 500 kfree(data); 501 502 return ret; 503 504 } else if (cmd == VFIO_DEVICE_RESET) { 505 return vdev->reset_works ? 506 pci_try_reset_function(vdev->pdev) : -EINVAL; 507 508 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 509 struct vfio_pci_hot_reset_info hdr; 510 struct vfio_pci_fill_info fill = { 0 }; 511 struct vfio_pci_dependent_device *devices = NULL; 512 bool slot = false; 513 int ret = 0; 514 515 minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 516 517 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 518 return -EFAULT; 519 520 if (hdr.argsz < minsz) 521 return -EINVAL; 522 523 hdr.flags = 0; 524 525 /* Can we do a slot or bus reset or neither? */ 526 if (!pci_probe_reset_slot(vdev->pdev->slot)) 527 slot = true; 528 else if (pci_probe_reset_bus(vdev->pdev->bus)) 529 return -ENODEV; 530 531 /* How many devices are affected? */ 532 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 533 vfio_pci_count_devs, 534 &fill.max, slot); 535 if (ret) 536 return ret; 537 538 WARN_ON(!fill.max); /* Should always be at least one */ 539 540 /* 541 * If there's enough space, fill it now, otherwise return 542 * -ENOSPC and the number of devices affected. 543 */ 544 if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 545 ret = -ENOSPC; 546 hdr.count = fill.max; 547 goto reset_info_exit; 548 } 549 550 devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 551 if (!devices) 552 return -ENOMEM; 553 554 fill.devices = devices; 555 556 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 557 vfio_pci_fill_devs, 558 &fill, slot); 559 560 /* 561 * If a device was removed between counting and filling, 562 * we may come up short of fill.max. If a device was 563 * added, we'll have a return of -EAGAIN above. 564 */ 565 if (!ret) 566 hdr.count = fill.cur; 567 568 reset_info_exit: 569 if (copy_to_user((void __user *)arg, &hdr, minsz)) 570 ret = -EFAULT; 571 572 if (!ret) { 573 if (copy_to_user((void __user *)(arg + minsz), devices, 574 hdr.count * sizeof(*devices))) 575 ret = -EFAULT; 576 } 577 578 kfree(devices); 579 return ret; 580 581 } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 582 struct vfio_pci_hot_reset hdr; 583 int32_t *group_fds; 584 struct vfio_pci_group_entry *groups; 585 struct vfio_pci_group_info info; 586 bool slot = false; 587 int i, count = 0, ret = 0; 588 589 minsz = offsetofend(struct vfio_pci_hot_reset, count); 590 591 if (copy_from_user(&hdr, (void __user *)arg, minsz)) 592 return -EFAULT; 593 594 if (hdr.argsz < minsz || hdr.flags) 595 return -EINVAL; 596 597 /* Can we do a slot or bus reset or neither? */ 598 if (!pci_probe_reset_slot(vdev->pdev->slot)) 599 slot = true; 600 else if (pci_probe_reset_bus(vdev->pdev->bus)) 601 return -ENODEV; 602 603 /* 604 * We can't let userspace give us an arbitrarily large 605 * buffer to copy, so verify how many we think there 606 * could be. Note groups can have multiple devices so 607 * one group per device is the max. 608 */ 609 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 610 vfio_pci_count_devs, 611 &count, slot); 612 if (ret) 613 return ret; 614 615 /* Somewhere between 1 and count is OK */ 616 if (!hdr.count || hdr.count > count) 617 return -EINVAL; 618 619 group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 620 groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 621 if (!group_fds || !groups) { 622 kfree(group_fds); 623 kfree(groups); 624 return -ENOMEM; 625 } 626 627 if (copy_from_user(group_fds, (void __user *)(arg + minsz), 628 hdr.count * sizeof(*group_fds))) { 629 kfree(group_fds); 630 kfree(groups); 631 return -EFAULT; 632 } 633 634 /* 635 * For each group_fd, get the group through the vfio external 636 * user interface and store the group and iommu ID. This 637 * ensures the group is held across the reset. 638 */ 639 for (i = 0; i < hdr.count; i++) { 640 struct vfio_group *group; 641 struct fd f = fdget(group_fds[i]); 642 if (!f.file) { 643 ret = -EBADF; 644 break; 645 } 646 647 group = vfio_group_get_external_user(f.file); 648 fdput(f); 649 if (IS_ERR(group)) { 650 ret = PTR_ERR(group); 651 break; 652 } 653 654 groups[i].group = group; 655 groups[i].id = vfio_external_user_iommu_id(group); 656 } 657 658 kfree(group_fds); 659 660 /* release reference to groups on error */ 661 if (ret) 662 goto hot_reset_release; 663 664 info.count = hdr.count; 665 info.groups = groups; 666 667 /* 668 * Test whether all the affected devices are contained 669 * by the set of groups provided by the user. 670 */ 671 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 672 vfio_pci_validate_devs, 673 &info, slot); 674 if (!ret) 675 /* User has access, do the reset */ 676 ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : 677 pci_try_reset_bus(vdev->pdev->bus); 678 679 hot_reset_release: 680 for (i--; i >= 0; i--) 681 vfio_group_put_external_user(groups[i].group); 682 683 kfree(groups); 684 return ret; 685 } 686 687 return -ENOTTY; 688 } 689 690 static ssize_t vfio_pci_rw(void *device_data, char __user *buf, 691 size_t count, loff_t *ppos, bool iswrite) 692 { 693 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 694 struct vfio_pci_device *vdev = device_data; 695 696 if (index >= VFIO_PCI_NUM_REGIONS) 697 return -EINVAL; 698 699 switch (index) { 700 case VFIO_PCI_CONFIG_REGION_INDEX: 701 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 702 703 case VFIO_PCI_ROM_REGION_INDEX: 704 if (iswrite) 705 return -EINVAL; 706 return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 707 708 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 709 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 710 711 case VFIO_PCI_VGA_REGION_INDEX: 712 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 713 } 714 715 return -EINVAL; 716 } 717 718 static ssize_t vfio_pci_read(void *device_data, char __user *buf, 719 size_t count, loff_t *ppos) 720 { 721 if (!count) 722 return 0; 723 724 return vfio_pci_rw(device_data, buf, count, ppos, false); 725 } 726 727 static ssize_t vfio_pci_write(void *device_data, const char __user *buf, 728 size_t count, loff_t *ppos) 729 { 730 if (!count) 731 return 0; 732 733 return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); 734 } 735 736 static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) 737 { 738 struct vfio_pci_device *vdev = device_data; 739 struct pci_dev *pdev = vdev->pdev; 740 unsigned int index; 741 u64 phys_len, req_len, pgoff, req_start; 742 int ret; 743 744 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 745 746 if (vma->vm_end < vma->vm_start) 747 return -EINVAL; 748 if ((vma->vm_flags & VM_SHARED) == 0) 749 return -EINVAL; 750 if (index >= VFIO_PCI_ROM_REGION_INDEX) 751 return -EINVAL; 752 if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) 753 return -EINVAL; 754 755 phys_len = pci_resource_len(pdev, index); 756 req_len = vma->vm_end - vma->vm_start; 757 pgoff = vma->vm_pgoff & 758 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 759 req_start = pgoff << PAGE_SHIFT; 760 761 if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) 762 return -EINVAL; 763 764 if (index == vdev->msix_bar) { 765 /* 766 * Disallow mmaps overlapping the MSI-X table; users don't 767 * get to touch this directly. We could find somewhere 768 * else to map the overlap, but page granularity is only 769 * a recommendation, not a requirement, so the user needs 770 * to know which bits are real. Requiring them to mmap 771 * around the table makes that clear. 772 */ 773 774 /* If neither entirely above nor below, then it overlaps */ 775 if (!(req_start >= vdev->msix_offset + vdev->msix_size || 776 req_start + req_len <= vdev->msix_offset)) 777 return -EINVAL; 778 } 779 780 /* 781 * Even though we don't make use of the barmap for the mmap, 782 * we need to request the region and the barmap tracks that. 783 */ 784 if (!vdev->barmap[index]) { 785 ret = pci_request_selected_regions(pdev, 786 1 << index, "vfio-pci"); 787 if (ret) 788 return ret; 789 790 vdev->barmap[index] = pci_iomap(pdev, index, 0); 791 } 792 793 vma->vm_private_data = vdev; 794 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 795 vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 796 797 return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 798 req_len, vma->vm_page_prot); 799 } 800 801 static const struct vfio_device_ops vfio_pci_ops = { 802 .name = "vfio-pci", 803 .open = vfio_pci_open, 804 .release = vfio_pci_release, 805 .ioctl = vfio_pci_ioctl, 806 .read = vfio_pci_read, 807 .write = vfio_pci_write, 808 .mmap = vfio_pci_mmap, 809 }; 810 811 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 812 { 813 u8 type; 814 struct vfio_pci_device *vdev; 815 struct iommu_group *group; 816 int ret; 817 818 pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type); 819 if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) 820 return -EINVAL; 821 822 group = iommu_group_get(&pdev->dev); 823 if (!group) 824 return -EINVAL; 825 826 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); 827 if (!vdev) { 828 iommu_group_put(group); 829 return -ENOMEM; 830 } 831 832 vdev->pdev = pdev; 833 vdev->irq_type = VFIO_PCI_NUM_IRQS; 834 mutex_init(&vdev->igate); 835 spin_lock_init(&vdev->irqlock); 836 atomic_set(&vdev->refcnt, 0); 837 838 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); 839 if (ret) { 840 iommu_group_put(group); 841 kfree(vdev); 842 } 843 844 return ret; 845 } 846 847 static void vfio_pci_remove(struct pci_dev *pdev) 848 { 849 struct vfio_pci_device *vdev; 850 851 vdev = vfio_del_group_dev(&pdev->dev); 852 if (!vdev) 853 return; 854 855 iommu_group_put(pdev->dev.iommu_group); 856 kfree(vdev); 857 } 858 859 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, 860 pci_channel_state_t state) 861 { 862 struct vfio_pci_device *vdev; 863 struct vfio_device *device; 864 865 device = vfio_device_get_from_dev(&pdev->dev); 866 if (device == NULL) 867 return PCI_ERS_RESULT_DISCONNECT; 868 869 vdev = vfio_device_data(device); 870 if (vdev == NULL) { 871 vfio_device_put(device); 872 return PCI_ERS_RESULT_DISCONNECT; 873 } 874 875 mutex_lock(&vdev->igate); 876 877 if (vdev->err_trigger) 878 eventfd_signal(vdev->err_trigger, 1); 879 880 mutex_unlock(&vdev->igate); 881 882 vfio_device_put(device); 883 884 return PCI_ERS_RESULT_CAN_RECOVER; 885 } 886 887 static struct pci_error_handlers vfio_err_handlers = { 888 .error_detected = vfio_pci_aer_err_detected, 889 }; 890 891 static struct pci_driver vfio_pci_driver = { 892 .name = "vfio-pci", 893 .id_table = NULL, /* only dynamic ids */ 894 .probe = vfio_pci_probe, 895 .remove = vfio_pci_remove, 896 .err_handler = &vfio_err_handlers, 897 }; 898 899 static void __exit vfio_pci_cleanup(void) 900 { 901 pci_unregister_driver(&vfio_pci_driver); 902 vfio_pci_virqfd_exit(); 903 vfio_pci_uninit_perm_bits(); 904 } 905 906 static int __init vfio_pci_init(void) 907 { 908 int ret; 909 910 /* Allocate shared config space permision data used by all devices */ 911 ret = vfio_pci_init_perm_bits(); 912 if (ret) 913 return ret; 914 915 /* Start the virqfd cleanup handler */ 916 ret = vfio_pci_virqfd_init(); 917 if (ret) 918 goto out_virqfd; 919 920 /* Register and scan for devices */ 921 ret = pci_register_driver(&vfio_pci_driver); 922 if (ret) 923 goto out_driver; 924 925 return 0; 926 927 out_driver: 928 vfio_pci_virqfd_exit(); 929 out_virqfd: 930 vfio_pci_uninit_perm_bits(); 931 return ret; 932 } 933 934 module_init(vfio_pci_init); 935 module_exit(vfio_pci_cleanup); 936 937 MODULE_VERSION(DRIVER_VERSION); 938 MODULE_LICENSE("GPL v2"); 939 MODULE_AUTHOR(DRIVER_AUTHOR); 940 MODULE_DESCRIPTION(DRIVER_DESC); 941