1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) Microsoft Corporation. 4 * 5 * Author: 6 * Jake Oshins <jakeo@microsoft.com> 7 * 8 * This driver acts as a paravirtual front-end for PCI Express root buses. 9 * When a PCI Express function (either an entire device or an SR-IOV 10 * Virtual Function) is being passed through to the VM, this driver exposes 11 * a new bus to the guest VM. This is modeled as a root PCI bus because 12 * no bridges are being exposed to the VM. In fact, with a "Generation 2" 13 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM 14 * until a device as been exposed using this driver. 15 * 16 * Each root PCI bus has its own PCI domain, which is called "Segment" in 17 * the PCI Firmware Specifications. Thus while each device passed through 18 * to the VM using this front-end will appear at "device 0", the domain will 19 * be unique. Typically, each bus will have one PCI function on it, though 20 * this driver does support more than one. 21 * 22 * In order to map the interrupts from the device through to the guest VM, 23 * this driver also implements an IRQ Domain, which handles interrupts (either 24 * MSI or MSI-X) associated with the functions on the bus. As interrupts are 25 * set up, torn down, or reaffined, this driver communicates with the 26 * underlying hypervisor to adjust the mappings in the I/O MMU so that each 27 * interrupt will be delivered to the correct virtual processor at the right 28 * vector. This driver does not support level-triggered (line-based) 29 * interrupts, and will report that the Interrupt Line register in the 30 * function's configuration space is zero. 31 * 32 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V 33 * facilities. For instance, the configuration space of a function exposed 34 * by Hyper-V is mapped into a single page of memory space, and the 35 * read and write handlers for config space must be aware of this mechanism. 36 * Similarly, device setup and teardown involves messages sent to and from 37 * the PCI back-end driver in Hyper-V. 38 */ 39 40 #include <linux/kernel.h> 41 #include <linux/module.h> 42 #include <linux/pci.h> 43 #include <linux/delay.h> 44 #include <linux/semaphore.h> 45 #include <linux/irqdomain.h> 46 #include <asm/irqdomain.h> 47 #include <asm/apic.h> 48 #include <linux/irq.h> 49 #include <linux/msi.h> 50 #include <linux/hyperv.h> 51 #include <linux/refcount.h> 52 #include <asm/mshyperv.h> 53 54 /* 55 * Protocol versions. The low word is the minor version, the high word the 56 * major version. 57 */ 58 59 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor))) 60 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16) 61 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff) 62 63 enum pci_protocol_version_t { 64 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */ 65 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */ 66 }; 67 68 #define CPU_AFFINITY_ALL -1ULL 69 70 /* 71 * Supported protocol versions in the order of probing - highest go 72 * first. 73 */ 74 static enum pci_protocol_version_t pci_protocol_versions[] = { 75 PCI_PROTOCOL_VERSION_1_2, 76 PCI_PROTOCOL_VERSION_1_1, 77 }; 78 79 #define PCI_CONFIG_MMIO_LENGTH 0x2000 80 #define CFG_PAGE_OFFSET 0x1000 81 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 82 83 #define MAX_SUPPORTED_MSI_MESSAGES 0x400 84 85 #define STATUS_REVISION_MISMATCH 0xC0000059 86 87 /* space for 32bit serial number as string */ 88 #define SLOT_NAME_SIZE 11 89 90 /* 91 * Message Types 92 */ 93 94 enum pci_message_type { 95 /* 96 * Version 1.1 97 */ 98 PCI_MESSAGE_BASE = 0x42490000, 99 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 100 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 101 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 102 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 103 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 104 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 105 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 106 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 107 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 108 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 109 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 110 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 111 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 112 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 113 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 114 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 115 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 116 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 117 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 118 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 119 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, 120 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, 121 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ 122 PCI_MESSAGE_MAXIMUM 123 }; 124 125 /* 126 * Structures defining the virtual PCI Express protocol. 127 */ 128 129 union pci_version { 130 struct { 131 u16 minor_version; 132 u16 major_version; 133 } parts; 134 u32 version; 135 } __packed; 136 137 /* 138 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 139 * which is all this driver does. This representation is the one used in 140 * Windows, which is what is expected when sending this back and forth with 141 * the Hyper-V parent partition. 142 */ 143 union win_slot_encoding { 144 struct { 145 u32 dev:5; 146 u32 func:3; 147 u32 reserved:24; 148 } bits; 149 u32 slot; 150 } __packed; 151 152 /* 153 * Pretty much as defined in the PCI Specifications. 154 */ 155 struct pci_function_description { 156 u16 v_id; /* vendor ID */ 157 u16 d_id; /* device ID */ 158 u8 rev; 159 u8 prog_intf; 160 u8 subclass; 161 u8 base_class; 162 u32 subsystem_id; 163 union win_slot_encoding win_slot; 164 u32 ser; /* serial number */ 165 } __packed; 166 167 /** 168 * struct hv_msi_desc 169 * @vector: IDT entry 170 * @delivery_mode: As defined in Intel's Programmer's 171 * Reference Manual, Volume 3, Chapter 8. 172 * @vector_count: Number of contiguous entries in the 173 * Interrupt Descriptor Table that are 174 * occupied by this Message-Signaled 175 * Interrupt. For "MSI", as first defined 176 * in PCI 2.2, this can be between 1 and 177 * 32. For "MSI-X," as first defined in PCI 178 * 3.0, this must be 1, as each MSI-X table 179 * entry would have its own descriptor. 180 * @reserved: Empty space 181 * @cpu_mask: All the target virtual processors. 182 */ 183 struct hv_msi_desc { 184 u8 vector; 185 u8 delivery_mode; 186 u16 vector_count; 187 u32 reserved; 188 u64 cpu_mask; 189 } __packed; 190 191 /** 192 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc 193 * @vector: IDT entry 194 * @delivery_mode: As defined in Intel's Programmer's 195 * Reference Manual, Volume 3, Chapter 8. 196 * @vector_count: Number of contiguous entries in the 197 * Interrupt Descriptor Table that are 198 * occupied by this Message-Signaled 199 * Interrupt. For "MSI", as first defined 200 * in PCI 2.2, this can be between 1 and 201 * 32. For "MSI-X," as first defined in PCI 202 * 3.0, this must be 1, as each MSI-X table 203 * entry would have its own descriptor. 204 * @processor_count: number of bits enabled in array. 205 * @processor_array: All the target virtual processors. 206 */ 207 struct hv_msi_desc2 { 208 u8 vector; 209 u8 delivery_mode; 210 u16 vector_count; 211 u16 processor_count; 212 u16 processor_array[32]; 213 } __packed; 214 215 /** 216 * struct tran_int_desc 217 * @reserved: unused, padding 218 * @vector_count: same as in hv_msi_desc 219 * @data: This is the "data payload" value that is 220 * written by the device when it generates 221 * a message-signaled interrupt, either MSI 222 * or MSI-X. 223 * @address: This is the address to which the data 224 * payload is written on interrupt 225 * generation. 226 */ 227 struct tran_int_desc { 228 u16 reserved; 229 u16 vector_count; 230 u32 data; 231 u64 address; 232 } __packed; 233 234 /* 235 * A generic message format for virtual PCI. 236 * Specific message formats are defined later in the file. 237 */ 238 239 struct pci_message { 240 u32 type; 241 } __packed; 242 243 struct pci_child_message { 244 struct pci_message message_type; 245 union win_slot_encoding wslot; 246 } __packed; 247 248 struct pci_incoming_message { 249 struct vmpacket_descriptor hdr; 250 struct pci_message message_type; 251 } __packed; 252 253 struct pci_response { 254 struct vmpacket_descriptor hdr; 255 s32 status; /* negative values are failures */ 256 } __packed; 257 258 struct pci_packet { 259 void (*completion_func)(void *context, struct pci_response *resp, 260 int resp_packet_size); 261 void *compl_ctxt; 262 263 struct pci_message message[0]; 264 }; 265 266 /* 267 * Specific message types supporting the PCI protocol. 268 */ 269 270 /* 271 * Version negotiation message. Sent from the guest to the host. 272 * The guest is free to try different versions until the host 273 * accepts the version. 274 * 275 * pci_version: The protocol version requested. 276 * is_last_attempt: If TRUE, this is the last version guest will request. 277 * reservedz: Reserved field, set to zero. 278 */ 279 280 struct pci_version_request { 281 struct pci_message message_type; 282 u32 protocol_version; 283 } __packed; 284 285 /* 286 * Bus D0 Entry. This is sent from the guest to the host when the virtual 287 * bus (PCI Express port) is ready for action. 288 */ 289 290 struct pci_bus_d0_entry { 291 struct pci_message message_type; 292 u32 reserved; 293 u64 mmio_base; 294 } __packed; 295 296 struct pci_bus_relations { 297 struct pci_incoming_message incoming; 298 u32 device_count; 299 struct pci_function_description func[0]; 300 } __packed; 301 302 struct pci_q_res_req_response { 303 struct vmpacket_descriptor hdr; 304 s32 status; /* negative values are failures */ 305 u32 probed_bar[PCI_STD_NUM_BARS]; 306 } __packed; 307 308 struct pci_set_power { 309 struct pci_message message_type; 310 union win_slot_encoding wslot; 311 u32 power_state; /* In Windows terms */ 312 u32 reserved; 313 } __packed; 314 315 struct pci_set_power_response { 316 struct vmpacket_descriptor hdr; 317 s32 status; /* negative values are failures */ 318 union win_slot_encoding wslot; 319 u32 resultant_state; /* In Windows terms */ 320 u32 reserved; 321 } __packed; 322 323 struct pci_resources_assigned { 324 struct pci_message message_type; 325 union win_slot_encoding wslot; 326 u8 memory_range[0x14][6]; /* not used here */ 327 u32 msi_descriptors; 328 u32 reserved[4]; 329 } __packed; 330 331 struct pci_resources_assigned2 { 332 struct pci_message message_type; 333 union win_slot_encoding wslot; 334 u8 memory_range[0x14][6]; /* not used here */ 335 u32 msi_descriptor_count; 336 u8 reserved[70]; 337 } __packed; 338 339 struct pci_create_interrupt { 340 struct pci_message message_type; 341 union win_slot_encoding wslot; 342 struct hv_msi_desc int_desc; 343 } __packed; 344 345 struct pci_create_int_response { 346 struct pci_response response; 347 u32 reserved; 348 struct tran_int_desc int_desc; 349 } __packed; 350 351 struct pci_create_interrupt2 { 352 struct pci_message message_type; 353 union win_slot_encoding wslot; 354 struct hv_msi_desc2 int_desc; 355 } __packed; 356 357 struct pci_delete_interrupt { 358 struct pci_message message_type; 359 union win_slot_encoding wslot; 360 struct tran_int_desc int_desc; 361 } __packed; 362 363 /* 364 * Note: the VM must pass a valid block id, wslot and bytes_requested. 365 */ 366 struct pci_read_block { 367 struct pci_message message_type; 368 u32 block_id; 369 union win_slot_encoding wslot; 370 u32 bytes_requested; 371 } __packed; 372 373 struct pci_read_block_response { 374 struct vmpacket_descriptor hdr; 375 u32 status; 376 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 377 } __packed; 378 379 /* 380 * Note: the VM must pass a valid block id, wslot and byte_count. 381 */ 382 struct pci_write_block { 383 struct pci_message message_type; 384 u32 block_id; 385 union win_slot_encoding wslot; 386 u32 byte_count; 387 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 388 } __packed; 389 390 struct pci_dev_inval_block { 391 struct pci_incoming_message incoming; 392 union win_slot_encoding wslot; 393 u64 block_mask; 394 } __packed; 395 396 struct pci_dev_incoming { 397 struct pci_incoming_message incoming; 398 union win_slot_encoding wslot; 399 } __packed; 400 401 struct pci_eject_response { 402 struct pci_message message_type; 403 union win_slot_encoding wslot; 404 u32 status; 405 } __packed; 406 407 static int pci_ring_size = (4 * PAGE_SIZE); 408 409 /* 410 * Definitions or interrupt steering hypercall. 411 */ 412 #define HV_PARTITION_ID_SELF ((u64)-1) 413 #define HVCALL_RETARGET_INTERRUPT 0x7e 414 415 struct hv_interrupt_entry { 416 u32 source; /* 1 for MSI(-X) */ 417 u32 reserved1; 418 u32 address; 419 u32 data; 420 }; 421 422 /* 423 * flags for hv_device_interrupt_target.flags 424 */ 425 #define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 426 #define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 427 428 struct hv_device_interrupt_target { 429 u32 vector; 430 u32 flags; 431 union { 432 u64 vp_mask; 433 struct hv_vpset vp_set; 434 }; 435 }; 436 437 struct retarget_msi_interrupt { 438 u64 partition_id; /* use "self" */ 439 u64 device_id; 440 struct hv_interrupt_entry int_entry; 441 u64 reserved2; 442 struct hv_device_interrupt_target int_target; 443 } __packed __aligned(8); 444 445 /* 446 * Driver specific state. 447 */ 448 449 enum hv_pcibus_state { 450 hv_pcibus_init = 0, 451 hv_pcibus_probed, 452 hv_pcibus_installed, 453 hv_pcibus_removing, 454 hv_pcibus_removed, 455 hv_pcibus_maximum 456 }; 457 458 struct hv_pcibus_device { 459 struct pci_sysdata sysdata; 460 /* Protocol version negotiated with the host */ 461 enum pci_protocol_version_t protocol_version; 462 enum hv_pcibus_state state; 463 refcount_t remove_lock; 464 struct hv_device *hdev; 465 resource_size_t low_mmio_space; 466 resource_size_t high_mmio_space; 467 struct resource *mem_config; 468 struct resource *low_mmio_res; 469 struct resource *high_mmio_res; 470 struct completion *survey_event; 471 struct completion remove_event; 472 struct pci_bus *pci_bus; 473 spinlock_t config_lock; /* Avoid two threads writing index page */ 474 spinlock_t device_list_lock; /* Protect lists below */ 475 void __iomem *cfg_addr; 476 477 struct list_head resources_for_children; 478 479 struct list_head children; 480 struct list_head dr_list; 481 482 struct msi_domain_info msi_info; 483 struct msi_controller msi_chip; 484 struct irq_domain *irq_domain; 485 486 spinlock_t retarget_msi_interrupt_lock; 487 488 struct workqueue_struct *wq; 489 490 /* hypercall arg, must not cross page boundary */ 491 struct retarget_msi_interrupt retarget_msi_interrupt_params; 492 493 /* 494 * Don't put anything here: retarget_msi_interrupt_params must be last 495 */ 496 }; 497 498 /* 499 * Tracks "Device Relations" messages from the host, which must be both 500 * processed in order and deferred so that they don't run in the context 501 * of the incoming packet callback. 502 */ 503 struct hv_dr_work { 504 struct work_struct wrk; 505 struct hv_pcibus_device *bus; 506 }; 507 508 struct hv_dr_state { 509 struct list_head list_entry; 510 u32 device_count; 511 struct pci_function_description func[0]; 512 }; 513 514 enum hv_pcichild_state { 515 hv_pcichild_init = 0, 516 hv_pcichild_requirements, 517 hv_pcichild_resourced, 518 hv_pcichild_ejecting, 519 hv_pcichild_maximum 520 }; 521 522 struct hv_pci_dev { 523 /* List protected by pci_rescan_remove_lock */ 524 struct list_head list_entry; 525 refcount_t refs; 526 enum hv_pcichild_state state; 527 struct pci_slot *pci_slot; 528 struct pci_function_description desc; 529 bool reported_missing; 530 struct hv_pcibus_device *hbus; 531 struct work_struct wrk; 532 533 void (*block_invalidate)(void *context, u64 block_mask); 534 void *invalidate_context; 535 536 /* 537 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then 538 * read it back, for each of the BAR offsets within config space. 539 */ 540 u32 probed_bar[PCI_STD_NUM_BARS]; 541 }; 542 543 struct hv_pci_compl { 544 struct completion host_event; 545 s32 completion_status; 546 }; 547 548 static void hv_pci_onchannelcallback(void *context); 549 550 /** 551 * hv_pci_generic_compl() - Invoked for a completion packet 552 * @context: Set up by the sender of the packet. 553 * @resp: The response packet 554 * @resp_packet_size: Size in bytes of the packet 555 * 556 * This function is used to trigger an event and report status 557 * for any message for which the completion packet contains a 558 * status and nothing else. 559 */ 560 static void hv_pci_generic_compl(void *context, struct pci_response *resp, 561 int resp_packet_size) 562 { 563 struct hv_pci_compl *comp_pkt = context; 564 565 if (resp_packet_size >= offsetofend(struct pci_response, status)) 566 comp_pkt->completion_status = resp->status; 567 else 568 comp_pkt->completion_status = -1; 569 570 complete(&comp_pkt->host_event); 571 } 572 573 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 574 u32 wslot); 575 576 static void get_pcichild(struct hv_pci_dev *hpdev) 577 { 578 refcount_inc(&hpdev->refs); 579 } 580 581 static void put_pcichild(struct hv_pci_dev *hpdev) 582 { 583 if (refcount_dec_and_test(&hpdev->refs)) 584 kfree(hpdev); 585 } 586 587 static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus); 588 static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus); 589 590 /* 591 * There is no good way to get notified from vmbus_onoffer_rescind(), 592 * so let's use polling here, since this is not a hot path. 593 */ 594 static int wait_for_response(struct hv_device *hdev, 595 struct completion *comp) 596 { 597 while (true) { 598 if (hdev->channel->rescind) { 599 dev_warn_once(&hdev->device, "The device is gone.\n"); 600 return -ENODEV; 601 } 602 603 if (wait_for_completion_timeout(comp, HZ / 10)) 604 break; 605 } 606 607 return 0; 608 } 609 610 /** 611 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 612 * @devfn: The Linux representation of PCI slot 613 * 614 * Windows uses a slightly different representation of PCI slot. 615 * 616 * Return: The Windows representation 617 */ 618 static u32 devfn_to_wslot(int devfn) 619 { 620 union win_slot_encoding wslot; 621 622 wslot.slot = 0; 623 wslot.bits.dev = PCI_SLOT(devfn); 624 wslot.bits.func = PCI_FUNC(devfn); 625 626 return wslot.slot; 627 } 628 629 /** 630 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 631 * @wslot: The Windows representation of PCI slot 632 * 633 * Windows uses a slightly different representation of PCI slot. 634 * 635 * Return: The Linux representation 636 */ 637 static int wslot_to_devfn(u32 wslot) 638 { 639 union win_slot_encoding slot_no; 640 641 slot_no.slot = wslot; 642 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func); 643 } 644 645 /* 646 * PCI Configuration Space for these root PCI buses is implemented as a pair 647 * of pages in memory-mapped I/O space. Writing to the first page chooses 648 * the PCI function being written or read. Once the first page has been 649 * written to, the following page maps in the entire configuration space of 650 * the function. 651 */ 652 653 /** 654 * _hv_pcifront_read_config() - Internal PCI config read 655 * @hpdev: The PCI driver's representation of the device 656 * @where: Offset within config space 657 * @size: Size of the transfer 658 * @val: Pointer to the buffer receiving the data 659 */ 660 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 661 int size, u32 *val) 662 { 663 unsigned long flags; 664 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 665 666 /* 667 * If the attempt is to read the IDs or the ROM BAR, simulate that. 668 */ 669 if (where + size <= PCI_COMMAND) { 670 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 671 } else if (where >= PCI_CLASS_REVISION && where + size <= 672 PCI_CACHE_LINE_SIZE) { 673 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 674 PCI_CLASS_REVISION, size); 675 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 676 PCI_ROM_ADDRESS) { 677 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 678 PCI_SUBSYSTEM_VENDOR_ID, size); 679 } else if (where >= PCI_ROM_ADDRESS && where + size <= 680 PCI_CAPABILITY_LIST) { 681 /* ROM BARs are unimplemented */ 682 *val = 0; 683 } else if (where >= PCI_INTERRUPT_LINE && where + size <= 684 PCI_INTERRUPT_PIN) { 685 /* 686 * Interrupt Line and Interrupt PIN are hard-wired to zero 687 * because this front-end only supports message-signaled 688 * interrupts. 689 */ 690 *val = 0; 691 } else if (where + size <= CFG_PAGE_SIZE) { 692 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 693 /* Choose the function to be read. (See comment above) */ 694 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 695 /* Make sure the function was chosen before we start reading. */ 696 mb(); 697 /* Read from that function's config space. */ 698 switch (size) { 699 case 1: 700 *val = readb(addr); 701 break; 702 case 2: 703 *val = readw(addr); 704 break; 705 default: 706 *val = readl(addr); 707 break; 708 } 709 /* 710 * Make sure the read was done before we release the spinlock 711 * allowing consecutive reads/writes. 712 */ 713 mb(); 714 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 715 } else { 716 dev_err(&hpdev->hbus->hdev->device, 717 "Attempt to read beyond a function's config space.\n"); 718 } 719 } 720 721 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) 722 { 723 u16 ret; 724 unsigned long flags; 725 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + 726 PCI_VENDOR_ID; 727 728 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 729 730 /* Choose the function to be read. (See comment above) */ 731 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 732 /* Make sure the function was chosen before we start reading. */ 733 mb(); 734 /* Read from that function's config space. */ 735 ret = readw(addr); 736 /* 737 * mb() is not required here, because the spin_unlock_irqrestore() 738 * is a barrier. 739 */ 740 741 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 742 743 return ret; 744 } 745 746 /** 747 * _hv_pcifront_write_config() - Internal PCI config write 748 * @hpdev: The PCI driver's representation of the device 749 * @where: Offset within config space 750 * @size: Size of the transfer 751 * @val: The data being transferred 752 */ 753 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 754 int size, u32 val) 755 { 756 unsigned long flags; 757 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 758 759 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 760 where + size <= PCI_CAPABILITY_LIST) { 761 /* SSIDs and ROM BARs are read-only */ 762 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 763 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 764 /* Choose the function to be written. (See comment above) */ 765 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 766 /* Make sure the function was chosen before we start writing. */ 767 wmb(); 768 /* Write to that function's config space. */ 769 switch (size) { 770 case 1: 771 writeb(val, addr); 772 break; 773 case 2: 774 writew(val, addr); 775 break; 776 default: 777 writel(val, addr); 778 break; 779 } 780 /* 781 * Make sure the write was done before we release the spinlock 782 * allowing consecutive reads/writes. 783 */ 784 mb(); 785 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 786 } else { 787 dev_err(&hpdev->hbus->hdev->device, 788 "Attempt to write beyond a function's config space.\n"); 789 } 790 } 791 792 /** 793 * hv_pcifront_read_config() - Read configuration space 794 * @bus: PCI Bus structure 795 * @devfn: Device/function 796 * @where: Offset from base 797 * @size: Byte/word/dword 798 * @val: Value to be read 799 * 800 * Return: PCIBIOS_SUCCESSFUL on success 801 * PCIBIOS_DEVICE_NOT_FOUND on failure 802 */ 803 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn, 804 int where, int size, u32 *val) 805 { 806 struct hv_pcibus_device *hbus = 807 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 808 struct hv_pci_dev *hpdev; 809 810 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 811 if (!hpdev) 812 return PCIBIOS_DEVICE_NOT_FOUND; 813 814 _hv_pcifront_read_config(hpdev, where, size, val); 815 816 put_pcichild(hpdev); 817 return PCIBIOS_SUCCESSFUL; 818 } 819 820 /** 821 * hv_pcifront_write_config() - Write configuration space 822 * @bus: PCI Bus structure 823 * @devfn: Device/function 824 * @where: Offset from base 825 * @size: Byte/word/dword 826 * @val: Value to be written to device 827 * 828 * Return: PCIBIOS_SUCCESSFUL on success 829 * PCIBIOS_DEVICE_NOT_FOUND on failure 830 */ 831 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn, 832 int where, int size, u32 val) 833 { 834 struct hv_pcibus_device *hbus = 835 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 836 struct hv_pci_dev *hpdev; 837 838 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 839 if (!hpdev) 840 return PCIBIOS_DEVICE_NOT_FOUND; 841 842 _hv_pcifront_write_config(hpdev, where, size, val); 843 844 put_pcichild(hpdev); 845 return PCIBIOS_SUCCESSFUL; 846 } 847 848 /* PCIe operations */ 849 static struct pci_ops hv_pcifront_ops = { 850 .read = hv_pcifront_read_config, 851 .write = hv_pcifront_write_config, 852 }; 853 854 /* 855 * Paravirtual backchannel 856 * 857 * Hyper-V SR-IOV provides a backchannel mechanism in software for 858 * communication between a VF driver and a PF driver. These 859 * "configuration blocks" are similar in concept to PCI configuration space, 860 * but instead of doing reads and writes in 32-bit chunks through a very slow 861 * path, packets of up to 128 bytes can be sent or received asynchronously. 862 * 863 * Nearly every SR-IOV device contains just such a communications channel in 864 * hardware, so using this one in software is usually optional. Using the 865 * software channel, however, allows driver implementers to leverage software 866 * tools that fuzz the communications channel looking for vulnerabilities. 867 * 868 * The usage model for these packets puts the responsibility for reading or 869 * writing on the VF driver. The VF driver sends a read or a write packet, 870 * indicating which "block" is being referred to by number. 871 * 872 * If the PF driver wishes to initiate communication, it can "invalidate" one or 873 * more of the first 64 blocks. This invalidation is delivered via a callback 874 * supplied by the VF driver by this driver. 875 * 876 * No protocol is implied, except that supplied by the PF and VF drivers. 877 */ 878 879 struct hv_read_config_compl { 880 struct hv_pci_compl comp_pkt; 881 void *buf; 882 unsigned int len; 883 unsigned int bytes_returned; 884 }; 885 886 /** 887 * hv_pci_read_config_compl() - Invoked when a response packet 888 * for a read config block operation arrives. 889 * @context: Identifies the read config operation 890 * @resp: The response packet itself 891 * @resp_packet_size: Size in bytes of the response packet 892 */ 893 static void hv_pci_read_config_compl(void *context, struct pci_response *resp, 894 int resp_packet_size) 895 { 896 struct hv_read_config_compl *comp = context; 897 struct pci_read_block_response *read_resp = 898 (struct pci_read_block_response *)resp; 899 unsigned int data_len, hdr_len; 900 901 hdr_len = offsetof(struct pci_read_block_response, bytes); 902 if (resp_packet_size < hdr_len) { 903 comp->comp_pkt.completion_status = -1; 904 goto out; 905 } 906 907 data_len = resp_packet_size - hdr_len; 908 if (data_len > 0 && read_resp->status == 0) { 909 comp->bytes_returned = min(comp->len, data_len); 910 memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); 911 } else { 912 comp->bytes_returned = 0; 913 } 914 915 comp->comp_pkt.completion_status = read_resp->status; 916 out: 917 complete(&comp->comp_pkt.host_event); 918 } 919 920 /** 921 * hv_read_config_block() - Sends a read config block request to 922 * the back-end driver running in the Hyper-V parent partition. 923 * @pdev: The PCI driver's representation for this device. 924 * @buf: Buffer into which the config block will be copied. 925 * @len: Size in bytes of buf. 926 * @block_id: Identifies the config block which has been requested. 927 * @bytes_returned: Size which came back from the back-end driver. 928 * 929 * Return: 0 on success, -errno on failure 930 */ 931 int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len, 932 unsigned int block_id, unsigned int *bytes_returned) 933 { 934 struct hv_pcibus_device *hbus = 935 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 936 sysdata); 937 struct { 938 struct pci_packet pkt; 939 char buf[sizeof(struct pci_read_block)]; 940 } pkt; 941 struct hv_read_config_compl comp_pkt; 942 struct pci_read_block *read_blk; 943 int ret; 944 945 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 946 return -EINVAL; 947 948 init_completion(&comp_pkt.comp_pkt.host_event); 949 comp_pkt.buf = buf; 950 comp_pkt.len = len; 951 952 memset(&pkt, 0, sizeof(pkt)); 953 pkt.pkt.completion_func = hv_pci_read_config_compl; 954 pkt.pkt.compl_ctxt = &comp_pkt; 955 read_blk = (struct pci_read_block *)&pkt.pkt.message; 956 read_blk->message_type.type = PCI_READ_BLOCK; 957 read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 958 read_blk->block_id = block_id; 959 read_blk->bytes_requested = len; 960 961 ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, 962 sizeof(*read_blk), (unsigned long)&pkt.pkt, 963 VM_PKT_DATA_INBAND, 964 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 965 if (ret) 966 return ret; 967 968 ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); 969 if (ret) 970 return ret; 971 972 if (comp_pkt.comp_pkt.completion_status != 0 || 973 comp_pkt.bytes_returned == 0) { 974 dev_err(&hbus->hdev->device, 975 "Read Config Block failed: 0x%x, bytes_returned=%d\n", 976 comp_pkt.comp_pkt.completion_status, 977 comp_pkt.bytes_returned); 978 return -EIO; 979 } 980 981 *bytes_returned = comp_pkt.bytes_returned; 982 return 0; 983 } 984 985 /** 986 * hv_pci_write_config_compl() - Invoked when a response packet for a write 987 * config block operation arrives. 988 * @context: Identifies the write config operation 989 * @resp: The response packet itself 990 * @resp_packet_size: Size in bytes of the response packet 991 */ 992 static void hv_pci_write_config_compl(void *context, struct pci_response *resp, 993 int resp_packet_size) 994 { 995 struct hv_pci_compl *comp_pkt = context; 996 997 comp_pkt->completion_status = resp->status; 998 complete(&comp_pkt->host_event); 999 } 1000 1001 /** 1002 * hv_write_config_block() - Sends a write config block request to the 1003 * back-end driver running in the Hyper-V parent partition. 1004 * @pdev: The PCI driver's representation for this device. 1005 * @buf: Buffer from which the config block will be copied. 1006 * @len: Size in bytes of buf. 1007 * @block_id: Identifies the config block which is being written. 1008 * 1009 * Return: 0 on success, -errno on failure 1010 */ 1011 int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len, 1012 unsigned int block_id) 1013 { 1014 struct hv_pcibus_device *hbus = 1015 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1016 sysdata); 1017 struct { 1018 struct pci_packet pkt; 1019 char buf[sizeof(struct pci_write_block)]; 1020 u32 reserved; 1021 } pkt; 1022 struct hv_pci_compl comp_pkt; 1023 struct pci_write_block *write_blk; 1024 u32 pkt_size; 1025 int ret; 1026 1027 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1028 return -EINVAL; 1029 1030 init_completion(&comp_pkt.host_event); 1031 1032 memset(&pkt, 0, sizeof(pkt)); 1033 pkt.pkt.completion_func = hv_pci_write_config_compl; 1034 pkt.pkt.compl_ctxt = &comp_pkt; 1035 write_blk = (struct pci_write_block *)&pkt.pkt.message; 1036 write_blk->message_type.type = PCI_WRITE_BLOCK; 1037 write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1038 write_blk->block_id = block_id; 1039 write_blk->byte_count = len; 1040 memcpy(write_blk->bytes, buf, len); 1041 pkt_size = offsetof(struct pci_write_block, bytes) + len; 1042 /* 1043 * This quirk is required on some hosts shipped around 2018, because 1044 * these hosts don't check the pkt_size correctly (new hosts have been 1045 * fixed since early 2019). The quirk is also safe on very old hosts 1046 * and new hosts, because, on them, what really matters is the length 1047 * specified in write_blk->byte_count. 1048 */ 1049 pkt_size += sizeof(pkt.reserved); 1050 1051 ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, 1052 (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, 1053 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1054 if (ret) 1055 return ret; 1056 1057 ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); 1058 if (ret) 1059 return ret; 1060 1061 if (comp_pkt.completion_status != 0) { 1062 dev_err(&hbus->hdev->device, 1063 "Write Config Block failed: 0x%x\n", 1064 comp_pkt.completion_status); 1065 return -EIO; 1066 } 1067 1068 return 0; 1069 } 1070 1071 /** 1072 * hv_register_block_invalidate() - Invoked when a config block invalidation 1073 * arrives from the back-end driver. 1074 * @pdev: The PCI driver's representation for this device. 1075 * @context: Identifies the device. 1076 * @block_invalidate: Identifies all of the blocks being invalidated. 1077 * 1078 * Return: 0 on success, -errno on failure 1079 */ 1080 int hv_register_block_invalidate(struct pci_dev *pdev, void *context, 1081 void (*block_invalidate)(void *context, 1082 u64 block_mask)) 1083 { 1084 struct hv_pcibus_device *hbus = 1085 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1086 sysdata); 1087 struct hv_pci_dev *hpdev; 1088 1089 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1090 if (!hpdev) 1091 return -ENODEV; 1092 1093 hpdev->block_invalidate = block_invalidate; 1094 hpdev->invalidate_context = context; 1095 1096 put_pcichild(hpdev); 1097 return 0; 1098 1099 } 1100 1101 /* Interrupt management hooks */ 1102 static void hv_int_desc_free(struct hv_pci_dev *hpdev, 1103 struct tran_int_desc *int_desc) 1104 { 1105 struct pci_delete_interrupt *int_pkt; 1106 struct { 1107 struct pci_packet pkt; 1108 u8 buffer[sizeof(struct pci_delete_interrupt)]; 1109 } ctxt; 1110 1111 memset(&ctxt, 0, sizeof(ctxt)); 1112 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 1113 int_pkt->message_type.type = 1114 PCI_DELETE_INTERRUPT_MESSAGE; 1115 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 1116 int_pkt->int_desc = *int_desc; 1117 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 1118 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0); 1119 kfree(int_desc); 1120 } 1121 1122 /** 1123 * hv_msi_free() - Free the MSI. 1124 * @domain: The interrupt domain pointer 1125 * @info: Extra MSI-related context 1126 * @irq: Identifies the IRQ. 1127 * 1128 * The Hyper-V parent partition and hypervisor are tracking the 1129 * messages that are in use, keeping the interrupt redirection 1130 * table up to date. This callback sends a message that frees 1131 * the IRT entry and related tracking nonsense. 1132 */ 1133 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 1134 unsigned int irq) 1135 { 1136 struct hv_pcibus_device *hbus; 1137 struct hv_pci_dev *hpdev; 1138 struct pci_dev *pdev; 1139 struct tran_int_desc *int_desc; 1140 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 1141 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 1142 1143 pdev = msi_desc_to_pci_dev(msi); 1144 hbus = info->data; 1145 int_desc = irq_data_get_irq_chip_data(irq_data); 1146 if (!int_desc) 1147 return; 1148 1149 irq_data->chip_data = NULL; 1150 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1151 if (!hpdev) { 1152 kfree(int_desc); 1153 return; 1154 } 1155 1156 hv_int_desc_free(hpdev, int_desc); 1157 put_pcichild(hpdev); 1158 } 1159 1160 static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest, 1161 bool force) 1162 { 1163 struct irq_data *parent = data->parent_data; 1164 1165 return parent->chip->irq_set_affinity(parent, dest, force); 1166 } 1167 1168 static void hv_irq_mask(struct irq_data *data) 1169 { 1170 pci_msi_mask_irq(data); 1171 } 1172 1173 /** 1174 * hv_irq_unmask() - "Unmask" the IRQ by setting its current 1175 * affinity. 1176 * @data: Describes the IRQ 1177 * 1178 * Build new a destination for the MSI and make a hypercall to 1179 * update the Interrupt Redirection Table. "Device Logical ID" 1180 * is built out of this PCI bus's instance GUID and the function 1181 * number of the device. 1182 */ 1183 static void hv_irq_unmask(struct irq_data *data) 1184 { 1185 struct msi_desc *msi_desc = irq_data_get_msi_desc(data); 1186 struct irq_cfg *cfg = irqd_cfg(data); 1187 struct retarget_msi_interrupt *params; 1188 struct hv_pcibus_device *hbus; 1189 struct cpumask *dest; 1190 cpumask_var_t tmp; 1191 struct pci_bus *pbus; 1192 struct pci_dev *pdev; 1193 unsigned long flags; 1194 u32 var_size = 0; 1195 int cpu, nr_bank; 1196 u64 res; 1197 1198 dest = irq_data_get_effective_affinity_mask(data); 1199 pdev = msi_desc_to_pci_dev(msi_desc); 1200 pbus = pdev->bus; 1201 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1202 1203 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); 1204 1205 params = &hbus->retarget_msi_interrupt_params; 1206 memset(params, 0, sizeof(*params)); 1207 params->partition_id = HV_PARTITION_ID_SELF; 1208 params->int_entry.source = 1; /* MSI(-X) */ 1209 params->int_entry.address = msi_desc->msg.address_lo; 1210 params->int_entry.data = msi_desc->msg.data; 1211 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | 1212 (hbus->hdev->dev_instance.b[4] << 16) | 1213 (hbus->hdev->dev_instance.b[7] << 8) | 1214 (hbus->hdev->dev_instance.b[6] & 0xf8) | 1215 PCI_FUNC(pdev->devfn); 1216 params->int_target.vector = cfg->vector; 1217 1218 /* 1219 * Honoring apic->irq_delivery_mode set to dest_Fixed by 1220 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a 1221 * spurious interrupt storm. Not doing so does not seem to have a 1222 * negative effect (yet?). 1223 */ 1224 1225 if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { 1226 /* 1227 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the 1228 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides 1229 * with >64 VP support. 1230 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED 1231 * is not sufficient for this hypercall. 1232 */ 1233 params->int_target.flags |= 1234 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; 1235 1236 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { 1237 res = 1; 1238 goto exit_unlock; 1239 } 1240 1241 cpumask_and(tmp, dest, cpu_online_mask); 1242 nr_bank = cpumask_to_vpset(¶ms->int_target.vp_set, tmp); 1243 free_cpumask_var(tmp); 1244 1245 if (nr_bank <= 0) { 1246 res = 1; 1247 goto exit_unlock; 1248 } 1249 1250 /* 1251 * var-sized hypercall, var-size starts after vp_mask (thus 1252 * vp_set.format does not count, but vp_set.valid_bank_mask 1253 * does). 1254 */ 1255 var_size = 1 + nr_bank; 1256 } else { 1257 for_each_cpu_and(cpu, dest, cpu_online_mask) { 1258 params->int_target.vp_mask |= 1259 (1ULL << hv_cpu_number_to_vp_number(cpu)); 1260 } 1261 } 1262 1263 res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17), 1264 params, NULL); 1265 1266 exit_unlock: 1267 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags); 1268 1269 if (res) { 1270 dev_err(&hbus->hdev->device, 1271 "%s() failed: %#llx", __func__, res); 1272 return; 1273 } 1274 1275 pci_msi_unmask_irq(data); 1276 } 1277 1278 struct compose_comp_ctxt { 1279 struct hv_pci_compl comp_pkt; 1280 struct tran_int_desc int_desc; 1281 }; 1282 1283 static void hv_pci_compose_compl(void *context, struct pci_response *resp, 1284 int resp_packet_size) 1285 { 1286 struct compose_comp_ctxt *comp_pkt = context; 1287 struct pci_create_int_response *int_resp = 1288 (struct pci_create_int_response *)resp; 1289 1290 comp_pkt->comp_pkt.completion_status = resp->status; 1291 comp_pkt->int_desc = int_resp->int_desc; 1292 complete(&comp_pkt->comp_pkt.host_event); 1293 } 1294 1295 static u32 hv_compose_msi_req_v1( 1296 struct pci_create_interrupt *int_pkt, struct cpumask *affinity, 1297 u32 slot, u8 vector) 1298 { 1299 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; 1300 int_pkt->wslot.slot = slot; 1301 int_pkt->int_desc.vector = vector; 1302 int_pkt->int_desc.vector_count = 1; 1303 int_pkt->int_desc.delivery_mode = dest_Fixed; 1304 1305 /* 1306 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in 1307 * hv_irq_unmask(). 1308 */ 1309 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL; 1310 1311 return sizeof(*int_pkt); 1312 } 1313 1314 static u32 hv_compose_msi_req_v2( 1315 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, 1316 u32 slot, u8 vector) 1317 { 1318 int cpu; 1319 1320 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; 1321 int_pkt->wslot.slot = slot; 1322 int_pkt->int_desc.vector = vector; 1323 int_pkt->int_desc.vector_count = 1; 1324 int_pkt->int_desc.delivery_mode = dest_Fixed; 1325 1326 /* 1327 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten 1328 * by subsequent retarget in hv_irq_unmask(). 1329 */ 1330 cpu = cpumask_first_and(affinity, cpu_online_mask); 1331 int_pkt->int_desc.processor_array[0] = 1332 hv_cpu_number_to_vp_number(cpu); 1333 int_pkt->int_desc.processor_count = 1; 1334 1335 return sizeof(*int_pkt); 1336 } 1337 1338 /** 1339 * hv_compose_msi_msg() - Supplies a valid MSI address/data 1340 * @data: Everything about this MSI 1341 * @msg: Buffer that is filled in by this function 1342 * 1343 * This function unpacks the IRQ looking for target CPU set, IDT 1344 * vector and mode and sends a message to the parent partition 1345 * asking for a mapping for that tuple in this partition. The 1346 * response supplies a data value and address to which that data 1347 * should be written to trigger that interrupt. 1348 */ 1349 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 1350 { 1351 struct irq_cfg *cfg = irqd_cfg(data); 1352 struct hv_pcibus_device *hbus; 1353 struct hv_pci_dev *hpdev; 1354 struct pci_bus *pbus; 1355 struct pci_dev *pdev; 1356 struct cpumask *dest; 1357 unsigned long flags; 1358 struct compose_comp_ctxt comp; 1359 struct tran_int_desc *int_desc; 1360 struct { 1361 struct pci_packet pci_pkt; 1362 union { 1363 struct pci_create_interrupt v1; 1364 struct pci_create_interrupt2 v2; 1365 } int_pkts; 1366 } __packed ctxt; 1367 1368 u32 size; 1369 int ret; 1370 1371 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); 1372 dest = irq_data_get_effective_affinity_mask(data); 1373 pbus = pdev->bus; 1374 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1375 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1376 if (!hpdev) 1377 goto return_null_message; 1378 1379 /* Free any previous message that might have already been composed. */ 1380 if (data->chip_data) { 1381 int_desc = data->chip_data; 1382 data->chip_data = NULL; 1383 hv_int_desc_free(hpdev, int_desc); 1384 } 1385 1386 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); 1387 if (!int_desc) 1388 goto drop_reference; 1389 1390 memset(&ctxt, 0, sizeof(ctxt)); 1391 init_completion(&comp.comp_pkt.host_event); 1392 ctxt.pci_pkt.completion_func = hv_pci_compose_compl; 1393 ctxt.pci_pkt.compl_ctxt = ∁ 1394 1395 switch (hbus->protocol_version) { 1396 case PCI_PROTOCOL_VERSION_1_1: 1397 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, 1398 dest, 1399 hpdev->desc.win_slot.slot, 1400 cfg->vector); 1401 break; 1402 1403 case PCI_PROTOCOL_VERSION_1_2: 1404 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, 1405 dest, 1406 hpdev->desc.win_slot.slot, 1407 cfg->vector); 1408 break; 1409 1410 default: 1411 /* As we only negotiate protocol versions known to this driver, 1412 * this path should never hit. However, this is it not a hot 1413 * path so we print a message to aid future updates. 1414 */ 1415 dev_err(&hbus->hdev->device, 1416 "Unexpected vPCI protocol, update driver."); 1417 goto free_int_desc; 1418 } 1419 1420 ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts, 1421 size, (unsigned long)&ctxt.pci_pkt, 1422 VM_PKT_DATA_INBAND, 1423 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1424 if (ret) { 1425 dev_err(&hbus->hdev->device, 1426 "Sending request for interrupt failed: 0x%x", 1427 comp.comp_pkt.completion_status); 1428 goto free_int_desc; 1429 } 1430 1431 /* 1432 * Since this function is called with IRQ locks held, can't 1433 * do normal wait for completion; instead poll. 1434 */ 1435 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { 1436 /* 0xFFFF means an invalid PCI VENDOR ID. */ 1437 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { 1438 dev_err_once(&hbus->hdev->device, 1439 "the device has gone\n"); 1440 goto free_int_desc; 1441 } 1442 1443 /* 1444 * When the higher level interrupt code calls us with 1445 * interrupt disabled, we must poll the channel by calling 1446 * the channel callback directly when channel->target_cpu is 1447 * the current CPU. When the higher level interrupt code 1448 * calls us with interrupt enabled, let's add the 1449 * local_irq_save()/restore() to avoid race: 1450 * hv_pci_onchannelcallback() can also run in tasklet. 1451 */ 1452 local_irq_save(flags); 1453 1454 if (hbus->hdev->channel->target_cpu == smp_processor_id()) 1455 hv_pci_onchannelcallback(hbus); 1456 1457 local_irq_restore(flags); 1458 1459 if (hpdev->state == hv_pcichild_ejecting) { 1460 dev_err_once(&hbus->hdev->device, 1461 "the device is being ejected\n"); 1462 goto free_int_desc; 1463 } 1464 1465 udelay(100); 1466 } 1467 1468 if (comp.comp_pkt.completion_status < 0) { 1469 dev_err(&hbus->hdev->device, 1470 "Request for interrupt failed: 0x%x", 1471 comp.comp_pkt.completion_status); 1472 goto free_int_desc; 1473 } 1474 1475 /* 1476 * Record the assignment so that this can be unwound later. Using 1477 * irq_set_chip_data() here would be appropriate, but the lock it takes 1478 * is already held. 1479 */ 1480 *int_desc = comp.int_desc; 1481 data->chip_data = int_desc; 1482 1483 /* Pass up the result. */ 1484 msg->address_hi = comp.int_desc.address >> 32; 1485 msg->address_lo = comp.int_desc.address & 0xffffffff; 1486 msg->data = comp.int_desc.data; 1487 1488 put_pcichild(hpdev); 1489 return; 1490 1491 free_int_desc: 1492 kfree(int_desc); 1493 drop_reference: 1494 put_pcichild(hpdev); 1495 return_null_message: 1496 msg->address_hi = 0; 1497 msg->address_lo = 0; 1498 msg->data = 0; 1499 } 1500 1501 /* HW Interrupt Chip Descriptor */ 1502 static struct irq_chip hv_msi_irq_chip = { 1503 .name = "Hyper-V PCIe MSI", 1504 .irq_compose_msi_msg = hv_compose_msi_msg, 1505 .irq_set_affinity = hv_set_affinity, 1506 .irq_ack = irq_chip_ack_parent, 1507 .irq_mask = hv_irq_mask, 1508 .irq_unmask = hv_irq_unmask, 1509 }; 1510 1511 static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, 1512 msi_alloc_info_t *arg) 1513 { 1514 return arg->msi_hwirq; 1515 } 1516 1517 static struct msi_domain_ops hv_msi_ops = { 1518 .get_hwirq = hv_msi_domain_ops_get_hwirq, 1519 .msi_prepare = pci_msi_prepare, 1520 .set_desc = pci_msi_set_desc, 1521 .msi_free = hv_msi_free, 1522 }; 1523 1524 /** 1525 * hv_pcie_init_irq_domain() - Initialize IRQ domain 1526 * @hbus: The root PCI bus 1527 * 1528 * This function creates an IRQ domain which will be used for 1529 * interrupts from devices that have been passed through. These 1530 * devices only support MSI and MSI-X, not line-based interrupts 1531 * or simulations of line-based interrupts through PCIe's 1532 * fabric-layer messages. Because interrupts are remapped, we 1533 * can support multi-message MSI here. 1534 * 1535 * Return: '0' on success and error value on failure 1536 */ 1537 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) 1538 { 1539 hbus->msi_info.chip = &hv_msi_irq_chip; 1540 hbus->msi_info.ops = &hv_msi_ops; 1541 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | 1542 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | 1543 MSI_FLAG_PCI_MSIX); 1544 hbus->msi_info.handler = handle_edge_irq; 1545 hbus->msi_info.handler_name = "edge"; 1546 hbus->msi_info.data = hbus; 1547 hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode, 1548 &hbus->msi_info, 1549 x86_vector_domain); 1550 if (!hbus->irq_domain) { 1551 dev_err(&hbus->hdev->device, 1552 "Failed to build an MSI IRQ domain\n"); 1553 return -ENODEV; 1554 } 1555 1556 return 0; 1557 } 1558 1559 /** 1560 * get_bar_size() - Get the address space consumed by a BAR 1561 * @bar_val: Value that a BAR returned after -1 was written 1562 * to it. 1563 * 1564 * This function returns the size of the BAR, rounded up to 1 1565 * page. It has to be rounded up because the hypervisor's page 1566 * table entry that maps the BAR into the VM can't specify an 1567 * offset within a page. The invariant is that the hypervisor 1568 * must place any BARs of smaller than page length at the 1569 * beginning of a page. 1570 * 1571 * Return: Size in bytes of the consumed MMIO space. 1572 */ 1573 static u64 get_bar_size(u64 bar_val) 1574 { 1575 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)), 1576 PAGE_SIZE); 1577 } 1578 1579 /** 1580 * survey_child_resources() - Total all MMIO requirements 1581 * @hbus: Root PCI bus, as understood by this driver 1582 */ 1583 static void survey_child_resources(struct hv_pcibus_device *hbus) 1584 { 1585 struct hv_pci_dev *hpdev; 1586 resource_size_t bar_size = 0; 1587 unsigned long flags; 1588 struct completion *event; 1589 u64 bar_val; 1590 int i; 1591 1592 /* If nobody is waiting on the answer, don't compute it. */ 1593 event = xchg(&hbus->survey_event, NULL); 1594 if (!event) 1595 return; 1596 1597 /* If the answer has already been computed, go with it. */ 1598 if (hbus->low_mmio_space || hbus->high_mmio_space) { 1599 complete(event); 1600 return; 1601 } 1602 1603 spin_lock_irqsave(&hbus->device_list_lock, flags); 1604 1605 /* 1606 * Due to an interesting quirk of the PCI spec, all memory regions 1607 * for a child device are a power of 2 in size and aligned in memory, 1608 * so it's sufficient to just add them up without tracking alignment. 1609 */ 1610 list_for_each_entry(hpdev, &hbus->children, list_entry) { 1611 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 1612 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) 1613 dev_err(&hbus->hdev->device, 1614 "There's an I/O BAR in this list!\n"); 1615 1616 if (hpdev->probed_bar[i] != 0) { 1617 /* 1618 * A probed BAR has all the upper bits set that 1619 * can be changed. 1620 */ 1621 1622 bar_val = hpdev->probed_bar[i]; 1623 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1624 bar_val |= 1625 ((u64)hpdev->probed_bar[++i] << 32); 1626 else 1627 bar_val |= 0xffffffff00000000ULL; 1628 1629 bar_size = get_bar_size(bar_val); 1630 1631 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1632 hbus->high_mmio_space += bar_size; 1633 else 1634 hbus->low_mmio_space += bar_size; 1635 } 1636 } 1637 } 1638 1639 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1640 complete(event); 1641 } 1642 1643 /** 1644 * prepopulate_bars() - Fill in BARs with defaults 1645 * @hbus: Root PCI bus, as understood by this driver 1646 * 1647 * The core PCI driver code seems much, much happier if the BARs 1648 * for a device have values upon first scan. So fill them in. 1649 * The algorithm below works down from large sizes to small, 1650 * attempting to pack the assignments optimally. The assumption, 1651 * enforced in other parts of the code, is that the beginning of 1652 * the memory-mapped I/O space will be aligned on the largest 1653 * BAR size. 1654 */ 1655 static void prepopulate_bars(struct hv_pcibus_device *hbus) 1656 { 1657 resource_size_t high_size = 0; 1658 resource_size_t low_size = 0; 1659 resource_size_t high_base = 0; 1660 resource_size_t low_base = 0; 1661 resource_size_t bar_size; 1662 struct hv_pci_dev *hpdev; 1663 unsigned long flags; 1664 u64 bar_val; 1665 u32 command; 1666 bool high; 1667 int i; 1668 1669 if (hbus->low_mmio_space) { 1670 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 1671 low_base = hbus->low_mmio_res->start; 1672 } 1673 1674 if (hbus->high_mmio_space) { 1675 high_size = 1ULL << 1676 (63 - __builtin_clzll(hbus->high_mmio_space)); 1677 high_base = hbus->high_mmio_res->start; 1678 } 1679 1680 spin_lock_irqsave(&hbus->device_list_lock, flags); 1681 1682 /* 1683 * Clear the memory enable bit, in case it's already set. This occurs 1684 * in the suspend path of hibernation, where the device is suspended, 1685 * resumed and suspended again: see hibernation_snapshot() and 1686 * hibernation_platform_enter(). 1687 * 1688 * If the memory enable bit is already set, Hyper-V sliently ignores 1689 * the below BAR updates, and the related PCI device driver can not 1690 * work, because reading from the device register(s) always returns 1691 * 0xFFFFFFFF. 1692 */ 1693 list_for_each_entry(hpdev, &hbus->children, list_entry) { 1694 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); 1695 command &= ~PCI_COMMAND_MEMORY; 1696 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); 1697 } 1698 1699 /* Pick addresses for the BARs. */ 1700 do { 1701 list_for_each_entry(hpdev, &hbus->children, list_entry) { 1702 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 1703 bar_val = hpdev->probed_bar[i]; 1704 if (bar_val == 0) 1705 continue; 1706 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64; 1707 if (high) { 1708 bar_val |= 1709 ((u64)hpdev->probed_bar[i + 1] 1710 << 32); 1711 } else { 1712 bar_val |= 0xffffffffULL << 32; 1713 } 1714 bar_size = get_bar_size(bar_val); 1715 if (high) { 1716 if (high_size != bar_size) { 1717 i++; 1718 continue; 1719 } 1720 _hv_pcifront_write_config(hpdev, 1721 PCI_BASE_ADDRESS_0 + (4 * i), 1722 4, 1723 (u32)(high_base & 0xffffff00)); 1724 i++; 1725 _hv_pcifront_write_config(hpdev, 1726 PCI_BASE_ADDRESS_0 + (4 * i), 1727 4, (u32)(high_base >> 32)); 1728 high_base += bar_size; 1729 } else { 1730 if (low_size != bar_size) 1731 continue; 1732 _hv_pcifront_write_config(hpdev, 1733 PCI_BASE_ADDRESS_0 + (4 * i), 1734 4, 1735 (u32)(low_base & 0xffffff00)); 1736 low_base += bar_size; 1737 } 1738 } 1739 if (high_size <= 1 && low_size <= 1) { 1740 /* Set the memory enable bit. */ 1741 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, 1742 &command); 1743 command |= PCI_COMMAND_MEMORY; 1744 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, 1745 command); 1746 break; 1747 } 1748 } 1749 1750 high_size >>= 1; 1751 low_size >>= 1; 1752 } while (high_size || low_size); 1753 1754 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1755 } 1756 1757 /* 1758 * Assign entries in sysfs pci slot directory. 1759 * 1760 * Note that this function does not need to lock the children list 1761 * because it is called from pci_devices_present_work which 1762 * is serialized with hv_eject_device_work because they are on the 1763 * same ordered workqueue. Therefore hbus->children list will not change 1764 * even when pci_create_slot sleeps. 1765 */ 1766 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) 1767 { 1768 struct hv_pci_dev *hpdev; 1769 char name[SLOT_NAME_SIZE]; 1770 int slot_nr; 1771 1772 list_for_each_entry(hpdev, &hbus->children, list_entry) { 1773 if (hpdev->pci_slot) 1774 continue; 1775 1776 slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot)); 1777 snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); 1778 hpdev->pci_slot = pci_create_slot(hbus->pci_bus, slot_nr, 1779 name, NULL); 1780 if (IS_ERR(hpdev->pci_slot)) { 1781 pr_warn("pci_create slot %s failed\n", name); 1782 hpdev->pci_slot = NULL; 1783 } 1784 } 1785 } 1786 1787 /* 1788 * Remove entries in sysfs pci slot directory. 1789 */ 1790 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) 1791 { 1792 struct hv_pci_dev *hpdev; 1793 1794 list_for_each_entry(hpdev, &hbus->children, list_entry) { 1795 if (!hpdev->pci_slot) 1796 continue; 1797 pci_destroy_slot(hpdev->pci_slot); 1798 hpdev->pci_slot = NULL; 1799 } 1800 } 1801 1802 /** 1803 * create_root_hv_pci_bus() - Expose a new root PCI bus 1804 * @hbus: Root PCI bus, as understood by this driver 1805 * 1806 * Return: 0 on success, -errno on failure 1807 */ 1808 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) 1809 { 1810 /* Register the device */ 1811 hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device, 1812 0, /* bus number is always zero */ 1813 &hv_pcifront_ops, 1814 &hbus->sysdata, 1815 &hbus->resources_for_children); 1816 if (!hbus->pci_bus) 1817 return -ENODEV; 1818 1819 hbus->pci_bus->msi = &hbus->msi_chip; 1820 hbus->pci_bus->msi->dev = &hbus->hdev->device; 1821 1822 pci_lock_rescan_remove(); 1823 pci_scan_child_bus(hbus->pci_bus); 1824 pci_bus_assign_resources(hbus->pci_bus); 1825 hv_pci_assign_slots(hbus); 1826 pci_bus_add_devices(hbus->pci_bus); 1827 pci_unlock_rescan_remove(); 1828 hbus->state = hv_pcibus_installed; 1829 return 0; 1830 } 1831 1832 struct q_res_req_compl { 1833 struct completion host_event; 1834 struct hv_pci_dev *hpdev; 1835 }; 1836 1837 /** 1838 * q_resource_requirements() - Query Resource Requirements 1839 * @context: The completion context. 1840 * @resp: The response that came from the host. 1841 * @resp_packet_size: The size in bytes of resp. 1842 * 1843 * This function is invoked on completion of a Query Resource 1844 * Requirements packet. 1845 */ 1846 static void q_resource_requirements(void *context, struct pci_response *resp, 1847 int resp_packet_size) 1848 { 1849 struct q_res_req_compl *completion = context; 1850 struct pci_q_res_req_response *q_res_req = 1851 (struct pci_q_res_req_response *)resp; 1852 int i; 1853 1854 if (resp->status < 0) { 1855 dev_err(&completion->hpdev->hbus->hdev->device, 1856 "query resource requirements failed: %x\n", 1857 resp->status); 1858 } else { 1859 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 1860 completion->hpdev->probed_bar[i] = 1861 q_res_req->probed_bar[i]; 1862 } 1863 } 1864 1865 complete(&completion->host_event); 1866 } 1867 1868 /** 1869 * new_pcichild_device() - Create a new child device 1870 * @hbus: The internal struct tracking this root PCI bus. 1871 * @desc: The information supplied so far from the host 1872 * about the device. 1873 * 1874 * This function creates the tracking structure for a new child 1875 * device and kicks off the process of figuring out what it is. 1876 * 1877 * Return: Pointer to the new tracking struct 1878 */ 1879 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, 1880 struct pci_function_description *desc) 1881 { 1882 struct hv_pci_dev *hpdev; 1883 struct pci_child_message *res_req; 1884 struct q_res_req_compl comp_pkt; 1885 struct { 1886 struct pci_packet init_packet; 1887 u8 buffer[sizeof(struct pci_child_message)]; 1888 } pkt; 1889 unsigned long flags; 1890 int ret; 1891 1892 hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL); 1893 if (!hpdev) 1894 return NULL; 1895 1896 hpdev->hbus = hbus; 1897 1898 memset(&pkt, 0, sizeof(pkt)); 1899 init_completion(&comp_pkt.host_event); 1900 comp_pkt.hpdev = hpdev; 1901 pkt.init_packet.compl_ctxt = &comp_pkt; 1902 pkt.init_packet.completion_func = q_resource_requirements; 1903 res_req = (struct pci_child_message *)&pkt.init_packet.message; 1904 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; 1905 res_req->wslot.slot = desc->win_slot.slot; 1906 1907 ret = vmbus_sendpacket(hbus->hdev->channel, res_req, 1908 sizeof(struct pci_child_message), 1909 (unsigned long)&pkt.init_packet, 1910 VM_PKT_DATA_INBAND, 1911 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1912 if (ret) 1913 goto error; 1914 1915 if (wait_for_response(hbus->hdev, &comp_pkt.host_event)) 1916 goto error; 1917 1918 hpdev->desc = *desc; 1919 refcount_set(&hpdev->refs, 1); 1920 get_pcichild(hpdev); 1921 spin_lock_irqsave(&hbus->device_list_lock, flags); 1922 1923 list_add_tail(&hpdev->list_entry, &hbus->children); 1924 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1925 return hpdev; 1926 1927 error: 1928 kfree(hpdev); 1929 return NULL; 1930 } 1931 1932 /** 1933 * get_pcichild_wslot() - Find device from slot 1934 * @hbus: Root PCI bus, as understood by this driver 1935 * @wslot: Location on the bus 1936 * 1937 * This function looks up a PCI device and returns the internal 1938 * representation of it. It acquires a reference on it, so that 1939 * the device won't be deleted while somebody is using it. The 1940 * caller is responsible for calling put_pcichild() to release 1941 * this reference. 1942 * 1943 * Return: Internal representation of a PCI device 1944 */ 1945 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 1946 u32 wslot) 1947 { 1948 unsigned long flags; 1949 struct hv_pci_dev *iter, *hpdev = NULL; 1950 1951 spin_lock_irqsave(&hbus->device_list_lock, flags); 1952 list_for_each_entry(iter, &hbus->children, list_entry) { 1953 if (iter->desc.win_slot.slot == wslot) { 1954 hpdev = iter; 1955 get_pcichild(hpdev); 1956 break; 1957 } 1958 } 1959 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1960 1961 return hpdev; 1962 } 1963 1964 /** 1965 * pci_devices_present_work() - Handle new list of child devices 1966 * @work: Work struct embedded in struct hv_dr_work 1967 * 1968 * "Bus Relations" is the Windows term for "children of this 1969 * bus." The terminology is preserved here for people trying to 1970 * debug the interaction between Hyper-V and Linux. This 1971 * function is called when the parent partition reports a list 1972 * of functions that should be observed under this PCI Express 1973 * port (bus). 1974 * 1975 * This function updates the list, and must tolerate being 1976 * called multiple times with the same information. The typical 1977 * number of child devices is one, with very atypical cases 1978 * involving three or four, so the algorithms used here can be 1979 * simple and inefficient. 1980 * 1981 * It must also treat the omission of a previously observed device as 1982 * notification that the device no longer exists. 1983 * 1984 * Note that this function is serialized with hv_eject_device_work(), 1985 * because both are pushed to the ordered workqueue hbus->wq. 1986 */ 1987 static void pci_devices_present_work(struct work_struct *work) 1988 { 1989 u32 child_no; 1990 bool found; 1991 struct pci_function_description *new_desc; 1992 struct hv_pci_dev *hpdev; 1993 struct hv_pcibus_device *hbus; 1994 struct list_head removed; 1995 struct hv_dr_work *dr_wrk; 1996 struct hv_dr_state *dr = NULL; 1997 unsigned long flags; 1998 1999 dr_wrk = container_of(work, struct hv_dr_work, wrk); 2000 hbus = dr_wrk->bus; 2001 kfree(dr_wrk); 2002 2003 INIT_LIST_HEAD(&removed); 2004 2005 /* Pull this off the queue and process it if it was the last one. */ 2006 spin_lock_irqsave(&hbus->device_list_lock, flags); 2007 while (!list_empty(&hbus->dr_list)) { 2008 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state, 2009 list_entry); 2010 list_del(&dr->list_entry); 2011 2012 /* Throw this away if the list still has stuff in it. */ 2013 if (!list_empty(&hbus->dr_list)) { 2014 kfree(dr); 2015 continue; 2016 } 2017 } 2018 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2019 2020 if (!dr) { 2021 put_hvpcibus(hbus); 2022 return; 2023 } 2024 2025 /* First, mark all existing children as reported missing. */ 2026 spin_lock_irqsave(&hbus->device_list_lock, flags); 2027 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2028 hpdev->reported_missing = true; 2029 } 2030 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2031 2032 /* Next, add back any reported devices. */ 2033 for (child_no = 0; child_no < dr->device_count; child_no++) { 2034 found = false; 2035 new_desc = &dr->func[child_no]; 2036 2037 spin_lock_irqsave(&hbus->device_list_lock, flags); 2038 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2039 if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) && 2040 (hpdev->desc.v_id == new_desc->v_id) && 2041 (hpdev->desc.d_id == new_desc->d_id) && 2042 (hpdev->desc.ser == new_desc->ser)) { 2043 hpdev->reported_missing = false; 2044 found = true; 2045 } 2046 } 2047 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2048 2049 if (!found) { 2050 hpdev = new_pcichild_device(hbus, new_desc); 2051 if (!hpdev) 2052 dev_err(&hbus->hdev->device, 2053 "couldn't record a child device.\n"); 2054 } 2055 } 2056 2057 /* Move missing children to a list on the stack. */ 2058 spin_lock_irqsave(&hbus->device_list_lock, flags); 2059 do { 2060 found = false; 2061 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2062 if (hpdev->reported_missing) { 2063 found = true; 2064 put_pcichild(hpdev); 2065 list_move_tail(&hpdev->list_entry, &removed); 2066 break; 2067 } 2068 } 2069 } while (found); 2070 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2071 2072 /* Delete everything that should no longer exist. */ 2073 while (!list_empty(&removed)) { 2074 hpdev = list_first_entry(&removed, struct hv_pci_dev, 2075 list_entry); 2076 list_del(&hpdev->list_entry); 2077 2078 if (hpdev->pci_slot) 2079 pci_destroy_slot(hpdev->pci_slot); 2080 2081 put_pcichild(hpdev); 2082 } 2083 2084 switch (hbus->state) { 2085 case hv_pcibus_installed: 2086 /* 2087 * Tell the core to rescan bus 2088 * because there may have been changes. 2089 */ 2090 pci_lock_rescan_remove(); 2091 pci_scan_child_bus(hbus->pci_bus); 2092 hv_pci_assign_slots(hbus); 2093 pci_unlock_rescan_remove(); 2094 break; 2095 2096 case hv_pcibus_init: 2097 case hv_pcibus_probed: 2098 survey_child_resources(hbus); 2099 break; 2100 2101 default: 2102 break; 2103 } 2104 2105 put_hvpcibus(hbus); 2106 kfree(dr); 2107 } 2108 2109 /** 2110 * hv_pci_devices_present() - Handles list of new children 2111 * @hbus: Root PCI bus, as understood by this driver 2112 * @relations: Packet from host listing children 2113 * 2114 * This function is invoked whenever a new list of devices for 2115 * this bus appears. 2116 */ 2117 static void hv_pci_devices_present(struct hv_pcibus_device *hbus, 2118 struct pci_bus_relations *relations) 2119 { 2120 struct hv_dr_state *dr; 2121 struct hv_dr_work *dr_wrk; 2122 unsigned long flags; 2123 bool pending_dr; 2124 2125 if (hbus->state == hv_pcibus_removing) { 2126 dev_info(&hbus->hdev->device, 2127 "PCI VMBus BUS_RELATIONS: ignored\n"); 2128 return; 2129 } 2130 2131 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); 2132 if (!dr_wrk) 2133 return; 2134 2135 dr = kzalloc(offsetof(struct hv_dr_state, func) + 2136 (sizeof(struct pci_function_description) * 2137 (relations->device_count)), GFP_NOWAIT); 2138 if (!dr) { 2139 kfree(dr_wrk); 2140 return; 2141 } 2142 2143 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); 2144 dr_wrk->bus = hbus; 2145 dr->device_count = relations->device_count; 2146 if (dr->device_count != 0) { 2147 memcpy(dr->func, relations->func, 2148 sizeof(struct pci_function_description) * 2149 dr->device_count); 2150 } 2151 2152 spin_lock_irqsave(&hbus->device_list_lock, flags); 2153 /* 2154 * If pending_dr is true, we have already queued a work, 2155 * which will see the new dr. Otherwise, we need to 2156 * queue a new work. 2157 */ 2158 pending_dr = !list_empty(&hbus->dr_list); 2159 list_add_tail(&dr->list_entry, &hbus->dr_list); 2160 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2161 2162 if (pending_dr) { 2163 kfree(dr_wrk); 2164 } else { 2165 get_hvpcibus(hbus); 2166 queue_work(hbus->wq, &dr_wrk->wrk); 2167 } 2168 } 2169 2170 /** 2171 * hv_eject_device_work() - Asynchronously handles ejection 2172 * @work: Work struct embedded in internal device struct 2173 * 2174 * This function handles ejecting a device. Windows will 2175 * attempt to gracefully eject a device, waiting 60 seconds to 2176 * hear back from the guest OS that this completed successfully. 2177 * If this timer expires, the device will be forcibly removed. 2178 */ 2179 static void hv_eject_device_work(struct work_struct *work) 2180 { 2181 struct pci_eject_response *ejct_pkt; 2182 struct hv_pcibus_device *hbus; 2183 struct hv_pci_dev *hpdev; 2184 struct pci_dev *pdev; 2185 unsigned long flags; 2186 int wslot; 2187 struct { 2188 struct pci_packet pkt; 2189 u8 buffer[sizeof(struct pci_eject_response)]; 2190 } ctxt; 2191 2192 hpdev = container_of(work, struct hv_pci_dev, wrk); 2193 hbus = hpdev->hbus; 2194 2195 WARN_ON(hpdev->state != hv_pcichild_ejecting); 2196 2197 /* 2198 * Ejection can come before or after the PCI bus has been set up, so 2199 * attempt to find it and tear down the bus state, if it exists. This 2200 * must be done without constructs like pci_domain_nr(hbus->pci_bus) 2201 * because hbus->pci_bus may not exist yet. 2202 */ 2203 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot); 2204 pdev = pci_get_domain_bus_and_slot(hbus->sysdata.domain, 0, wslot); 2205 if (pdev) { 2206 pci_lock_rescan_remove(); 2207 pci_stop_and_remove_bus_device(pdev); 2208 pci_dev_put(pdev); 2209 pci_unlock_rescan_remove(); 2210 } 2211 2212 spin_lock_irqsave(&hbus->device_list_lock, flags); 2213 list_del(&hpdev->list_entry); 2214 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2215 2216 if (hpdev->pci_slot) 2217 pci_destroy_slot(hpdev->pci_slot); 2218 2219 memset(&ctxt, 0, sizeof(ctxt)); 2220 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; 2221 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; 2222 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; 2223 vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, 2224 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt, 2225 VM_PKT_DATA_INBAND, 0); 2226 2227 /* For the get_pcichild() in hv_pci_eject_device() */ 2228 put_pcichild(hpdev); 2229 /* For the two refs got in new_pcichild_device() */ 2230 put_pcichild(hpdev); 2231 put_pcichild(hpdev); 2232 /* hpdev has been freed. Do not use it any more. */ 2233 2234 put_hvpcibus(hbus); 2235 } 2236 2237 /** 2238 * hv_pci_eject_device() - Handles device ejection 2239 * @hpdev: Internal device tracking struct 2240 * 2241 * This function is invoked when an ejection packet arrives. It 2242 * just schedules work so that we don't re-enter the packet 2243 * delivery code handling the ejection. 2244 */ 2245 static void hv_pci_eject_device(struct hv_pci_dev *hpdev) 2246 { 2247 struct hv_pcibus_device *hbus = hpdev->hbus; 2248 struct hv_device *hdev = hbus->hdev; 2249 2250 if (hbus->state == hv_pcibus_removing) { 2251 dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); 2252 return; 2253 } 2254 2255 hpdev->state = hv_pcichild_ejecting; 2256 get_pcichild(hpdev); 2257 INIT_WORK(&hpdev->wrk, hv_eject_device_work); 2258 get_hvpcibus(hbus); 2259 queue_work(hbus->wq, &hpdev->wrk); 2260 } 2261 2262 /** 2263 * hv_pci_onchannelcallback() - Handles incoming packets 2264 * @context: Internal bus tracking struct 2265 * 2266 * This function is invoked whenever the host sends a packet to 2267 * this channel (which is private to this root PCI bus). 2268 */ 2269 static void hv_pci_onchannelcallback(void *context) 2270 { 2271 const int packet_size = 0x100; 2272 int ret; 2273 struct hv_pcibus_device *hbus = context; 2274 u32 bytes_recvd; 2275 u64 req_id; 2276 struct vmpacket_descriptor *desc; 2277 unsigned char *buffer; 2278 int bufferlen = packet_size; 2279 struct pci_packet *comp_packet; 2280 struct pci_response *response; 2281 struct pci_incoming_message *new_message; 2282 struct pci_bus_relations *bus_rel; 2283 struct pci_dev_inval_block *inval; 2284 struct pci_dev_incoming *dev_message; 2285 struct hv_pci_dev *hpdev; 2286 2287 buffer = kmalloc(bufferlen, GFP_ATOMIC); 2288 if (!buffer) 2289 return; 2290 2291 while (1) { 2292 ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer, 2293 bufferlen, &bytes_recvd, &req_id); 2294 2295 if (ret == -ENOBUFS) { 2296 kfree(buffer); 2297 /* Handle large packet */ 2298 bufferlen = bytes_recvd; 2299 buffer = kmalloc(bytes_recvd, GFP_ATOMIC); 2300 if (!buffer) 2301 return; 2302 continue; 2303 } 2304 2305 /* Zero length indicates there are no more packets. */ 2306 if (ret || !bytes_recvd) 2307 break; 2308 2309 /* 2310 * All incoming packets must be at least as large as a 2311 * response. 2312 */ 2313 if (bytes_recvd <= sizeof(struct pci_response)) 2314 continue; 2315 desc = (struct vmpacket_descriptor *)buffer; 2316 2317 switch (desc->type) { 2318 case VM_PKT_COMP: 2319 2320 /* 2321 * The host is trusted, and thus it's safe to interpret 2322 * this transaction ID as a pointer. 2323 */ 2324 comp_packet = (struct pci_packet *)req_id; 2325 response = (struct pci_response *)buffer; 2326 comp_packet->completion_func(comp_packet->compl_ctxt, 2327 response, 2328 bytes_recvd); 2329 break; 2330 2331 case VM_PKT_DATA_INBAND: 2332 2333 new_message = (struct pci_incoming_message *)buffer; 2334 switch (new_message->message_type.type) { 2335 case PCI_BUS_RELATIONS: 2336 2337 bus_rel = (struct pci_bus_relations *)buffer; 2338 if (bytes_recvd < 2339 offsetof(struct pci_bus_relations, func) + 2340 (sizeof(struct pci_function_description) * 2341 (bus_rel->device_count))) { 2342 dev_err(&hbus->hdev->device, 2343 "bus relations too small\n"); 2344 break; 2345 } 2346 2347 hv_pci_devices_present(hbus, bus_rel); 2348 break; 2349 2350 case PCI_EJECT: 2351 2352 dev_message = (struct pci_dev_incoming *)buffer; 2353 hpdev = get_pcichild_wslot(hbus, 2354 dev_message->wslot.slot); 2355 if (hpdev) { 2356 hv_pci_eject_device(hpdev); 2357 put_pcichild(hpdev); 2358 } 2359 break; 2360 2361 case PCI_INVALIDATE_BLOCK: 2362 2363 inval = (struct pci_dev_inval_block *)buffer; 2364 hpdev = get_pcichild_wslot(hbus, 2365 inval->wslot.slot); 2366 if (hpdev) { 2367 if (hpdev->block_invalidate) { 2368 hpdev->block_invalidate( 2369 hpdev->invalidate_context, 2370 inval->block_mask); 2371 } 2372 put_pcichild(hpdev); 2373 } 2374 break; 2375 2376 default: 2377 dev_warn(&hbus->hdev->device, 2378 "Unimplemented protocol message %x\n", 2379 new_message->message_type.type); 2380 break; 2381 } 2382 break; 2383 2384 default: 2385 dev_err(&hbus->hdev->device, 2386 "unhandled packet type %d, tid %llx len %d\n", 2387 desc->type, req_id, bytes_recvd); 2388 break; 2389 } 2390 } 2391 2392 kfree(buffer); 2393 } 2394 2395 /** 2396 * hv_pci_protocol_negotiation() - Set up protocol 2397 * @hdev: VMBus's tracking struct for this root PCI bus 2398 * 2399 * This driver is intended to support running on Windows 10 2400 * (server) and later versions. It will not run on earlier 2401 * versions, as they assume that many of the operations which 2402 * Linux needs accomplished with a spinlock held were done via 2403 * asynchronous messaging via VMBus. Windows 10 increases the 2404 * surface area of PCI emulation so that these actions can take 2405 * place by suspending a virtual processor for their duration. 2406 * 2407 * This function negotiates the channel protocol version, 2408 * failing if the host doesn't support the necessary protocol 2409 * level. 2410 */ 2411 static int hv_pci_protocol_negotiation(struct hv_device *hdev, 2412 enum pci_protocol_version_t version[], 2413 int num_version) 2414 { 2415 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2416 struct pci_version_request *version_req; 2417 struct hv_pci_compl comp_pkt; 2418 struct pci_packet *pkt; 2419 int ret; 2420 int i; 2421 2422 /* 2423 * Initiate the handshake with the host and negotiate 2424 * a version that the host can support. We start with the 2425 * highest version number and go down if the host cannot 2426 * support it. 2427 */ 2428 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL); 2429 if (!pkt) 2430 return -ENOMEM; 2431 2432 init_completion(&comp_pkt.host_event); 2433 pkt->completion_func = hv_pci_generic_compl; 2434 pkt->compl_ctxt = &comp_pkt; 2435 version_req = (struct pci_version_request *)&pkt->message; 2436 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; 2437 2438 for (i = 0; i < num_version; i++) { 2439 version_req->protocol_version = version[i]; 2440 ret = vmbus_sendpacket(hdev->channel, version_req, 2441 sizeof(struct pci_version_request), 2442 (unsigned long)pkt, VM_PKT_DATA_INBAND, 2443 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2444 if (!ret) 2445 ret = wait_for_response(hdev, &comp_pkt.host_event); 2446 2447 if (ret) { 2448 dev_err(&hdev->device, 2449 "PCI Pass-through VSP failed to request version: %d", 2450 ret); 2451 goto exit; 2452 } 2453 2454 if (comp_pkt.completion_status >= 0) { 2455 hbus->protocol_version = version[i]; 2456 dev_info(&hdev->device, 2457 "PCI VMBus probing: Using version %#x\n", 2458 hbus->protocol_version); 2459 goto exit; 2460 } 2461 2462 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) { 2463 dev_err(&hdev->device, 2464 "PCI Pass-through VSP failed version request: %#x", 2465 comp_pkt.completion_status); 2466 ret = -EPROTO; 2467 goto exit; 2468 } 2469 2470 reinit_completion(&comp_pkt.host_event); 2471 } 2472 2473 dev_err(&hdev->device, 2474 "PCI pass-through VSP failed to find supported version"); 2475 ret = -EPROTO; 2476 2477 exit: 2478 kfree(pkt); 2479 return ret; 2480 } 2481 2482 /** 2483 * hv_pci_free_bridge_windows() - Release memory regions for the 2484 * bus 2485 * @hbus: Root PCI bus, as understood by this driver 2486 */ 2487 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) 2488 { 2489 /* 2490 * Set the resources back to the way they looked when they 2491 * were allocated by setting IORESOURCE_BUSY again. 2492 */ 2493 2494 if (hbus->low_mmio_space && hbus->low_mmio_res) { 2495 hbus->low_mmio_res->flags |= IORESOURCE_BUSY; 2496 vmbus_free_mmio(hbus->low_mmio_res->start, 2497 resource_size(hbus->low_mmio_res)); 2498 } 2499 2500 if (hbus->high_mmio_space && hbus->high_mmio_res) { 2501 hbus->high_mmio_res->flags |= IORESOURCE_BUSY; 2502 vmbus_free_mmio(hbus->high_mmio_res->start, 2503 resource_size(hbus->high_mmio_res)); 2504 } 2505 } 2506 2507 /** 2508 * hv_pci_allocate_bridge_windows() - Allocate memory regions 2509 * for the bus 2510 * @hbus: Root PCI bus, as understood by this driver 2511 * 2512 * This function calls vmbus_allocate_mmio(), which is itself a 2513 * bit of a compromise. Ideally, we might change the pnp layer 2514 * in the kernel such that it comprehends either PCI devices 2515 * which are "grandchildren of ACPI," with some intermediate bus 2516 * node (in this case, VMBus) or change it such that it 2517 * understands VMBus. The pnp layer, however, has been declared 2518 * deprecated, and not subject to change. 2519 * 2520 * The workaround, implemented here, is to ask VMBus to allocate 2521 * MMIO space for this bus. VMBus itself knows which ranges are 2522 * appropriate by looking at its own ACPI objects. Then, after 2523 * these ranges are claimed, they're modified to look like they 2524 * would have looked if the ACPI and pnp code had allocated 2525 * bridge windows. These descriptors have to exist in this form 2526 * in order to satisfy the code which will get invoked when the 2527 * endpoint PCI function driver calls request_mem_region() or 2528 * request_mem_region_exclusive(). 2529 * 2530 * Return: 0 on success, -errno on failure 2531 */ 2532 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) 2533 { 2534 resource_size_t align; 2535 int ret; 2536 2537 if (hbus->low_mmio_space) { 2538 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2539 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0, 2540 (u64)(u32)0xffffffff, 2541 hbus->low_mmio_space, 2542 align, false); 2543 if (ret) { 2544 dev_err(&hbus->hdev->device, 2545 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n", 2546 hbus->low_mmio_space); 2547 return ret; 2548 } 2549 2550 /* Modify this resource to become a bridge window. */ 2551 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW; 2552 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY; 2553 pci_add_resource(&hbus->resources_for_children, 2554 hbus->low_mmio_res); 2555 } 2556 2557 if (hbus->high_mmio_space) { 2558 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space)); 2559 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev, 2560 0x100000000, -1, 2561 hbus->high_mmio_space, align, 2562 false); 2563 if (ret) { 2564 dev_err(&hbus->hdev->device, 2565 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n", 2566 hbus->high_mmio_space); 2567 goto release_low_mmio; 2568 } 2569 2570 /* Modify this resource to become a bridge window. */ 2571 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW; 2572 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY; 2573 pci_add_resource(&hbus->resources_for_children, 2574 hbus->high_mmio_res); 2575 } 2576 2577 return 0; 2578 2579 release_low_mmio: 2580 if (hbus->low_mmio_res) { 2581 vmbus_free_mmio(hbus->low_mmio_res->start, 2582 resource_size(hbus->low_mmio_res)); 2583 } 2584 2585 return ret; 2586 } 2587 2588 /** 2589 * hv_allocate_config_window() - Find MMIO space for PCI Config 2590 * @hbus: Root PCI bus, as understood by this driver 2591 * 2592 * This function claims memory-mapped I/O space for accessing 2593 * configuration space for the functions on this bus. 2594 * 2595 * Return: 0 on success, -errno on failure 2596 */ 2597 static int hv_allocate_config_window(struct hv_pcibus_device *hbus) 2598 { 2599 int ret; 2600 2601 /* 2602 * Set up a region of MMIO space to use for accessing configuration 2603 * space. 2604 */ 2605 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1, 2606 PCI_CONFIG_MMIO_LENGTH, 0x1000, false); 2607 if (ret) 2608 return ret; 2609 2610 /* 2611 * vmbus_allocate_mmio() gets used for allocating both device endpoint 2612 * resource claims (those which cannot be overlapped) and the ranges 2613 * which are valid for the children of this bus, which are intended 2614 * to be overlapped by those children. Set the flag on this claim 2615 * meaning that this region can't be overlapped. 2616 */ 2617 2618 hbus->mem_config->flags |= IORESOURCE_BUSY; 2619 2620 return 0; 2621 } 2622 2623 static void hv_free_config_window(struct hv_pcibus_device *hbus) 2624 { 2625 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); 2626 } 2627 2628 /** 2629 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state 2630 * @hdev: VMBus's tracking struct for this root PCI bus 2631 * 2632 * Return: 0 on success, -errno on failure 2633 */ 2634 static int hv_pci_enter_d0(struct hv_device *hdev) 2635 { 2636 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2637 struct pci_bus_d0_entry *d0_entry; 2638 struct hv_pci_compl comp_pkt; 2639 struct pci_packet *pkt; 2640 int ret; 2641 2642 /* 2643 * Tell the host that the bus is ready to use, and moved into the 2644 * powered-on state. This includes telling the host which region 2645 * of memory-mapped I/O space has been chosen for configuration space 2646 * access. 2647 */ 2648 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL); 2649 if (!pkt) 2650 return -ENOMEM; 2651 2652 init_completion(&comp_pkt.host_event); 2653 pkt->completion_func = hv_pci_generic_compl; 2654 pkt->compl_ctxt = &comp_pkt; 2655 d0_entry = (struct pci_bus_d0_entry *)&pkt->message; 2656 d0_entry->message_type.type = PCI_BUS_D0ENTRY; 2657 d0_entry->mmio_base = hbus->mem_config->start; 2658 2659 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry), 2660 (unsigned long)pkt, VM_PKT_DATA_INBAND, 2661 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2662 if (!ret) 2663 ret = wait_for_response(hdev, &comp_pkt.host_event); 2664 2665 if (ret) 2666 goto exit; 2667 2668 if (comp_pkt.completion_status < 0) { 2669 dev_err(&hdev->device, 2670 "PCI Pass-through VSP failed D0 Entry with status %x\n", 2671 comp_pkt.completion_status); 2672 ret = -EPROTO; 2673 goto exit; 2674 } 2675 2676 ret = 0; 2677 2678 exit: 2679 kfree(pkt); 2680 return ret; 2681 } 2682 2683 /** 2684 * hv_pci_query_relations() - Ask host to send list of child 2685 * devices 2686 * @hdev: VMBus's tracking struct for this root PCI bus 2687 * 2688 * Return: 0 on success, -errno on failure 2689 */ 2690 static int hv_pci_query_relations(struct hv_device *hdev) 2691 { 2692 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2693 struct pci_message message; 2694 struct completion comp; 2695 int ret; 2696 2697 /* Ask the host to send along the list of child devices */ 2698 init_completion(&comp); 2699 if (cmpxchg(&hbus->survey_event, NULL, &comp)) 2700 return -ENOTEMPTY; 2701 2702 memset(&message, 0, sizeof(message)); 2703 message.type = PCI_QUERY_BUS_RELATIONS; 2704 2705 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message), 2706 0, VM_PKT_DATA_INBAND, 0); 2707 if (!ret) 2708 ret = wait_for_response(hdev, &comp); 2709 2710 return ret; 2711 } 2712 2713 /** 2714 * hv_send_resources_allocated() - Report local resource choices 2715 * @hdev: VMBus's tracking struct for this root PCI bus 2716 * 2717 * The host OS is expecting to be sent a request as a message 2718 * which contains all the resources that the device will use. 2719 * The response contains those same resources, "translated" 2720 * which is to say, the values which should be used by the 2721 * hardware, when it delivers an interrupt. (MMIO resources are 2722 * used in local terms.) This is nice for Windows, and lines up 2723 * with the FDO/PDO split, which doesn't exist in Linux. Linux 2724 * is deeply expecting to scan an emulated PCI configuration 2725 * space. So this message is sent here only to drive the state 2726 * machine on the host forward. 2727 * 2728 * Return: 0 on success, -errno on failure 2729 */ 2730 static int hv_send_resources_allocated(struct hv_device *hdev) 2731 { 2732 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2733 struct pci_resources_assigned *res_assigned; 2734 struct pci_resources_assigned2 *res_assigned2; 2735 struct hv_pci_compl comp_pkt; 2736 struct hv_pci_dev *hpdev; 2737 struct pci_packet *pkt; 2738 size_t size_res; 2739 u32 wslot; 2740 int ret; 2741 2742 size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) 2743 ? sizeof(*res_assigned) : sizeof(*res_assigned2); 2744 2745 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); 2746 if (!pkt) 2747 return -ENOMEM; 2748 2749 ret = 0; 2750 2751 for (wslot = 0; wslot < 256; wslot++) { 2752 hpdev = get_pcichild_wslot(hbus, wslot); 2753 if (!hpdev) 2754 continue; 2755 2756 memset(pkt, 0, sizeof(*pkt) + size_res); 2757 init_completion(&comp_pkt.host_event); 2758 pkt->completion_func = hv_pci_generic_compl; 2759 pkt->compl_ctxt = &comp_pkt; 2760 2761 if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { 2762 res_assigned = 2763 (struct pci_resources_assigned *)&pkt->message; 2764 res_assigned->message_type.type = 2765 PCI_RESOURCES_ASSIGNED; 2766 res_assigned->wslot.slot = hpdev->desc.win_slot.slot; 2767 } else { 2768 res_assigned2 = 2769 (struct pci_resources_assigned2 *)&pkt->message; 2770 res_assigned2->message_type.type = 2771 PCI_RESOURCES_ASSIGNED2; 2772 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot; 2773 } 2774 put_pcichild(hpdev); 2775 2776 ret = vmbus_sendpacket(hdev->channel, &pkt->message, 2777 size_res, (unsigned long)pkt, 2778 VM_PKT_DATA_INBAND, 2779 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2780 if (!ret) 2781 ret = wait_for_response(hdev, &comp_pkt.host_event); 2782 if (ret) 2783 break; 2784 2785 if (comp_pkt.completion_status < 0) { 2786 ret = -EPROTO; 2787 dev_err(&hdev->device, 2788 "resource allocated returned 0x%x", 2789 comp_pkt.completion_status); 2790 break; 2791 } 2792 } 2793 2794 kfree(pkt); 2795 return ret; 2796 } 2797 2798 /** 2799 * hv_send_resources_released() - Report local resources 2800 * released 2801 * @hdev: VMBus's tracking struct for this root PCI bus 2802 * 2803 * Return: 0 on success, -errno on failure 2804 */ 2805 static int hv_send_resources_released(struct hv_device *hdev) 2806 { 2807 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2808 struct pci_child_message pkt; 2809 struct hv_pci_dev *hpdev; 2810 u32 wslot; 2811 int ret; 2812 2813 for (wslot = 0; wslot < 256; wslot++) { 2814 hpdev = get_pcichild_wslot(hbus, wslot); 2815 if (!hpdev) 2816 continue; 2817 2818 memset(&pkt, 0, sizeof(pkt)); 2819 pkt.message_type.type = PCI_RESOURCES_RELEASED; 2820 pkt.wslot.slot = hpdev->desc.win_slot.slot; 2821 2822 put_pcichild(hpdev); 2823 2824 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0, 2825 VM_PKT_DATA_INBAND, 0); 2826 if (ret) 2827 return ret; 2828 } 2829 2830 return 0; 2831 } 2832 2833 static void get_hvpcibus(struct hv_pcibus_device *hbus) 2834 { 2835 refcount_inc(&hbus->remove_lock); 2836 } 2837 2838 static void put_hvpcibus(struct hv_pcibus_device *hbus) 2839 { 2840 if (refcount_dec_and_test(&hbus->remove_lock)) 2841 complete(&hbus->remove_event); 2842 } 2843 2844 #define HVPCI_DOM_MAP_SIZE (64 * 1024) 2845 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); 2846 2847 /* 2848 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 2849 * as invalid for passthrough PCI devices of this driver. 2850 */ 2851 #define HVPCI_DOM_INVALID 0 2852 2853 /** 2854 * hv_get_dom_num() - Get a valid PCI domain number 2855 * Check if the PCI domain number is in use, and return another number if 2856 * it is in use. 2857 * 2858 * @dom: Requested domain number 2859 * 2860 * return: domain number on success, HVPCI_DOM_INVALID on failure 2861 */ 2862 static u16 hv_get_dom_num(u16 dom) 2863 { 2864 unsigned int i; 2865 2866 if (test_and_set_bit(dom, hvpci_dom_map) == 0) 2867 return dom; 2868 2869 for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { 2870 if (test_and_set_bit(i, hvpci_dom_map) == 0) 2871 return i; 2872 } 2873 2874 return HVPCI_DOM_INVALID; 2875 } 2876 2877 /** 2878 * hv_put_dom_num() - Mark the PCI domain number as free 2879 * @dom: Domain number to be freed 2880 */ 2881 static void hv_put_dom_num(u16 dom) 2882 { 2883 clear_bit(dom, hvpci_dom_map); 2884 } 2885 2886 /** 2887 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus 2888 * @hdev: VMBus's tracking struct for this root PCI bus 2889 * @dev_id: Identifies the device itself 2890 * 2891 * Return: 0 on success, -errno on failure 2892 */ 2893 static int hv_pci_probe(struct hv_device *hdev, 2894 const struct hv_vmbus_device_id *dev_id) 2895 { 2896 struct hv_pcibus_device *hbus; 2897 u16 dom_req, dom; 2898 char *name; 2899 int ret; 2900 2901 /* 2902 * hv_pcibus_device contains the hypercall arguments for retargeting in 2903 * hv_irq_unmask(). Those must not cross a page boundary. 2904 */ 2905 BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE); 2906 2907 /* 2908 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural 2909 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate 2910 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and 2911 * alignment of hbus is important because hbus's field 2912 * retarget_msi_interrupt_params must not cross a 4KB page boundary. 2913 * 2914 * Here we prefer kzalloc to get_zeroed_page(), because a buffer 2915 * allocated by the latter is not tracked and scanned by kmemleak, and 2916 * hence kmemleak reports the pointer contained in the hbus buffer 2917 * (i.e. the hpdev struct, which is created in new_pcichild_device() and 2918 * is tracked by hbus->children) as memory leak (false positive). 2919 * 2920 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be 2921 * used to allocate the hbus buffer and we can avoid the kmemleak false 2922 * positive by using kmemleak_alloc() and kmemleak_free() to ask 2923 * kmemleak to track and scan the hbus buffer. 2924 */ 2925 hbus = (struct hv_pcibus_device *)kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 2926 if (!hbus) 2927 return -ENOMEM; 2928 hbus->state = hv_pcibus_init; 2929 2930 /* 2931 * The PCI bus "domain" is what is called "segment" in ACPI and other 2932 * specs. Pull it from the instance ID, to get something usually 2933 * unique. In rare cases of collision, we will find out another number 2934 * not in use. 2935 * 2936 * Note that, since this code only runs in a Hyper-V VM, Hyper-V 2937 * together with this guest driver can guarantee that (1) The only 2938 * domain used by Gen1 VMs for something that looks like a physical 2939 * PCI bus (which is actually emulated by the hypervisor) is domain 0. 2940 * (2) There will be no overlap between domains (after fixing possible 2941 * collisions) in the same VM. 2942 */ 2943 dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; 2944 dom = hv_get_dom_num(dom_req); 2945 2946 if (dom == HVPCI_DOM_INVALID) { 2947 dev_err(&hdev->device, 2948 "Unable to use dom# 0x%hx or other numbers", dom_req); 2949 ret = -EINVAL; 2950 goto free_bus; 2951 } 2952 2953 if (dom != dom_req) 2954 dev_info(&hdev->device, 2955 "PCI dom# 0x%hx has collision, using 0x%hx", 2956 dom_req, dom); 2957 2958 hbus->sysdata.domain = dom; 2959 2960 hbus->hdev = hdev; 2961 refcount_set(&hbus->remove_lock, 1); 2962 INIT_LIST_HEAD(&hbus->children); 2963 INIT_LIST_HEAD(&hbus->dr_list); 2964 INIT_LIST_HEAD(&hbus->resources_for_children); 2965 spin_lock_init(&hbus->config_lock); 2966 spin_lock_init(&hbus->device_list_lock); 2967 spin_lock_init(&hbus->retarget_msi_interrupt_lock); 2968 init_completion(&hbus->remove_event); 2969 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0, 2970 hbus->sysdata.domain); 2971 if (!hbus->wq) { 2972 ret = -ENOMEM; 2973 goto free_dom; 2974 } 2975 2976 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 2977 hv_pci_onchannelcallback, hbus); 2978 if (ret) 2979 goto destroy_wq; 2980 2981 hv_set_drvdata(hdev, hbus); 2982 2983 ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, 2984 ARRAY_SIZE(pci_protocol_versions)); 2985 if (ret) 2986 goto close; 2987 2988 ret = hv_allocate_config_window(hbus); 2989 if (ret) 2990 goto close; 2991 2992 hbus->cfg_addr = ioremap(hbus->mem_config->start, 2993 PCI_CONFIG_MMIO_LENGTH); 2994 if (!hbus->cfg_addr) { 2995 dev_err(&hdev->device, 2996 "Unable to map a virtual address for config space\n"); 2997 ret = -ENOMEM; 2998 goto free_config; 2999 } 3000 3001 name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); 3002 if (!name) { 3003 ret = -ENOMEM; 3004 goto unmap; 3005 } 3006 3007 hbus->sysdata.fwnode = irq_domain_alloc_named_fwnode(name); 3008 kfree(name); 3009 if (!hbus->sysdata.fwnode) { 3010 ret = -ENOMEM; 3011 goto unmap; 3012 } 3013 3014 ret = hv_pcie_init_irq_domain(hbus); 3015 if (ret) 3016 goto free_fwnode; 3017 3018 ret = hv_pci_query_relations(hdev); 3019 if (ret) 3020 goto free_irq_domain; 3021 3022 ret = hv_pci_enter_d0(hdev); 3023 if (ret) 3024 goto free_irq_domain; 3025 3026 ret = hv_pci_allocate_bridge_windows(hbus); 3027 if (ret) 3028 goto free_irq_domain; 3029 3030 ret = hv_send_resources_allocated(hdev); 3031 if (ret) 3032 goto free_windows; 3033 3034 prepopulate_bars(hbus); 3035 3036 hbus->state = hv_pcibus_probed; 3037 3038 ret = create_root_hv_pci_bus(hbus); 3039 if (ret) 3040 goto free_windows; 3041 3042 return 0; 3043 3044 free_windows: 3045 hv_pci_free_bridge_windows(hbus); 3046 free_irq_domain: 3047 irq_domain_remove(hbus->irq_domain); 3048 free_fwnode: 3049 irq_domain_free_fwnode(hbus->sysdata.fwnode); 3050 unmap: 3051 iounmap(hbus->cfg_addr); 3052 free_config: 3053 hv_free_config_window(hbus); 3054 close: 3055 vmbus_close(hdev->channel); 3056 destroy_wq: 3057 destroy_workqueue(hbus->wq); 3058 free_dom: 3059 hv_put_dom_num(hbus->sysdata.domain); 3060 free_bus: 3061 free_page((unsigned long)hbus); 3062 return ret; 3063 } 3064 3065 static int hv_pci_bus_exit(struct hv_device *hdev, bool hibernating) 3066 { 3067 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3068 struct { 3069 struct pci_packet teardown_packet; 3070 u8 buffer[sizeof(struct pci_message)]; 3071 } pkt; 3072 struct pci_bus_relations relations; 3073 struct hv_pci_compl comp_pkt; 3074 int ret; 3075 3076 /* 3077 * After the host sends the RESCIND_CHANNEL message, it doesn't 3078 * access the per-channel ringbuffer any longer. 3079 */ 3080 if (hdev->channel->rescind) 3081 return 0; 3082 3083 if (!hibernating) { 3084 /* Delete any children which might still exist. */ 3085 memset(&relations, 0, sizeof(relations)); 3086 hv_pci_devices_present(hbus, &relations); 3087 } 3088 3089 ret = hv_send_resources_released(hdev); 3090 if (ret) { 3091 dev_err(&hdev->device, 3092 "Couldn't send resources released packet(s)\n"); 3093 return ret; 3094 } 3095 3096 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); 3097 init_completion(&comp_pkt.host_event); 3098 pkt.teardown_packet.completion_func = hv_pci_generic_compl; 3099 pkt.teardown_packet.compl_ctxt = &comp_pkt; 3100 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; 3101 3102 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message, 3103 sizeof(struct pci_message), 3104 (unsigned long)&pkt.teardown_packet, 3105 VM_PKT_DATA_INBAND, 3106 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3107 if (ret) 3108 return ret; 3109 3110 if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) 3111 return -ETIMEDOUT; 3112 3113 return 0; 3114 } 3115 3116 /** 3117 * hv_pci_remove() - Remove routine for this VMBus channel 3118 * @hdev: VMBus's tracking struct for this root PCI bus 3119 * 3120 * Return: 0 on success, -errno on failure 3121 */ 3122 static int hv_pci_remove(struct hv_device *hdev) 3123 { 3124 struct hv_pcibus_device *hbus; 3125 int ret; 3126 3127 hbus = hv_get_drvdata(hdev); 3128 if (hbus->state == hv_pcibus_installed) { 3129 /* Remove the bus from PCI's point of view. */ 3130 pci_lock_rescan_remove(); 3131 pci_stop_root_bus(hbus->pci_bus); 3132 hv_pci_remove_slots(hbus); 3133 pci_remove_root_bus(hbus->pci_bus); 3134 pci_unlock_rescan_remove(); 3135 hbus->state = hv_pcibus_removed; 3136 } 3137 3138 ret = hv_pci_bus_exit(hdev, false); 3139 3140 vmbus_close(hdev->channel); 3141 3142 iounmap(hbus->cfg_addr); 3143 hv_free_config_window(hbus); 3144 pci_free_resource_list(&hbus->resources_for_children); 3145 hv_pci_free_bridge_windows(hbus); 3146 irq_domain_remove(hbus->irq_domain); 3147 irq_domain_free_fwnode(hbus->sysdata.fwnode); 3148 put_hvpcibus(hbus); 3149 wait_for_completion(&hbus->remove_event); 3150 destroy_workqueue(hbus->wq); 3151 3152 hv_put_dom_num(hbus->sysdata.domain); 3153 3154 kfree(hbus); 3155 return ret; 3156 } 3157 3158 static int hv_pci_suspend(struct hv_device *hdev) 3159 { 3160 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3161 enum hv_pcibus_state old_state; 3162 int ret; 3163 3164 /* 3165 * hv_pci_suspend() must make sure there are no pending work items 3166 * before calling vmbus_close(), since it runs in a process context 3167 * as a callback in dpm_suspend(). When it starts to run, the channel 3168 * callback hv_pci_onchannelcallback(), which runs in a tasklet 3169 * context, can be still running concurrently and scheduling new work 3170 * items onto hbus->wq in hv_pci_devices_present() and 3171 * hv_pci_eject_device(), and the work item handlers can access the 3172 * vmbus channel, which can be being closed by hv_pci_suspend(), e.g. 3173 * the work item handler pci_devices_present_work() -> 3174 * new_pcichild_device() writes to the vmbus channel. 3175 * 3176 * To eliminate the race, hv_pci_suspend() disables the channel 3177 * callback tasklet, sets hbus->state to hv_pcibus_removing, and 3178 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds, 3179 * it knows that no new work item can be scheduled, and then it flushes 3180 * hbus->wq and safely closes the vmbus channel. 3181 */ 3182 tasklet_disable(&hdev->channel->callback_event); 3183 3184 /* Change the hbus state to prevent new work items. */ 3185 old_state = hbus->state; 3186 if (hbus->state == hv_pcibus_installed) 3187 hbus->state = hv_pcibus_removing; 3188 3189 tasklet_enable(&hdev->channel->callback_event); 3190 3191 if (old_state != hv_pcibus_installed) 3192 return -EINVAL; 3193 3194 flush_workqueue(hbus->wq); 3195 3196 ret = hv_pci_bus_exit(hdev, true); 3197 if (ret) 3198 return ret; 3199 3200 vmbus_close(hdev->channel); 3201 3202 return 0; 3203 } 3204 3205 static int hv_pci_resume(struct hv_device *hdev) 3206 { 3207 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3208 enum pci_protocol_version_t version[1]; 3209 int ret; 3210 3211 hbus->state = hv_pcibus_init; 3212 3213 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 3214 hv_pci_onchannelcallback, hbus); 3215 if (ret) 3216 return ret; 3217 3218 /* Only use the version that was in use before hibernation. */ 3219 version[0] = hbus->protocol_version; 3220 ret = hv_pci_protocol_negotiation(hdev, version, 1); 3221 if (ret) 3222 goto out; 3223 3224 ret = hv_pci_query_relations(hdev); 3225 if (ret) 3226 goto out; 3227 3228 ret = hv_pci_enter_d0(hdev); 3229 if (ret) 3230 goto out; 3231 3232 ret = hv_send_resources_allocated(hdev); 3233 if (ret) 3234 goto out; 3235 3236 prepopulate_bars(hbus); 3237 3238 hbus->state = hv_pcibus_installed; 3239 return 0; 3240 out: 3241 vmbus_close(hdev->channel); 3242 return ret; 3243 } 3244 3245 static const struct hv_vmbus_device_id hv_pci_id_table[] = { 3246 /* PCI Pass-through Class ID */ 3247 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */ 3248 { HV_PCIE_GUID, }, 3249 { }, 3250 }; 3251 3252 MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table); 3253 3254 static struct hv_driver hv_pci_drv = { 3255 .name = "hv_pci", 3256 .id_table = hv_pci_id_table, 3257 .probe = hv_pci_probe, 3258 .remove = hv_pci_remove, 3259 .suspend = hv_pci_suspend, 3260 .resume = hv_pci_resume, 3261 }; 3262 3263 static void __exit exit_hv_pci_drv(void) 3264 { 3265 vmbus_driver_unregister(&hv_pci_drv); 3266 3267 hvpci_block_ops.read_block = NULL; 3268 hvpci_block_ops.write_block = NULL; 3269 hvpci_block_ops.reg_blk_invalidate = NULL; 3270 } 3271 3272 static int __init init_hv_pci_drv(void) 3273 { 3274 /* Set the invalid domain number's bit, so it will not be used */ 3275 set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); 3276 3277 /* Initialize PCI block r/w interface */ 3278 hvpci_block_ops.read_block = hv_read_config_block; 3279 hvpci_block_ops.write_block = hv_write_config_block; 3280 hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate; 3281 3282 return vmbus_driver_register(&hv_pci_drv); 3283 } 3284 3285 module_init(init_hv_pci_drv); 3286 module_exit(exit_hv_pci_drv); 3287 3288 MODULE_DESCRIPTION("Hyper-V PCI"); 3289 MODULE_LICENSE("GPL v2"); 3290