// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) Microsoft Corporation.
 *
 * Author:
 *   Jake Oshins <jakeo@microsoft.com>
 *
 * This driver acts as a paravirtual front-end for PCI Express root buses.
 * When a PCI Express function (either an entire device or an SR-IOV
 * Virtual Function) is being passed through to the VM, this driver exposes
 * a new bus to the guest VM. This is modeled as a root PCI bus because
 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 * until a device has been exposed using this driver.
 *
 * Each root PCI bus has its own PCI domain, which is called "Segment" in
 * the PCI Firmware Specifications. Thus while each device passed through
 * to the VM using this front-end will appear at "device 0", the domain will
 * be unique. Typically, each bus will have one PCI function on it, though
 * this driver does support more than one.
 *
 * In order to map the interrupts from the device through to the guest VM,
 * this driver also implements an IRQ Domain, which handles interrupts (either
 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
 * set up, torn down, or reaffined, this driver communicates with the
 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
 * interrupt will be delivered to the correct virtual processor at the right
 * vector. This driver does not support level-triggered (line-based)
 * interrupts, and will report that the Interrupt Line register in the
 * function's configuration space is zero.
 *
 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
 * facilities. For instance, the configuration space of a function exposed
 * by Hyper-V is mapped into a single page of memory space, and the
 * read and write handlers for config space must be aware of this mechanism.
 * Similarly, device setup and teardown involves messages sent to and from
 * the PCI back-end driver in Hyper-V.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci-ecam.h>
#include <linux/delay.h>
#include <linux/semaphore.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/hyperv.h>
#include <linux/refcount.h>
#include <linux/irqdomain.h>
#include <linux/acpi.h>
#include <asm/mshyperv.h>

/*
 * Protocol versions. The low word is the minor version, the high word the
 * major version.
 */

#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)

enum pci_protocol_version_t {
	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),	/* Win10 */
	PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2),	/* RS1 */
	PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3),	/* Vibranium */
	PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4),	/* WS2022 */
};

#define CPU_AFFINITY_ALL	-1ULL

/*
 * Supported protocol versions in the order of probing - highest goes
 * first.
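 *
 * For illustration only (not part of the protocol definitions above), a
 * negotiated version value decomposes with the macros defined earlier as
 * follows; the local variable names are hypothetical:
 *
 *	u32 ver   = PCI_PROTOCOL_VERSION_1_4;	(PCI_MAKE_VERSION(1, 4) == 0x00010004)
 *	u32 major = PCI_MAJOR_VERSION(ver);	(== 1)
 *	u32 minor = PCI_MINOR_VERSION(ver);	(== 4)
 *
 * Version negotiation walks the array below from the top and settles on the
 * first version the host accepts (see the version request message further
 * down in this file).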
75 */ 76 static enum pci_protocol_version_t pci_protocol_versions[] = { 77 PCI_PROTOCOL_VERSION_1_4, 78 PCI_PROTOCOL_VERSION_1_3, 79 PCI_PROTOCOL_VERSION_1_2, 80 PCI_PROTOCOL_VERSION_1_1, 81 }; 82 83 #define PCI_CONFIG_MMIO_LENGTH 0x2000 84 #define CFG_PAGE_OFFSET 0x1000 85 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 86 87 #define MAX_SUPPORTED_MSI_MESSAGES 0x400 88 89 #define STATUS_REVISION_MISMATCH 0xC0000059 90 91 /* space for 32bit serial number as string */ 92 #define SLOT_NAME_SIZE 11 93 94 /* 95 * Message Types 96 */ 97 98 enum pci_message_type { 99 /* 100 * Version 1.1 101 */ 102 PCI_MESSAGE_BASE = 0x42490000, 103 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 104 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 105 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 106 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 107 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 108 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 109 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 110 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 111 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 112 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 113 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 114 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 115 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 116 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 117 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 118 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 119 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 120 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 121 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 122 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 123 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, 124 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, 125 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ 126 PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, 127 PCI_RESOURCES_ASSIGNED3 = PCI_MESSAGE_BASE + 0x1A, 128 PCI_CREATE_INTERRUPT_MESSAGE3 = PCI_MESSAGE_BASE + 0x1B, 129 PCI_MESSAGE_MAXIMUM 130 }; 131 132 /* 133 * Structures defining the virtual PCI Express protocol. 134 */ 135 136 union pci_version { 137 struct { 138 u16 minor_version; 139 u16 major_version; 140 } parts; 141 u32 version; 142 } __packed; 143 144 /* 145 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 146 * which is all this driver does. This representation is the one used in 147 * Windows, which is what is expected when sending this back and forth with 148 * the Hyper-V parent partition. 149 */ 150 union win_slot_encoding { 151 struct { 152 u32 dev:5; 153 u32 func:3; 154 u32 reserved:24; 155 } bits; 156 u32 slot; 157 } __packed; 158 159 /* 160 * Pretty much as defined in the PCI Specifications. 
161 */ 162 struct pci_function_description { 163 u16 v_id; /* vendor ID */ 164 u16 d_id; /* device ID */ 165 u8 rev; 166 u8 prog_intf; 167 u8 subclass; 168 u8 base_class; 169 u32 subsystem_id; 170 union win_slot_encoding win_slot; 171 u32 ser; /* serial number */ 172 } __packed; 173 174 enum pci_device_description_flags { 175 HV_PCI_DEVICE_FLAG_NONE = 0x0, 176 HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1, 177 }; 178 179 struct pci_function_description2 { 180 u16 v_id; /* vendor ID */ 181 u16 d_id; /* device ID */ 182 u8 rev; 183 u8 prog_intf; 184 u8 subclass; 185 u8 base_class; 186 u32 subsystem_id; 187 union win_slot_encoding win_slot; 188 u32 ser; /* serial number */ 189 u32 flags; 190 u16 virtual_numa_node; 191 u16 reserved; 192 } __packed; 193 194 /** 195 * struct hv_msi_desc 196 * @vector: IDT entry 197 * @delivery_mode: As defined in Intel's Programmer's 198 * Reference Manual, Volume 3, Chapter 8. 199 * @vector_count: Number of contiguous entries in the 200 * Interrupt Descriptor Table that are 201 * occupied by this Message-Signaled 202 * Interrupt. For "MSI", as first defined 203 * in PCI 2.2, this can be between 1 and 204 * 32. For "MSI-X," as first defined in PCI 205 * 3.0, this must be 1, as each MSI-X table 206 * entry would have its own descriptor. 207 * @reserved: Empty space 208 * @cpu_mask: All the target virtual processors. 209 */ 210 struct hv_msi_desc { 211 u8 vector; 212 u8 delivery_mode; 213 u16 vector_count; 214 u32 reserved; 215 u64 cpu_mask; 216 } __packed; 217 218 /** 219 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc 220 * @vector: IDT entry 221 * @delivery_mode: As defined in Intel's Programmer's 222 * Reference Manual, Volume 3, Chapter 8. 223 * @vector_count: Number of contiguous entries in the 224 * Interrupt Descriptor Table that are 225 * occupied by this Message-Signaled 226 * Interrupt. For "MSI", as first defined 227 * in PCI 2.2, this can be between 1 and 228 * 32. For "MSI-X," as first defined in PCI 229 * 3.0, this must be 1, as each MSI-X table 230 * entry would have its own descriptor. 231 * @processor_count: number of bits enabled in array. 232 * @processor_array: All the target virtual processors. 233 */ 234 struct hv_msi_desc2 { 235 u8 vector; 236 u8 delivery_mode; 237 u16 vector_count; 238 u16 processor_count; 239 u16 processor_array[32]; 240 } __packed; 241 242 /* 243 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc 244 * Everything is the same as in 'hv_msi_desc2' except that the size of the 245 * 'vector' field is larger to support bigger vector values. For ex: LPI 246 * vectors on ARM. 247 */ 248 struct hv_msi_desc3 { 249 u32 vector; 250 u8 delivery_mode; 251 u8 reserved; 252 u16 vector_count; 253 u16 processor_count; 254 u16 processor_array[32]; 255 } __packed; 256 257 /** 258 * struct tran_int_desc 259 * @reserved: unused, padding 260 * @vector_count: same as in hv_msi_desc 261 * @data: This is the "data payload" value that is 262 * written by the device when it generates 263 * a message-signaled interrupt, either MSI 264 * or MSI-X. 265 * @address: This is the address to which the data 266 * payload is written on interrupt 267 * generation. 268 */ 269 struct tran_int_desc { 270 u16 reserved; 271 u16 vector_count; 272 u32 data; 273 u64 address; 274 } __packed; 275 276 /* 277 * A generic message format for virtual PCI. 278 * Specific message formats are defined later in the file. 
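 *
 * As a rough, hedged sketch of how these messages are used later in this
 * file (the hdev, wslot and completion variables here are illustrative): a
 * sender wraps a type-specific message in a pci_packet, points the
 * completion context at a local completion structure, and passes the
 * packet's address to vmbus_sendpacket() as the request ID so that the
 * host's reply can be matched back to it:
 *
 *	struct {
 *		struct pci_packet pkt;
 *		u8 buf[sizeof(struct pci_child_message)];
 *	} ctxt;
 *	struct hv_pci_compl comp_pkt;
 *	struct pci_child_message *msg;
 *
 *	memset(&ctxt, 0, sizeof(ctxt));
 *	init_completion(&comp_pkt.host_event);
 *	ctxt.pkt.completion_func = hv_pci_generic_compl;
 *	ctxt.pkt.compl_ctxt = &comp_pkt;
 *	msg = (struct pci_child_message *)&ctxt.pkt.message;
 *	msg->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
 *	msg->wslot.slot = wslot;
 *	vmbus_sendpacket(hdev->channel, msg, sizeof(*msg),
 *			 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND,
 *			 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 *
 * (hv_pci_compl and hv_pci_generic_compl are defined further down.)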
279 */ 280 281 struct pci_message { 282 u32 type; 283 } __packed; 284 285 struct pci_child_message { 286 struct pci_message message_type; 287 union win_slot_encoding wslot; 288 } __packed; 289 290 struct pci_incoming_message { 291 struct vmpacket_descriptor hdr; 292 struct pci_message message_type; 293 } __packed; 294 295 struct pci_response { 296 struct vmpacket_descriptor hdr; 297 s32 status; /* negative values are failures */ 298 } __packed; 299 300 struct pci_packet { 301 void (*completion_func)(void *context, struct pci_response *resp, 302 int resp_packet_size); 303 void *compl_ctxt; 304 305 struct pci_message message[]; 306 }; 307 308 /* 309 * Specific message types supporting the PCI protocol. 310 */ 311 312 /* 313 * Version negotiation message. Sent from the guest to the host. 314 * The guest is free to try different versions until the host 315 * accepts the version. 316 * 317 * pci_version: The protocol version requested. 318 * is_last_attempt: If TRUE, this is the last version guest will request. 319 * reservedz: Reserved field, set to zero. 320 */ 321 322 struct pci_version_request { 323 struct pci_message message_type; 324 u32 protocol_version; 325 } __packed; 326 327 /* 328 * Bus D0 Entry. This is sent from the guest to the host when the virtual 329 * bus (PCI Express port) is ready for action. 330 */ 331 332 struct pci_bus_d0_entry { 333 struct pci_message message_type; 334 u32 reserved; 335 u64 mmio_base; 336 } __packed; 337 338 struct pci_bus_relations { 339 struct pci_incoming_message incoming; 340 u32 device_count; 341 struct pci_function_description func[]; 342 } __packed; 343 344 struct pci_bus_relations2 { 345 struct pci_incoming_message incoming; 346 u32 device_count; 347 struct pci_function_description2 func[]; 348 } __packed; 349 350 struct pci_q_res_req_response { 351 struct vmpacket_descriptor hdr; 352 s32 status; /* negative values are failures */ 353 u32 probed_bar[PCI_STD_NUM_BARS]; 354 } __packed; 355 356 struct pci_set_power { 357 struct pci_message message_type; 358 union win_slot_encoding wslot; 359 u32 power_state; /* In Windows terms */ 360 u32 reserved; 361 } __packed; 362 363 struct pci_set_power_response { 364 struct vmpacket_descriptor hdr; 365 s32 status; /* negative values are failures */ 366 union win_slot_encoding wslot; 367 u32 resultant_state; /* In Windows terms */ 368 u32 reserved; 369 } __packed; 370 371 struct pci_resources_assigned { 372 struct pci_message message_type; 373 union win_slot_encoding wslot; 374 u8 memory_range[0x14][6]; /* not used here */ 375 u32 msi_descriptors; 376 u32 reserved[4]; 377 } __packed; 378 379 struct pci_resources_assigned2 { 380 struct pci_message message_type; 381 union win_slot_encoding wslot; 382 u8 memory_range[0x14][6]; /* not used here */ 383 u32 msi_descriptor_count; 384 u8 reserved[70]; 385 } __packed; 386 387 struct pci_create_interrupt { 388 struct pci_message message_type; 389 union win_slot_encoding wslot; 390 struct hv_msi_desc int_desc; 391 } __packed; 392 393 struct pci_create_int_response { 394 struct pci_response response; 395 u32 reserved; 396 struct tran_int_desc int_desc; 397 } __packed; 398 399 struct pci_create_interrupt2 { 400 struct pci_message message_type; 401 union win_slot_encoding wslot; 402 struct hv_msi_desc2 int_desc; 403 } __packed; 404 405 struct pci_create_interrupt3 { 406 struct pci_message message_type; 407 union win_slot_encoding wslot; 408 struct hv_msi_desc3 int_desc; 409 } __packed; 410 411 struct pci_delete_interrupt { 412 struct pci_message message_type; 413 union 
win_slot_encoding wslot; 414 struct tran_int_desc int_desc; 415 } __packed; 416 417 /* 418 * Note: the VM must pass a valid block id, wslot and bytes_requested. 419 */ 420 struct pci_read_block { 421 struct pci_message message_type; 422 u32 block_id; 423 union win_slot_encoding wslot; 424 u32 bytes_requested; 425 } __packed; 426 427 struct pci_read_block_response { 428 struct vmpacket_descriptor hdr; 429 u32 status; 430 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 431 } __packed; 432 433 /* 434 * Note: the VM must pass a valid block id, wslot and byte_count. 435 */ 436 struct pci_write_block { 437 struct pci_message message_type; 438 u32 block_id; 439 union win_slot_encoding wslot; 440 u32 byte_count; 441 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 442 } __packed; 443 444 struct pci_dev_inval_block { 445 struct pci_incoming_message incoming; 446 union win_slot_encoding wslot; 447 u64 block_mask; 448 } __packed; 449 450 struct pci_dev_incoming { 451 struct pci_incoming_message incoming; 452 union win_slot_encoding wslot; 453 } __packed; 454 455 struct pci_eject_response { 456 struct pci_message message_type; 457 union win_slot_encoding wslot; 458 u32 status; 459 } __packed; 460 461 static int pci_ring_size = (4 * PAGE_SIZE); 462 463 /* 464 * Driver specific state. 465 */ 466 467 enum hv_pcibus_state { 468 hv_pcibus_init = 0, 469 hv_pcibus_probed, 470 hv_pcibus_installed, 471 hv_pcibus_removing, 472 hv_pcibus_maximum 473 }; 474 475 struct hv_pcibus_device { 476 #ifdef CONFIG_X86 477 struct pci_sysdata sysdata; 478 #elif defined(CONFIG_ARM64) 479 struct pci_config_window sysdata; 480 #endif 481 struct pci_host_bridge *bridge; 482 struct fwnode_handle *fwnode; 483 /* Protocol version negotiated with the host */ 484 enum pci_protocol_version_t protocol_version; 485 enum hv_pcibus_state state; 486 struct hv_device *hdev; 487 resource_size_t low_mmio_space; 488 resource_size_t high_mmio_space; 489 struct resource *mem_config; 490 struct resource *low_mmio_res; 491 struct resource *high_mmio_res; 492 struct completion *survey_event; 493 struct pci_bus *pci_bus; 494 spinlock_t config_lock; /* Avoid two threads writing index page */ 495 spinlock_t device_list_lock; /* Protect lists below */ 496 void __iomem *cfg_addr; 497 498 struct list_head children; 499 struct list_head dr_list; 500 501 struct msi_domain_info msi_info; 502 struct irq_domain *irq_domain; 503 504 spinlock_t retarget_msi_interrupt_lock; 505 506 struct workqueue_struct *wq; 507 508 /* Highest slot of child device with resources allocated */ 509 int wslot_res_allocated; 510 511 /* hypercall arg, must not cross page boundary */ 512 struct hv_retarget_device_interrupt retarget_msi_interrupt_params; 513 514 /* 515 * Don't put anything here: retarget_msi_interrupt_params must be last 516 */ 517 }; 518 519 /* 520 * Tracks "Device Relations" messages from the host, which must be both 521 * processed in order and deferred so that they don't run in the context 522 * of the incoming packet callback. 
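 *
 * A minimal sketch of that deferral, assuming the work handler named below
 * (the handler name is illustrative, not a reference to a specific
 * function):
 *
 *	struct hv_dr_work *dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
 *
 *	if (dr_wrk) {
 *		INIT_WORK(&dr_wrk->wrk, example_devices_present_work);
 *		dr_wrk->bus = hbus;
 *		queue_work(hbus->wq, &dr_wrk->wrk);
 *	}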
 */
struct hv_dr_work {
	struct work_struct wrk;
	struct hv_pcibus_device *bus;
};

struct hv_pcidev_description {
	u16 v_id;	/* vendor ID */
	u16 d_id;	/* device ID */
	u8 rev;
	u8 prog_intf;
	u8 subclass;
	u8 base_class;
	u32 subsystem_id;
	union win_slot_encoding win_slot;
	u32 ser;	/* serial number */
	u32 flags;
	u16 virtual_numa_node;
};

struct hv_dr_state {
	struct list_head list_entry;
	u32 device_count;
	struct hv_pcidev_description func[];
};

enum hv_pcichild_state {
	hv_pcichild_init = 0,
	hv_pcichild_requirements,
	hv_pcichild_resourced,
	hv_pcichild_ejecting,
	hv_pcichild_maximum
};

struct hv_pci_dev {
	/* List protected by pci_rescan_remove_lock */
	struct list_head list_entry;
	refcount_t refs;
	enum hv_pcichild_state state;
	struct pci_slot *pci_slot;
	struct hv_pcidev_description desc;
	bool reported_missing;
	struct hv_pcibus_device *hbus;
	struct work_struct wrk;

	void (*block_invalidate)(void *context, u64 block_mask);
	void *invalidate_context;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	u32 probed_bar[PCI_STD_NUM_BARS];
};

struct hv_pci_compl {
	struct completion host_event;
	s32 completion_status;
};

static void hv_pci_onchannelcallback(void *context);

#ifdef CONFIG_X86
#define DELIVERY_MODE	APIC_DELIVERY_MODE_FIXED
#define FLOW_HANDLER	handle_edge_irq
#define FLOW_NAME	"edge"

static int hv_pci_irqchip_init(void)
{
	return 0;
}

static struct irq_domain *hv_pci_get_root_domain(void)
{
	return x86_vector_domain;
}

static unsigned int hv_msi_get_int_vector(struct irq_data *data)
{
	struct irq_cfg *cfg = irqd_cfg(data);

	return cfg->vector;
}

static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
				       struct msi_desc *msi_desc)
{
	msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
	msi_entry->data.as_uint32 = msi_desc->msg.data;
}

static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
			  int nvec, msi_alloc_info_t *info)
{
	return pci_msi_prepare(domain, dev, nvec, info);
}

/**
 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
 * affinity.
 * @data:	Describes the IRQ
 *
 * Build a new destination for the MSI and make a hypercall to
 * update the Interrupt Redirection Table. "Device Logical ID"
 * is built out of this PCI bus's instance GUID and the function
 * number of the device.
 */
static void hv_arch_irq_unmask(struct irq_data *data)
{
	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
	struct hv_retarget_device_interrupt *params;
	struct hv_pcibus_device *hbus;
	struct cpumask *dest;
	cpumask_var_t tmp;
	struct pci_bus *pbus;
	struct pci_dev *pdev;
	unsigned long flags;
	u32 var_size = 0;
	int cpu, nr_bank;
	u64 res;

	dest = irq_data_get_effective_affinity_mask(data);
	pdev = msi_desc_to_pci_dev(msi_desc);
	pbus = pdev->bus;
	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);

	spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);

	params = &hbus->retarget_msi_interrupt_params;
	memset(params, 0, sizeof(*params));
	params->partition_id = HV_PARTITION_ID_SELF;
	params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
	hv_set_msi_entry_from_desc(&params->int_entry.msi_entry, msi_desc);
	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
			    (hbus->hdev->dev_instance.b[4] << 16) |
			    (hbus->hdev->dev_instance.b[7] << 8) |
			    (hbus->hdev->dev_instance.b[6] & 0xf8) |
			    PCI_FUNC(pdev->devfn);
	params->int_target.vector = hv_msi_get_int_vector(data);

	/*
	 * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by
	 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
	 * spurious interrupt storm. Not doing so does not seem to have a
	 * negative effect (yet?).
	 */

	if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
		/*
		 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
		 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
		 * with >64 VP support.
		 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
		 * is not sufficient for this hypercall.
		 */
		params->int_target.flags |=
			HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;

		if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
			res = 1;
			goto exit_unlock;
		}

		cpumask_and(tmp, dest, cpu_online_mask);
		nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
		free_cpumask_var(tmp);

		if (nr_bank <= 0) {
			res = 1;
			goto exit_unlock;
		}

		/*
		 * var-sized hypercall, var-size starts after vp_mask (thus
		 * vp_set.format does not count, but vp_set.valid_bank_mask
		 * does).
		 */
		var_size = 1 + nr_bank;
	} else {
		for_each_cpu_and(cpu, dest, cpu_online_mask) {
			params->int_target.vp_mask |=
				(1ULL << hv_cpu_number_to_vp_number(cpu));
		}
	}

	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
			      params, NULL);

exit_unlock:
	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);

	/*
	 * During hibernation, when a CPU is offlined, the kernel tries
	 * to move the interrupt to the remaining CPUs that haven't
	 * been offlined yet. In this case, the below hv_do_hypercall()
	 * always fails since the vmbus channel has been closed:
	 * refer to cpu_disable_common() -> fixup_irqs() ->
	 * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
	 *
	 * Suppress the error message for hibernation because the failure
	 * during hibernation does not matter (at this time all the devices
	 * have been frozen).
Note: the correct affinity info is still updated 725 * into the irqdata data structure in migrate_one_irq() -> 726 * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM 727 * resumes, hv_pci_restore_msi_state() is able to correctly restore 728 * the interrupt with the correct affinity. 729 */ 730 if (!hv_result_success(res) && hbus->state != hv_pcibus_removing) 731 dev_err(&hbus->hdev->device, 732 "%s() failed: %#llx", __func__, res); 733 } 734 #elif defined(CONFIG_ARM64) 735 /* 736 * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit 737 * of room at the start to allow for SPIs to be specified through ACPI and 738 * starting with a power of two to satisfy power of 2 multi-MSI requirement. 739 */ 740 #define HV_PCI_MSI_SPI_START 64 741 #define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START) 742 #define DELIVERY_MODE 0 743 #define FLOW_HANDLER NULL 744 #define FLOW_NAME NULL 745 #define hv_msi_prepare NULL 746 747 struct hv_pci_chip_data { 748 DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR); 749 struct mutex map_lock; 750 }; 751 752 /* Hyper-V vPCI MSI GIC IRQ domain */ 753 static struct irq_domain *hv_msi_gic_irq_domain; 754 755 /* Hyper-V PCI MSI IRQ chip */ 756 static struct irq_chip hv_arm64_msi_irq_chip = { 757 .name = "MSI", 758 .irq_set_affinity = irq_chip_set_affinity_parent, 759 .irq_eoi = irq_chip_eoi_parent, 760 .irq_mask = irq_chip_mask_parent, 761 .irq_unmask = irq_chip_unmask_parent 762 }; 763 764 static unsigned int hv_msi_get_int_vector(struct irq_data *irqd) 765 { 766 return irqd->parent_data->hwirq; 767 } 768 769 /* 770 * @nr_bm_irqs: Indicates the number of IRQs that were allocated from 771 * the bitmap. 772 * @nr_dom_irqs: Indicates the number of IRQs that were allocated from 773 * the parent domain. 
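 *
 * Note that the SPI bitmap helpers used below operate on naturally aligned
 * power-of-2 regions: for a request of nr_bm_irqs == 3, for instance,
 * get_count_order(3) == 2, so a region of 4 SPIs is reserved by
 * bitmap_find_free_region() and later released by bitmap_release_region().
 * This is consistent with the power-of-2 multi-MSI requirement mentioned
 * above.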
774 */ 775 static void hv_pci_vec_irq_free(struct irq_domain *domain, 776 unsigned int virq, 777 unsigned int nr_bm_irqs, 778 unsigned int nr_dom_irqs) 779 { 780 struct hv_pci_chip_data *chip_data = domain->host_data; 781 struct irq_data *d = irq_domain_get_irq_data(domain, virq); 782 int first = d->hwirq - HV_PCI_MSI_SPI_START; 783 int i; 784 785 mutex_lock(&chip_data->map_lock); 786 bitmap_release_region(chip_data->spi_map, 787 first, 788 get_count_order(nr_bm_irqs)); 789 mutex_unlock(&chip_data->map_lock); 790 for (i = 0; i < nr_dom_irqs; i++) { 791 if (i) 792 d = irq_domain_get_irq_data(domain, virq + i); 793 irq_domain_reset_irq_data(d); 794 } 795 796 irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs); 797 } 798 799 static void hv_pci_vec_irq_domain_free(struct irq_domain *domain, 800 unsigned int virq, 801 unsigned int nr_irqs) 802 { 803 hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs); 804 } 805 806 static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain, 807 unsigned int nr_irqs, 808 irq_hw_number_t *hwirq) 809 { 810 struct hv_pci_chip_data *chip_data = domain->host_data; 811 int index; 812 813 /* Find and allocate region from the SPI bitmap */ 814 mutex_lock(&chip_data->map_lock); 815 index = bitmap_find_free_region(chip_data->spi_map, 816 HV_PCI_MSI_SPI_NR, 817 get_count_order(nr_irqs)); 818 mutex_unlock(&chip_data->map_lock); 819 if (index < 0) 820 return -ENOSPC; 821 822 *hwirq = index + HV_PCI_MSI_SPI_START; 823 824 return 0; 825 } 826 827 static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain, 828 unsigned int virq, 829 irq_hw_number_t hwirq) 830 { 831 struct irq_fwspec fwspec; 832 struct irq_data *d; 833 int ret; 834 835 fwspec.fwnode = domain->parent->fwnode; 836 fwspec.param_count = 2; 837 fwspec.param[0] = hwirq; 838 fwspec.param[1] = IRQ_TYPE_EDGE_RISING; 839 840 ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); 841 if (ret) 842 return ret; 843 844 /* 845 * Since the interrupt specifier is not coming from ACPI or DT, the 846 * trigger type will need to be set explicitly. Otherwise, it will be 847 * set to whatever is in the GIC configuration. 848 */ 849 d = irq_domain_get_irq_data(domain->parent, virq); 850 851 return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); 852 } 853 854 static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain, 855 unsigned int virq, unsigned int nr_irqs, 856 void *args) 857 { 858 irq_hw_number_t hwirq; 859 unsigned int i; 860 int ret; 861 862 ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq); 863 if (ret) 864 return ret; 865 866 for (i = 0; i < nr_irqs; i++) { 867 ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i, 868 hwirq + i); 869 if (ret) { 870 hv_pci_vec_irq_free(domain, virq, nr_irqs, i); 871 return ret; 872 } 873 874 irq_domain_set_hwirq_and_chip(domain, virq + i, 875 hwirq + i, 876 &hv_arm64_msi_irq_chip, 877 domain->host_data); 878 pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i); 879 } 880 881 return 0; 882 } 883 884 /* 885 * Pick the first cpu as the irq affinity that can be temporarily used for 886 * composing MSI from the hypervisor. GIC will eventually set the right 887 * affinity for the irq and the 'unmask' will retarget the interrupt to that 888 * cpu. 
 */
static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
					  struct irq_data *irqd, bool reserve)
{
	int cpu = cpumask_first(cpu_present_mask);

	irq_data_update_effective_affinity(irqd, cpumask_of(cpu));

	return 0;
}

static const struct irq_domain_ops hv_pci_domain_ops = {
	.alloc	= hv_pci_vec_irq_domain_alloc,
	.free	= hv_pci_vec_irq_domain_free,
	.activate = hv_pci_vec_irq_domain_activate,
};

static int hv_pci_irqchip_init(void)
{
	static struct hv_pci_chip_data *chip_data;
	struct fwnode_handle *fn = NULL;
	int ret = -ENOMEM;

	chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
	if (!chip_data)
		return ret;

	mutex_init(&chip_data->map_lock);
	fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64");
	if (!fn)
		goto free_chip;

	/*
	 * Once the IRQ domain is enabled, it should not be removed, since
	 * there is no way to ensure that all the corresponding devices are
	 * also gone and that no more interrupts will be generated.
	 */
	hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
							  fn, &hv_pci_domain_ops,
							  chip_data);

	if (!hv_msi_gic_irq_domain) {
		pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
		goto free_chip;
	}

	return 0;

free_chip:
	kfree(chip_data);
	if (fn)
		irq_domain_free_fwnode(fn);

	return ret;
}

static struct irq_domain *hv_pci_get_root_domain(void)
{
	return hv_msi_gic_irq_domain;
}

/*
 * SPIs are used for the interrupts of PCI devices, and SPIs are managed via
 * GICD registers, which Hyper-V already supports, so no hypercall is needed.
 */
static void hv_arch_irq_unmask(struct irq_data *data) { }
#endif /* CONFIG_ARM64 */

/**
 * hv_pci_generic_compl() - Invoked for a completion packet
 * @context:		Set up by the sender of the packet.
 * @resp:		The response packet
 * @resp_packet_size:	Size in bytes of the packet
 *
 * This function is used to trigger an event and report status
 * for any message for which the completion packet contains a
 * status and nothing else.
 */
static void hv_pci_generic_compl(void *context, struct pci_response *resp,
				 int resp_packet_size)
{
	struct hv_pci_compl *comp_pkt = context;

	if (resp_packet_size >= offsetofend(struct pci_response, status))
		comp_pkt->completion_status = resp->status;
	else
		comp_pkt->completion_status = -1;

	complete(&comp_pkt->host_event);
}

static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot);

static void get_pcichild(struct hv_pci_dev *hpdev)
{
	refcount_inc(&hpdev->refs);
}

static void put_pcichild(struct hv_pci_dev *hpdev)
{
	if (refcount_dec_and_test(&hpdev->refs))
		kfree(hpdev);
}

/*
 * There is no good way to get notified from vmbus_onoffer_rescind(),
 * so let's use polling here, since this is not a hot path.
997 */ 998 static int wait_for_response(struct hv_device *hdev, 999 struct completion *comp) 1000 { 1001 while (true) { 1002 if (hdev->channel->rescind) { 1003 dev_warn_once(&hdev->device, "The device is gone.\n"); 1004 return -ENODEV; 1005 } 1006 1007 if (wait_for_completion_timeout(comp, HZ / 10)) 1008 break; 1009 } 1010 1011 return 0; 1012 } 1013 1014 /** 1015 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 1016 * @devfn: The Linux representation of PCI slot 1017 * 1018 * Windows uses a slightly different representation of PCI slot. 1019 * 1020 * Return: The Windows representation 1021 */ 1022 static u32 devfn_to_wslot(int devfn) 1023 { 1024 union win_slot_encoding wslot; 1025 1026 wslot.slot = 0; 1027 wslot.bits.dev = PCI_SLOT(devfn); 1028 wslot.bits.func = PCI_FUNC(devfn); 1029 1030 return wslot.slot; 1031 } 1032 1033 /** 1034 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 1035 * @wslot: The Windows representation of PCI slot 1036 * 1037 * Windows uses a slightly different representation of PCI slot. 1038 * 1039 * Return: The Linux representation 1040 */ 1041 static int wslot_to_devfn(u32 wslot) 1042 { 1043 union win_slot_encoding slot_no; 1044 1045 slot_no.slot = wslot; 1046 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func); 1047 } 1048 1049 /* 1050 * PCI Configuration Space for these root PCI buses is implemented as a pair 1051 * of pages in memory-mapped I/O space. Writing to the first page chooses 1052 * the PCI function being written or read. Once the first page has been 1053 * written to, the following page maps in the entire configuration space of 1054 * the function. 1055 */ 1056 1057 /** 1058 * _hv_pcifront_read_config() - Internal PCI config read 1059 * @hpdev: The PCI driver's representation of the device 1060 * @where: Offset within config space 1061 * @size: Size of the transfer 1062 * @val: Pointer to the buffer receiving the data 1063 */ 1064 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 1065 int size, u32 *val) 1066 { 1067 unsigned long flags; 1068 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 1069 1070 /* 1071 * If the attempt is to read the IDs or the ROM BAR, simulate that. 1072 */ 1073 if (where + size <= PCI_COMMAND) { 1074 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 1075 } else if (where >= PCI_CLASS_REVISION && where + size <= 1076 PCI_CACHE_LINE_SIZE) { 1077 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 1078 PCI_CLASS_REVISION, size); 1079 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 1080 PCI_ROM_ADDRESS) { 1081 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 1082 PCI_SUBSYSTEM_VENDOR_ID, size); 1083 } else if (where >= PCI_ROM_ADDRESS && where + size <= 1084 PCI_CAPABILITY_LIST) { 1085 /* ROM BARs are unimplemented */ 1086 *val = 0; 1087 } else if (where >= PCI_INTERRUPT_LINE && where + size <= 1088 PCI_INTERRUPT_PIN) { 1089 /* 1090 * Interrupt Line and Interrupt PIN are hard-wired to zero 1091 * because this front-end only supports message-signaled 1092 * interrupts. 1093 */ 1094 *val = 0; 1095 } else if (where + size <= CFG_PAGE_SIZE) { 1096 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1097 /* Choose the function to be read. (See comment above) */ 1098 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1099 /* Make sure the function was chosen before we start reading. */ 1100 mb(); 1101 /* Read from that function's config space. 
*/ 1102 switch (size) { 1103 case 1: 1104 *val = readb(addr); 1105 break; 1106 case 2: 1107 *val = readw(addr); 1108 break; 1109 default: 1110 *val = readl(addr); 1111 break; 1112 } 1113 /* 1114 * Make sure the read was done before we release the spinlock 1115 * allowing consecutive reads/writes. 1116 */ 1117 mb(); 1118 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1119 } else { 1120 dev_err(&hpdev->hbus->hdev->device, 1121 "Attempt to read beyond a function's config space.\n"); 1122 } 1123 } 1124 1125 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) 1126 { 1127 u16 ret; 1128 unsigned long flags; 1129 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + 1130 PCI_VENDOR_ID; 1131 1132 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1133 1134 /* Choose the function to be read. (See comment above) */ 1135 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1136 /* Make sure the function was chosen before we start reading. */ 1137 mb(); 1138 /* Read from that function's config space. */ 1139 ret = readw(addr); 1140 /* 1141 * mb() is not required here, because the spin_unlock_irqrestore() 1142 * is a barrier. 1143 */ 1144 1145 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1146 1147 return ret; 1148 } 1149 1150 /** 1151 * _hv_pcifront_write_config() - Internal PCI config write 1152 * @hpdev: The PCI driver's representation of the device 1153 * @where: Offset within config space 1154 * @size: Size of the transfer 1155 * @val: The data being transferred 1156 */ 1157 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 1158 int size, u32 val) 1159 { 1160 unsigned long flags; 1161 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 1162 1163 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 1164 where + size <= PCI_CAPABILITY_LIST) { 1165 /* SSIDs and ROM BARs are read-only */ 1166 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 1167 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1168 /* Choose the function to be written. (See comment above) */ 1169 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1170 /* Make sure the function was chosen before we start writing. */ 1171 wmb(); 1172 /* Write to that function's config space. */ 1173 switch (size) { 1174 case 1: 1175 writeb(val, addr); 1176 break; 1177 case 2: 1178 writew(val, addr); 1179 break; 1180 default: 1181 writel(val, addr); 1182 break; 1183 } 1184 /* 1185 * Make sure the write was done before we release the spinlock 1186 * allowing consecutive reads/writes. 
		 */
		mb();
		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
	} else {
		dev_err(&hpdev->hbus->hdev->device,
			"Attempt to write beyond a function's config space.\n");
	}
}

/**
 * hv_pcifront_read_config() - Read configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be read
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
				   int where, int size, u32 *val)
{
	struct hv_pcibus_device *hbus =
		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
	struct hv_pci_dev *hpdev;

	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
	if (!hpdev)
		return PCIBIOS_DEVICE_NOT_FOUND;

	_hv_pcifront_read_config(hpdev, where, size, val);

	put_pcichild(hpdev);
	return PCIBIOS_SUCCESSFUL;
}

/**
 * hv_pcifront_write_config() - Write configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be written to device
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
				    int where, int size, u32 val)
{
	struct hv_pcibus_device *hbus =
		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
	struct hv_pci_dev *hpdev;

	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
	if (!hpdev)
		return PCIBIOS_DEVICE_NOT_FOUND;

	_hv_pcifront_write_config(hpdev, where, size, val);

	put_pcichild(hpdev);
	return PCIBIOS_SUCCESSFUL;
}

/* PCIe operations */
static struct pci_ops hv_pcifront_ops = {
	.read  = hv_pcifront_read_config,
	.write = hv_pcifront_write_config,
};

/*
 * Paravirtual backchannel
 *
 * Hyper-V SR-IOV provides a backchannel mechanism in software for
 * communication between a VF driver and a PF driver. These
 * "configuration blocks" are similar in concept to PCI configuration space,
 * but instead of doing reads and writes in 32-bit chunks through a very slow
 * path, packets of up to 128 bytes can be sent or received asynchronously.
 *
 * Nearly every SR-IOV device contains just such a communications channel in
 * hardware, so using this one in software is usually optional. Using the
 * software channel, however, allows driver implementers to leverage software
 * tools that fuzz the communications channel looking for vulnerabilities.
 *
 * The usage model for these packets puts the responsibility for reading or
 * writing on the VF driver. The VF driver sends a read or a write packet,
 * indicating which "block" is being referred to by number.
 *
 * If the PF driver wishes to initiate communication, it can "invalidate" one
 * or more of the first 64 blocks. This invalidation is delivered via a
 * callback supplied to this driver by the VF driver.
 *
 * No protocol is implied, except that supplied by the PF and VF drivers.
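 *
 * Conceptually, a VF driver's use of this backchannel (through the helpers
 * implemented below) looks roughly like the following sketch; the callback,
 * context, block ID and buffer here are illustrative only:
 *
 *	unsigned int bytes_returned;
 *	u8 blk[HV_CONFIG_BLOCK_SIZE_MAX];
 *
 *	hv_register_block_invalidate(pdev, vf_ctx, vf_invalidate_cb);
 *	if (hv_read_config_block(pdev, blk, sizeof(blk), 0, &bytes_returned))
 *		return;		(block 0 could not be read)
 *	...
 *	hv_write_config_block(pdev, blk, bytes_returned, 0);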
1281 */ 1282 1283 struct hv_read_config_compl { 1284 struct hv_pci_compl comp_pkt; 1285 void *buf; 1286 unsigned int len; 1287 unsigned int bytes_returned; 1288 }; 1289 1290 /** 1291 * hv_pci_read_config_compl() - Invoked when a response packet 1292 * for a read config block operation arrives. 1293 * @context: Identifies the read config operation 1294 * @resp: The response packet itself 1295 * @resp_packet_size: Size in bytes of the response packet 1296 */ 1297 static void hv_pci_read_config_compl(void *context, struct pci_response *resp, 1298 int resp_packet_size) 1299 { 1300 struct hv_read_config_compl *comp = context; 1301 struct pci_read_block_response *read_resp = 1302 (struct pci_read_block_response *)resp; 1303 unsigned int data_len, hdr_len; 1304 1305 hdr_len = offsetof(struct pci_read_block_response, bytes); 1306 if (resp_packet_size < hdr_len) { 1307 comp->comp_pkt.completion_status = -1; 1308 goto out; 1309 } 1310 1311 data_len = resp_packet_size - hdr_len; 1312 if (data_len > 0 && read_resp->status == 0) { 1313 comp->bytes_returned = min(comp->len, data_len); 1314 memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); 1315 } else { 1316 comp->bytes_returned = 0; 1317 } 1318 1319 comp->comp_pkt.completion_status = read_resp->status; 1320 out: 1321 complete(&comp->comp_pkt.host_event); 1322 } 1323 1324 /** 1325 * hv_read_config_block() - Sends a read config block request to 1326 * the back-end driver running in the Hyper-V parent partition. 1327 * @pdev: The PCI driver's representation for this device. 1328 * @buf: Buffer into which the config block will be copied. 1329 * @len: Size in bytes of buf. 1330 * @block_id: Identifies the config block which has been requested. 1331 * @bytes_returned: Size which came back from the back-end driver. 
1332 * 1333 * Return: 0 on success, -errno on failure 1334 */ 1335 static int hv_read_config_block(struct pci_dev *pdev, void *buf, 1336 unsigned int len, unsigned int block_id, 1337 unsigned int *bytes_returned) 1338 { 1339 struct hv_pcibus_device *hbus = 1340 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1341 sysdata); 1342 struct { 1343 struct pci_packet pkt; 1344 char buf[sizeof(struct pci_read_block)]; 1345 } pkt; 1346 struct hv_read_config_compl comp_pkt; 1347 struct pci_read_block *read_blk; 1348 int ret; 1349 1350 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1351 return -EINVAL; 1352 1353 init_completion(&comp_pkt.comp_pkt.host_event); 1354 comp_pkt.buf = buf; 1355 comp_pkt.len = len; 1356 1357 memset(&pkt, 0, sizeof(pkt)); 1358 pkt.pkt.completion_func = hv_pci_read_config_compl; 1359 pkt.pkt.compl_ctxt = &comp_pkt; 1360 read_blk = (struct pci_read_block *)&pkt.pkt.message; 1361 read_blk->message_type.type = PCI_READ_BLOCK; 1362 read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1363 read_blk->block_id = block_id; 1364 read_blk->bytes_requested = len; 1365 1366 ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, 1367 sizeof(*read_blk), (unsigned long)&pkt.pkt, 1368 VM_PKT_DATA_INBAND, 1369 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1370 if (ret) 1371 return ret; 1372 1373 ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); 1374 if (ret) 1375 return ret; 1376 1377 if (comp_pkt.comp_pkt.completion_status != 0 || 1378 comp_pkt.bytes_returned == 0) { 1379 dev_err(&hbus->hdev->device, 1380 "Read Config Block failed: 0x%x, bytes_returned=%d\n", 1381 comp_pkt.comp_pkt.completion_status, 1382 comp_pkt.bytes_returned); 1383 return -EIO; 1384 } 1385 1386 *bytes_returned = comp_pkt.bytes_returned; 1387 return 0; 1388 } 1389 1390 /** 1391 * hv_pci_write_config_compl() - Invoked when a response packet for a write 1392 * config block operation arrives. 1393 * @context: Identifies the write config operation 1394 * @resp: The response packet itself 1395 * @resp_packet_size: Size in bytes of the response packet 1396 */ 1397 static void hv_pci_write_config_compl(void *context, struct pci_response *resp, 1398 int resp_packet_size) 1399 { 1400 struct hv_pci_compl *comp_pkt = context; 1401 1402 comp_pkt->completion_status = resp->status; 1403 complete(&comp_pkt->host_event); 1404 } 1405 1406 /** 1407 * hv_write_config_block() - Sends a write config block request to the 1408 * back-end driver running in the Hyper-V parent partition. 1409 * @pdev: The PCI driver's representation for this device. 1410 * @buf: Buffer from which the config block will be copied. 1411 * @len: Size in bytes of buf. 1412 * @block_id: Identifies the config block which is being written. 
1413 * 1414 * Return: 0 on success, -errno on failure 1415 */ 1416 static int hv_write_config_block(struct pci_dev *pdev, void *buf, 1417 unsigned int len, unsigned int block_id) 1418 { 1419 struct hv_pcibus_device *hbus = 1420 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1421 sysdata); 1422 struct { 1423 struct pci_packet pkt; 1424 char buf[sizeof(struct pci_write_block)]; 1425 u32 reserved; 1426 } pkt; 1427 struct hv_pci_compl comp_pkt; 1428 struct pci_write_block *write_blk; 1429 u32 pkt_size; 1430 int ret; 1431 1432 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1433 return -EINVAL; 1434 1435 init_completion(&comp_pkt.host_event); 1436 1437 memset(&pkt, 0, sizeof(pkt)); 1438 pkt.pkt.completion_func = hv_pci_write_config_compl; 1439 pkt.pkt.compl_ctxt = &comp_pkt; 1440 write_blk = (struct pci_write_block *)&pkt.pkt.message; 1441 write_blk->message_type.type = PCI_WRITE_BLOCK; 1442 write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1443 write_blk->block_id = block_id; 1444 write_blk->byte_count = len; 1445 memcpy(write_blk->bytes, buf, len); 1446 pkt_size = offsetof(struct pci_write_block, bytes) + len; 1447 /* 1448 * This quirk is required on some hosts shipped around 2018, because 1449 * these hosts don't check the pkt_size correctly (new hosts have been 1450 * fixed since early 2019). The quirk is also safe on very old hosts 1451 * and new hosts, because, on them, what really matters is the length 1452 * specified in write_blk->byte_count. 1453 */ 1454 pkt_size += sizeof(pkt.reserved); 1455 1456 ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, 1457 (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, 1458 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1459 if (ret) 1460 return ret; 1461 1462 ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); 1463 if (ret) 1464 return ret; 1465 1466 if (comp_pkt.completion_status != 0) { 1467 dev_err(&hbus->hdev->device, 1468 "Write Config Block failed: 0x%x\n", 1469 comp_pkt.completion_status); 1470 return -EIO; 1471 } 1472 1473 return 0; 1474 } 1475 1476 /** 1477 * hv_register_block_invalidate() - Invoked when a config block invalidation 1478 * arrives from the back-end driver. 1479 * @pdev: The PCI driver's representation for this device. 1480 * @context: Identifies the device. 1481 * @block_invalidate: Identifies all of the blocks being invalidated. 
1482 * 1483 * Return: 0 on success, -errno on failure 1484 */ 1485 static int hv_register_block_invalidate(struct pci_dev *pdev, void *context, 1486 void (*block_invalidate)(void *context, 1487 u64 block_mask)) 1488 { 1489 struct hv_pcibus_device *hbus = 1490 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1491 sysdata); 1492 struct hv_pci_dev *hpdev; 1493 1494 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1495 if (!hpdev) 1496 return -ENODEV; 1497 1498 hpdev->block_invalidate = block_invalidate; 1499 hpdev->invalidate_context = context; 1500 1501 put_pcichild(hpdev); 1502 return 0; 1503 1504 } 1505 1506 /* Interrupt management hooks */ 1507 static void hv_int_desc_free(struct hv_pci_dev *hpdev, 1508 struct tran_int_desc *int_desc) 1509 { 1510 struct pci_delete_interrupt *int_pkt; 1511 struct { 1512 struct pci_packet pkt; 1513 u8 buffer[sizeof(struct pci_delete_interrupt)]; 1514 } ctxt; 1515 1516 memset(&ctxt, 0, sizeof(ctxt)); 1517 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 1518 int_pkt->message_type.type = 1519 PCI_DELETE_INTERRUPT_MESSAGE; 1520 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 1521 int_pkt->int_desc = *int_desc; 1522 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 1523 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0); 1524 kfree(int_desc); 1525 } 1526 1527 /** 1528 * hv_msi_free() - Free the MSI. 1529 * @domain: The interrupt domain pointer 1530 * @info: Extra MSI-related context 1531 * @irq: Identifies the IRQ. 1532 * 1533 * The Hyper-V parent partition and hypervisor are tracking the 1534 * messages that are in use, keeping the interrupt redirection 1535 * table up to date. This callback sends a message that frees 1536 * the IRT entry and related tracking nonsense. 
1537 */ 1538 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 1539 unsigned int irq) 1540 { 1541 struct hv_pcibus_device *hbus; 1542 struct hv_pci_dev *hpdev; 1543 struct pci_dev *pdev; 1544 struct tran_int_desc *int_desc; 1545 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 1546 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 1547 1548 pdev = msi_desc_to_pci_dev(msi); 1549 hbus = info->data; 1550 int_desc = irq_data_get_irq_chip_data(irq_data); 1551 if (!int_desc) 1552 return; 1553 1554 irq_data->chip_data = NULL; 1555 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1556 if (!hpdev) { 1557 kfree(int_desc); 1558 return; 1559 } 1560 1561 hv_int_desc_free(hpdev, int_desc); 1562 put_pcichild(hpdev); 1563 } 1564 1565 static void hv_irq_mask(struct irq_data *data) 1566 { 1567 pci_msi_mask_irq(data); 1568 if (data->parent_data->chip->irq_mask) 1569 irq_chip_mask_parent(data); 1570 } 1571 1572 static void hv_irq_unmask(struct irq_data *data) 1573 { 1574 hv_arch_irq_unmask(data); 1575 1576 if (data->parent_data->chip->irq_unmask) 1577 irq_chip_unmask_parent(data); 1578 pci_msi_unmask_irq(data); 1579 } 1580 1581 struct compose_comp_ctxt { 1582 struct hv_pci_compl comp_pkt; 1583 struct tran_int_desc int_desc; 1584 }; 1585 1586 static void hv_pci_compose_compl(void *context, struct pci_response *resp, 1587 int resp_packet_size) 1588 { 1589 struct compose_comp_ctxt *comp_pkt = context; 1590 struct pci_create_int_response *int_resp = 1591 (struct pci_create_int_response *)resp; 1592 1593 comp_pkt->comp_pkt.completion_status = resp->status; 1594 comp_pkt->int_desc = int_resp->int_desc; 1595 complete(&comp_pkt->comp_pkt.host_event); 1596 } 1597 1598 static u32 hv_compose_msi_req_v1( 1599 struct pci_create_interrupt *int_pkt, struct cpumask *affinity, 1600 u32 slot, u8 vector) 1601 { 1602 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; 1603 int_pkt->wslot.slot = slot; 1604 int_pkt->int_desc.vector = vector; 1605 int_pkt->int_desc.vector_count = 1; 1606 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1607 1608 /* 1609 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in 1610 * hv_irq_unmask(). 1611 */ 1612 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL; 1613 1614 return sizeof(*int_pkt); 1615 } 1616 1617 /* 1618 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten 1619 * by subsequent retarget in hv_irq_unmask(). 
1620 */ 1621 static int hv_compose_msi_req_get_cpu(struct cpumask *affinity) 1622 { 1623 return cpumask_first_and(affinity, cpu_online_mask); 1624 } 1625 1626 static u32 hv_compose_msi_req_v2( 1627 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, 1628 u32 slot, u8 vector) 1629 { 1630 int cpu; 1631 1632 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; 1633 int_pkt->wslot.slot = slot; 1634 int_pkt->int_desc.vector = vector; 1635 int_pkt->int_desc.vector_count = 1; 1636 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1637 cpu = hv_compose_msi_req_get_cpu(affinity); 1638 int_pkt->int_desc.processor_array[0] = 1639 hv_cpu_number_to_vp_number(cpu); 1640 int_pkt->int_desc.processor_count = 1; 1641 1642 return sizeof(*int_pkt); 1643 } 1644 1645 static u32 hv_compose_msi_req_v3( 1646 struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity, 1647 u32 slot, u32 vector) 1648 { 1649 int cpu; 1650 1651 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3; 1652 int_pkt->wslot.slot = slot; 1653 int_pkt->int_desc.vector = vector; 1654 int_pkt->int_desc.reserved = 0; 1655 int_pkt->int_desc.vector_count = 1; 1656 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1657 cpu = hv_compose_msi_req_get_cpu(affinity); 1658 int_pkt->int_desc.processor_array[0] = 1659 hv_cpu_number_to_vp_number(cpu); 1660 int_pkt->int_desc.processor_count = 1; 1661 1662 return sizeof(*int_pkt); 1663 } 1664 1665 /** 1666 * hv_compose_msi_msg() - Supplies a valid MSI address/data 1667 * @data: Everything about this MSI 1668 * @msg: Buffer that is filled in by this function 1669 * 1670 * This function unpacks the IRQ looking for target CPU set, IDT 1671 * vector and mode and sends a message to the parent partition 1672 * asking for a mapping for that tuple in this partition. The 1673 * response supplies a data value and address to which that data 1674 * should be written to trigger that interrupt. 1675 */ 1676 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 1677 { 1678 struct hv_pcibus_device *hbus; 1679 struct vmbus_channel *channel; 1680 struct hv_pci_dev *hpdev; 1681 struct pci_bus *pbus; 1682 struct pci_dev *pdev; 1683 struct cpumask *dest; 1684 struct compose_comp_ctxt comp; 1685 struct tran_int_desc *int_desc; 1686 struct { 1687 struct pci_packet pci_pkt; 1688 union { 1689 struct pci_create_interrupt v1; 1690 struct pci_create_interrupt2 v2; 1691 struct pci_create_interrupt3 v3; 1692 } int_pkts; 1693 } __packed ctxt; 1694 1695 u32 size; 1696 int ret; 1697 1698 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); 1699 dest = irq_data_get_effective_affinity_mask(data); 1700 pbus = pdev->bus; 1701 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1702 channel = hbus->hdev->channel; 1703 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1704 if (!hpdev) 1705 goto return_null_message; 1706 1707 /* Free any previous message that might have already been composed. 
	 */
	if (data->chip_data) {
		int_desc = data->chip_data;
		data->chip_data = NULL;
		hv_int_desc_free(hpdev, int_desc);
	}

	int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
	if (!int_desc)
		goto drop_reference;

	memset(&ctxt, 0, sizeof(ctxt));
	init_completion(&comp.comp_pkt.host_event);
	ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
	ctxt.pci_pkt.compl_ctxt = &comp;

	switch (hbus->protocol_version) {
	case PCI_PROTOCOL_VERSION_1_1:
		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
					     dest,
					     hpdev->desc.win_slot.slot,
					     hv_msi_get_int_vector(data));
		break;

	case PCI_PROTOCOL_VERSION_1_2:
	case PCI_PROTOCOL_VERSION_1_3:
		size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
					     dest,
					     hpdev->desc.win_slot.slot,
					     hv_msi_get_int_vector(data));
		break;

	case PCI_PROTOCOL_VERSION_1_4:
		size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
					     dest,
					     hpdev->desc.win_slot.slot,
					     hv_msi_get_int_vector(data));
		break;

	default:
		/*
		 * Since we only negotiate protocol versions known to this
		 * driver, this path should never be hit. However, it is not
		 * a hot path, so we print a message to aid future updates.
		 */
		dev_err(&hbus->hdev->device,
			"Unexpected vPCI protocol, update driver.");
		goto free_int_desc;
	}

	ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
			       size, (unsigned long)&ctxt.pci_pkt,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret) {
		dev_err(&hbus->hdev->device,
			"Sending request for interrupt failed: 0x%x",
			comp.comp_pkt.completion_status);
		goto free_int_desc;
	}

	/*
	 * Prevents hv_pci_onchannelcallback() from running concurrently
	 * in the tasklet.
	 */
	tasklet_disable_in_atomic(&channel->callback_event);

	/*
	 * Since this function is called with IRQ locks held, can't
	 * do normal wait for completion; instead poll.
	 */
	while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
		unsigned long flags;

		/* 0xFFFF means an invalid PCI VENDOR ID. */
		if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
			dev_err_once(&hbus->hdev->device,
				     "the device has gone\n");
			goto enable_tasklet;
		}

		/*
		 * Make sure that the ring buffer data structure doesn't get
		 * freed while we dereference the ring buffer pointer. Test
		 * for the channel's onchannel_callback being NULL within a
		 * sched_lock critical section. See also the inline comments
		 * in vmbus_reset_channel_cb().
		 */
		spin_lock_irqsave(&channel->sched_lock, flags);
		if (unlikely(channel->onchannel_callback == NULL)) {
			spin_unlock_irqrestore(&channel->sched_lock, flags);
			goto enable_tasklet;
		}
		hv_pci_onchannelcallback(hbus);
		spin_unlock_irqrestore(&channel->sched_lock, flags);

		if (hpdev->state == hv_pcichild_ejecting) {
			dev_err_once(&hbus->hdev->device,
				     "the device is being ejected\n");
			goto enable_tasklet;
		}

		udelay(100);
	}

	tasklet_enable(&channel->callback_event);

	if (comp.comp_pkt.completion_status < 0) {
		dev_err(&hbus->hdev->device,
			"Request for interrupt failed: 0x%x",
			comp.comp_pkt.completion_status);
		goto free_int_desc;
	}

	/*
	 * Record the assignment so that this can be unwound later.
Using 1822 * irq_set_chip_data() here would be appropriate, but the lock it takes 1823 * is already held. 1824 */ 1825 *int_desc = comp.int_desc; 1826 data->chip_data = int_desc; 1827 1828 /* Pass up the result. */ 1829 msg->address_hi = comp.int_desc.address >> 32; 1830 msg->address_lo = comp.int_desc.address & 0xffffffff; 1831 msg->data = comp.int_desc.data; 1832 1833 put_pcichild(hpdev); 1834 return; 1835 1836 enable_tasklet: 1837 tasklet_enable(&channel->callback_event); 1838 free_int_desc: 1839 kfree(int_desc); 1840 drop_reference: 1841 put_pcichild(hpdev); 1842 return_null_message: 1843 msg->address_hi = 0; 1844 msg->address_lo = 0; 1845 msg->data = 0; 1846 } 1847 1848 /* HW Interrupt Chip Descriptor */ 1849 static struct irq_chip hv_msi_irq_chip = { 1850 .name = "Hyper-V PCIe MSI", 1851 .irq_compose_msi_msg = hv_compose_msi_msg, 1852 .irq_set_affinity = irq_chip_set_affinity_parent, 1853 #ifdef CONFIG_X86 1854 .irq_ack = irq_chip_ack_parent, 1855 #elif defined(CONFIG_ARM64) 1856 .irq_eoi = irq_chip_eoi_parent, 1857 #endif 1858 .irq_mask = hv_irq_mask, 1859 .irq_unmask = hv_irq_unmask, 1860 }; 1861 1862 static struct msi_domain_ops hv_msi_ops = { 1863 .msi_prepare = hv_msi_prepare, 1864 .msi_free = hv_msi_free, 1865 }; 1866 1867 /** 1868 * hv_pcie_init_irq_domain() - Initialize IRQ domain 1869 * @hbus: The root PCI bus 1870 * 1871 * This function creates an IRQ domain which will be used for 1872 * interrupts from devices that have been passed through. These 1873 * devices only support MSI and MSI-X, not line-based interrupts 1874 * or simulations of line-based interrupts through PCIe's 1875 * fabric-layer messages. Because interrupts are remapped, we 1876 * can support multi-message MSI here. 1877 * 1878 * Return: '0' on success and error value on failure 1879 */ 1880 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) 1881 { 1882 hbus->msi_info.chip = &hv_msi_irq_chip; 1883 hbus->msi_info.ops = &hv_msi_ops; 1884 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | 1885 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | 1886 MSI_FLAG_PCI_MSIX); 1887 hbus->msi_info.handler = FLOW_HANDLER; 1888 hbus->msi_info.handler_name = FLOW_NAME; 1889 hbus->msi_info.data = hbus; 1890 hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode, 1891 &hbus->msi_info, 1892 hv_pci_get_root_domain()); 1893 if (!hbus->irq_domain) { 1894 dev_err(&hbus->hdev->device, 1895 "Failed to build an MSI IRQ domain\n"); 1896 return -ENODEV; 1897 } 1898 1899 dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain); 1900 1901 return 0; 1902 } 1903 1904 /** 1905 * get_bar_size() - Get the address space consumed by a BAR 1906 * @bar_val: Value that a BAR returned after -1 was written 1907 * to it. 1908 * 1909 * This function returns the size of the BAR, rounded up to 1 1910 * page. It has to be rounded up because the hypervisor's page 1911 * table entry that maps the BAR into the VM can't specify an 1912 * offset within a page. The invariant is that the hypervisor 1913 * must place any BARs of smaller than page length at the 1914 * beginning of a page. 1915 * 1916 * Return: Size in bytes of the consumed MMIO space. 
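 *
 * Worked example (illustrative only): a 32-bit memory BAR that decodes
 * 16 bytes reads back as 0xfffffff0 after all-ones is written, and the
 * callers widen it to 0xfffffffffffffff0 before passing it in. Then
 *
 *	1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK) = 1 + 0xf = 0x10
 *
 * and round_up(0x10, PAGE_SIZE) yields a full page (0x1000 with 4 KiB
 * pages), which is the MMIO space the hypervisor actually has to map.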
1917 */ 1918 static u64 get_bar_size(u64 bar_val) 1919 { 1920 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)), 1921 PAGE_SIZE); 1922 } 1923 1924 /** 1925 * survey_child_resources() - Total all MMIO requirements 1926 * @hbus: Root PCI bus, as understood by this driver 1927 */ 1928 static void survey_child_resources(struct hv_pcibus_device *hbus) 1929 { 1930 struct hv_pci_dev *hpdev; 1931 resource_size_t bar_size = 0; 1932 unsigned long flags; 1933 struct completion *event; 1934 u64 bar_val; 1935 int i; 1936 1937 /* If nobody is waiting on the answer, don't compute it. */ 1938 event = xchg(&hbus->survey_event, NULL); 1939 if (!event) 1940 return; 1941 1942 /* If the answer has already been computed, go with it. */ 1943 if (hbus->low_mmio_space || hbus->high_mmio_space) { 1944 complete(event); 1945 return; 1946 } 1947 1948 spin_lock_irqsave(&hbus->device_list_lock, flags); 1949 1950 /* 1951 * Due to an interesting quirk of the PCI spec, all memory regions 1952 * for a child device are a power of 2 in size and aligned in memory, 1953 * so it's sufficient to just add them up without tracking alignment. 1954 */ 1955 list_for_each_entry(hpdev, &hbus->children, list_entry) { 1956 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 1957 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) 1958 dev_err(&hbus->hdev->device, 1959 "There's an I/O BAR in this list!\n"); 1960 1961 if (hpdev->probed_bar[i] != 0) { 1962 /* 1963 * A probed BAR has all the upper bits set that 1964 * can be changed. 1965 */ 1966 1967 bar_val = hpdev->probed_bar[i]; 1968 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1969 bar_val |= 1970 ((u64)hpdev->probed_bar[++i] << 32); 1971 else 1972 bar_val |= 0xffffffff00000000ULL; 1973 1974 bar_size = get_bar_size(bar_val); 1975 1976 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1977 hbus->high_mmio_space += bar_size; 1978 else 1979 hbus->low_mmio_space += bar_size; 1980 } 1981 } 1982 } 1983 1984 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1985 complete(event); 1986 } 1987 1988 /** 1989 * prepopulate_bars() - Fill in BARs with defaults 1990 * @hbus: Root PCI bus, as understood by this driver 1991 * 1992 * The core PCI driver code seems much, much happier if the BARs 1993 * for a device have values upon first scan. So fill them in. 1994 * The algorithm below works down from large sizes to small, 1995 * attempting to pack the assignments optimally. The assumption, 1996 * enforced in other parts of the code, is that the beginning of 1997 * the memory-mapped I/O space will be aligned on the largest 1998 * BAR size. 1999 */ 2000 static void prepopulate_bars(struct hv_pcibus_device *hbus) 2001 { 2002 resource_size_t high_size = 0; 2003 resource_size_t low_size = 0; 2004 resource_size_t high_base = 0; 2005 resource_size_t low_base = 0; 2006 resource_size_t bar_size; 2007 struct hv_pci_dev *hpdev; 2008 unsigned long flags; 2009 u64 bar_val; 2010 u32 command; 2011 bool high; 2012 int i; 2013 2014 if (hbus->low_mmio_space) { 2015 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2016 low_base = hbus->low_mmio_res->start; 2017 } 2018 2019 if (hbus->high_mmio_space) { 2020 high_size = 1ULL << 2021 (63 - __builtin_clzll(hbus->high_mmio_space)); 2022 high_base = hbus->high_mmio_res->start; 2023 } 2024 2025 spin_lock_irqsave(&hbus->device_list_lock, flags); 2026 2027 /* 2028 * Clear the memory enable bit, in case it's already set. 
This occurs 2029 * in the suspend path of hibernation, where the device is suspended, 2030 * resumed and suspended again: see hibernation_snapshot() and 2031 * hibernation_platform_enter(). 2032 * 2033 * If the memory enable bit is already set, Hyper-V silently ignores 2034 * the below BAR updates, and the related PCI device driver can not 2035 * work, because reading from the device register(s) always returns 2036 * 0xFFFFFFFF (PCI_ERROR_RESPONSE). 2037 */ 2038 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2039 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); 2040 command &= ~PCI_COMMAND_MEMORY; 2041 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); 2042 } 2043 2044 /* Pick addresses for the BARs. */ 2045 do { 2046 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2047 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2048 bar_val = hpdev->probed_bar[i]; 2049 if (bar_val == 0) 2050 continue; 2051 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64; 2052 if (high) { 2053 bar_val |= 2054 ((u64)hpdev->probed_bar[i + 1] 2055 << 32); 2056 } else { 2057 bar_val |= 0xffffffffULL << 32; 2058 } 2059 bar_size = get_bar_size(bar_val); 2060 if (high) { 2061 if (high_size != bar_size) { 2062 i++; 2063 continue; 2064 } 2065 _hv_pcifront_write_config(hpdev, 2066 PCI_BASE_ADDRESS_0 + (4 * i), 2067 4, 2068 (u32)(high_base & 0xffffff00)); 2069 i++; 2070 _hv_pcifront_write_config(hpdev, 2071 PCI_BASE_ADDRESS_0 + (4 * i), 2072 4, (u32)(high_base >> 32)); 2073 high_base += bar_size; 2074 } else { 2075 if (low_size != bar_size) 2076 continue; 2077 _hv_pcifront_write_config(hpdev, 2078 PCI_BASE_ADDRESS_0 + (4 * i), 2079 4, 2080 (u32)(low_base & 0xffffff00)); 2081 low_base += bar_size; 2082 } 2083 } 2084 if (high_size <= 1 && low_size <= 1) { 2085 /* Set the memory enable bit. */ 2086 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, 2087 &command); 2088 command |= PCI_COMMAND_MEMORY; 2089 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, 2090 command); 2091 break; 2092 } 2093 } 2094 2095 high_size >>= 1; 2096 low_size >>= 1; 2097 } while (high_size || low_size); 2098 2099 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2100 } 2101 2102 /* 2103 * Assign entries in sysfs pci slot directory. 2104 * 2105 * Note that this function does not need to lock the children list 2106 * because it is called from pci_devices_present_work which 2107 * is serialized with hv_eject_device_work because they are on the 2108 * same ordered workqueue. Therefore hbus->children list will not change 2109 * even when pci_create_slot sleeps. 2110 */ 2111 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) 2112 { 2113 struct hv_pci_dev *hpdev; 2114 char name[SLOT_NAME_SIZE]; 2115 int slot_nr; 2116 2117 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2118 if (hpdev->pci_slot) 2119 continue; 2120 2121 slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot)); 2122 snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); 2123 hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr, 2124 name, NULL); 2125 if (IS_ERR(hpdev->pci_slot)) { 2126 pr_warn("pci_create slot %s failed\n", name); 2127 hpdev->pci_slot = NULL; 2128 } 2129 } 2130 } 2131 2132 /* 2133 * Remove entries in sysfs pci slot directory. 
2134 */ 2135 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) 2136 { 2137 struct hv_pci_dev *hpdev; 2138 2139 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2140 if (!hpdev->pci_slot) 2141 continue; 2142 pci_destroy_slot(hpdev->pci_slot); 2143 hpdev->pci_slot = NULL; 2144 } 2145 } 2146 2147 /* 2148 * Set NUMA node for the devices on the bus 2149 */ 2150 static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) 2151 { 2152 struct pci_dev *dev; 2153 struct pci_bus *bus = hbus->bridge->bus; 2154 struct hv_pci_dev *hv_dev; 2155 2156 list_for_each_entry(dev, &bus->devices, bus_list) { 2157 hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn)); 2158 if (!hv_dev) 2159 continue; 2160 2161 if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && 2162 hv_dev->desc.virtual_numa_node < num_possible_nodes()) 2163 /* 2164 * The kernel may boot with some NUMA nodes offline 2165 * (e.g. in a KDUMP kernel) or with NUMA disabled via 2166 * "numa=off". In those cases, adjust the host provided 2167 * NUMA node to a valid NUMA node used by the kernel. 2168 */ 2169 set_dev_node(&dev->dev, 2170 numa_map_to_online_node( 2171 hv_dev->desc.virtual_numa_node)); 2172 2173 put_pcichild(hv_dev); 2174 } 2175 } 2176 2177 /** 2178 * create_root_hv_pci_bus() - Expose a new root PCI bus 2179 * @hbus: Root PCI bus, as understood by this driver 2180 * 2181 * Return: 0 on success, -errno on failure 2182 */ 2183 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) 2184 { 2185 int error; 2186 struct pci_host_bridge *bridge = hbus->bridge; 2187 2188 bridge->dev.parent = &hbus->hdev->device; 2189 bridge->sysdata = &hbus->sysdata; 2190 bridge->ops = &hv_pcifront_ops; 2191 2192 error = pci_scan_root_bus_bridge(bridge); 2193 if (error) 2194 return error; 2195 2196 pci_lock_rescan_remove(); 2197 hv_pci_assign_numa_node(hbus); 2198 pci_bus_assign_resources(bridge->bus); 2199 hv_pci_assign_slots(hbus); 2200 pci_bus_add_devices(bridge->bus); 2201 pci_unlock_rescan_remove(); 2202 hbus->state = hv_pcibus_installed; 2203 return 0; 2204 } 2205 2206 struct q_res_req_compl { 2207 struct completion host_event; 2208 struct hv_pci_dev *hpdev; 2209 }; 2210 2211 /** 2212 * q_resource_requirements() - Query Resource Requirements 2213 * @context: The completion context. 2214 * @resp: The response that came from the host. 2215 * @resp_packet_size: The size in bytes of resp. 2216 * 2217 * This function is invoked on completion of a Query Resource 2218 * Requirements packet. 2219 */ 2220 static void q_resource_requirements(void *context, struct pci_response *resp, 2221 int resp_packet_size) 2222 { 2223 struct q_res_req_compl *completion = context; 2224 struct pci_q_res_req_response *q_res_req = 2225 (struct pci_q_res_req_response *)resp; 2226 int i; 2227 2228 if (resp->status < 0) { 2229 dev_err(&completion->hpdev->hbus->hdev->device, 2230 "query resource requirements failed: %x\n", 2231 resp->status); 2232 } else { 2233 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2234 completion->hpdev->probed_bar[i] = 2235 q_res_req->probed_bar[i]; 2236 } 2237 } 2238 2239 complete(&completion->host_event); 2240 } 2241 2242 /** 2243 * new_pcichild_device() - Create a new child device 2244 * @hbus: The internal struct tracking this root PCI bus. 2245 * @desc: The information supplied so far from the host 2246 * about the device. 2247 * 2248 * This function creates the tracking structure for a new child 2249 * device and kicks off the process of figuring out what it is. 
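 * Concretely, "figuring out what it is" means sending a
 * PCI_QUERY_RESOURCE_REQUIREMENTS message for the new slot and letting
 * q_resource_requirements() copy the probed BAR values from the host's
 * response into hpdev->probed_bar[], which survey_child_resources() and
 * prepopulate_bars() consume later.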
2250 * 2251 * Return: Pointer to the new tracking struct 2252 */ 2253 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, 2254 struct hv_pcidev_description *desc) 2255 { 2256 struct hv_pci_dev *hpdev; 2257 struct pci_child_message *res_req; 2258 struct q_res_req_compl comp_pkt; 2259 struct { 2260 struct pci_packet init_packet; 2261 u8 buffer[sizeof(struct pci_child_message)]; 2262 } pkt; 2263 unsigned long flags; 2264 int ret; 2265 2266 hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL); 2267 if (!hpdev) 2268 return NULL; 2269 2270 hpdev->hbus = hbus; 2271 2272 memset(&pkt, 0, sizeof(pkt)); 2273 init_completion(&comp_pkt.host_event); 2274 comp_pkt.hpdev = hpdev; 2275 pkt.init_packet.compl_ctxt = &comp_pkt; 2276 pkt.init_packet.completion_func = q_resource_requirements; 2277 res_req = (struct pci_child_message *)&pkt.init_packet.message; 2278 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; 2279 res_req->wslot.slot = desc->win_slot.slot; 2280 2281 ret = vmbus_sendpacket(hbus->hdev->channel, res_req, 2282 sizeof(struct pci_child_message), 2283 (unsigned long)&pkt.init_packet, 2284 VM_PKT_DATA_INBAND, 2285 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2286 if (ret) 2287 goto error; 2288 2289 if (wait_for_response(hbus->hdev, &comp_pkt.host_event)) 2290 goto error; 2291 2292 hpdev->desc = *desc; 2293 refcount_set(&hpdev->refs, 1); 2294 get_pcichild(hpdev); 2295 spin_lock_irqsave(&hbus->device_list_lock, flags); 2296 2297 list_add_tail(&hpdev->list_entry, &hbus->children); 2298 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2299 return hpdev; 2300 2301 error: 2302 kfree(hpdev); 2303 return NULL; 2304 } 2305 2306 /** 2307 * get_pcichild_wslot() - Find device from slot 2308 * @hbus: Root PCI bus, as understood by this driver 2309 * @wslot: Location on the bus 2310 * 2311 * This function looks up a PCI device and returns the internal 2312 * representation of it. It acquires a reference on it, so that 2313 * the device won't be deleted while somebody is using it. The 2314 * caller is responsible for calling put_pcichild() to release 2315 * this reference. 2316 * 2317 * Return: Internal representation of a PCI device 2318 */ 2319 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 2320 u32 wslot) 2321 { 2322 unsigned long flags; 2323 struct hv_pci_dev *iter, *hpdev = NULL; 2324 2325 spin_lock_irqsave(&hbus->device_list_lock, flags); 2326 list_for_each_entry(iter, &hbus->children, list_entry) { 2327 if (iter->desc.win_slot.slot == wslot) { 2328 hpdev = iter; 2329 get_pcichild(hpdev); 2330 break; 2331 } 2332 } 2333 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2334 2335 return hpdev; 2336 } 2337 2338 /** 2339 * pci_devices_present_work() - Handle new list of child devices 2340 * @work: Work struct embedded in struct hv_dr_work 2341 * 2342 * "Bus Relations" is the Windows term for "children of this 2343 * bus." The terminology is preserved here for people trying to 2344 * debug the interaction between Hyper-V and Linux. This 2345 * function is called when the parent partition reports a list 2346 * of functions that should be observed under this PCI Express 2347 * port (bus). 2348 * 2349 * This function updates the list, and must tolerate being 2350 * called multiple times with the same information. The typical 2351 * number of child devices is one, with very atypical cases 2352 * involving three or four, so the algorithms used here can be 2353 * simple and inefficient. 
2354 * 2355 * It must also treat the omission of a previously observed device as 2356 * notification that the device no longer exists. 2357 * 2358 * Note that this function is serialized with hv_eject_device_work(), 2359 * because both are pushed to the ordered workqueue hbus->wq. 2360 */ 2361 static void pci_devices_present_work(struct work_struct *work) 2362 { 2363 u32 child_no; 2364 bool found; 2365 struct hv_pcidev_description *new_desc; 2366 struct hv_pci_dev *hpdev; 2367 struct hv_pcibus_device *hbus; 2368 struct list_head removed; 2369 struct hv_dr_work *dr_wrk; 2370 struct hv_dr_state *dr = NULL; 2371 unsigned long flags; 2372 2373 dr_wrk = container_of(work, struct hv_dr_work, wrk); 2374 hbus = dr_wrk->bus; 2375 kfree(dr_wrk); 2376 2377 INIT_LIST_HEAD(&removed); 2378 2379 /* Pull this off the queue and process it if it was the last one. */ 2380 spin_lock_irqsave(&hbus->device_list_lock, flags); 2381 while (!list_empty(&hbus->dr_list)) { 2382 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state, 2383 list_entry); 2384 list_del(&dr->list_entry); 2385 2386 /* Throw this away if the list still has stuff in it. */ 2387 if (!list_empty(&hbus->dr_list)) { 2388 kfree(dr); 2389 continue; 2390 } 2391 } 2392 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2393 2394 if (!dr) 2395 return; 2396 2397 /* First, mark all existing children as reported missing. */ 2398 spin_lock_irqsave(&hbus->device_list_lock, flags); 2399 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2400 hpdev->reported_missing = true; 2401 } 2402 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2403 2404 /* Next, add back any reported devices. */ 2405 for (child_no = 0; child_no < dr->device_count; child_no++) { 2406 found = false; 2407 new_desc = &dr->func[child_no]; 2408 2409 spin_lock_irqsave(&hbus->device_list_lock, flags); 2410 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2411 if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) && 2412 (hpdev->desc.v_id == new_desc->v_id) && 2413 (hpdev->desc.d_id == new_desc->d_id) && 2414 (hpdev->desc.ser == new_desc->ser)) { 2415 hpdev->reported_missing = false; 2416 found = true; 2417 } 2418 } 2419 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2420 2421 if (!found) { 2422 hpdev = new_pcichild_device(hbus, new_desc); 2423 if (!hpdev) 2424 dev_err(&hbus->hdev->device, 2425 "couldn't record a child device.\n"); 2426 } 2427 } 2428 2429 /* Move missing children to a list on the stack. */ 2430 spin_lock_irqsave(&hbus->device_list_lock, flags); 2431 do { 2432 found = false; 2433 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2434 if (hpdev->reported_missing) { 2435 found = true; 2436 put_pcichild(hpdev); 2437 list_move_tail(&hpdev->list_entry, &removed); 2438 break; 2439 } 2440 } 2441 } while (found); 2442 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2443 2444 /* Delete everything that should no longer exist. */ 2445 while (!list_empty(&removed)) { 2446 hpdev = list_first_entry(&removed, struct hv_pci_dev, 2447 list_entry); 2448 list_del(&hpdev->list_entry); 2449 2450 if (hpdev->pci_slot) 2451 pci_destroy_slot(hpdev->pci_slot); 2452 2453 put_pcichild(hpdev); 2454 } 2455 2456 switch (hbus->state) { 2457 case hv_pcibus_installed: 2458 /* 2459 * Tell the core to rescan bus 2460 * because there may have been changes. 
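 * pci_scan_child_bus() only adds functions the PCI core has not seen
 * before, and hv_pci_assign_slots() skips children that already have a
 * sysfs slot, so re-running this block for an unchanged bus is
 * harmless.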
2461 */ 2462 pci_lock_rescan_remove(); 2463 pci_scan_child_bus(hbus->bridge->bus); 2464 hv_pci_assign_numa_node(hbus); 2465 hv_pci_assign_slots(hbus); 2466 pci_unlock_rescan_remove(); 2467 break; 2468 2469 case hv_pcibus_init: 2470 case hv_pcibus_probed: 2471 survey_child_resources(hbus); 2472 break; 2473 2474 default: 2475 break; 2476 } 2477 2478 kfree(dr); 2479 } 2480 2481 /** 2482 * hv_pci_start_relations_work() - Queue work to start device discovery 2483 * @hbus: Root PCI bus, as understood by this driver 2484 * @dr: The list of children returned from host 2485 * 2486 * Return: 0 on success, -errno on failure 2487 */ 2488 static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus, 2489 struct hv_dr_state *dr) 2490 { 2491 struct hv_dr_work *dr_wrk; 2492 unsigned long flags; 2493 bool pending_dr; 2494 2495 if (hbus->state == hv_pcibus_removing) { 2496 dev_info(&hbus->hdev->device, 2497 "PCI VMBus BUS_RELATIONS: ignored\n"); 2498 return -ENOENT; 2499 } 2500 2501 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); 2502 if (!dr_wrk) 2503 return -ENOMEM; 2504 2505 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); 2506 dr_wrk->bus = hbus; 2507 2508 spin_lock_irqsave(&hbus->device_list_lock, flags); 2509 /* 2510 * If pending_dr is true, we have already queued a work, 2511 * which will see the new dr. Otherwise, we need to 2512 * queue a new work. 2513 */ 2514 pending_dr = !list_empty(&hbus->dr_list); 2515 list_add_tail(&dr->list_entry, &hbus->dr_list); 2516 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2517 2518 if (pending_dr) 2519 kfree(dr_wrk); 2520 else 2521 queue_work(hbus->wq, &dr_wrk->wrk); 2522 2523 return 0; 2524 } 2525 2526 /** 2527 * hv_pci_devices_present() - Handle list of new children 2528 * @hbus: Root PCI bus, as understood by this driver 2529 * @relations: Packet from host listing children 2530 * 2531 * Process a new list of devices on the bus. The list of devices is 2532 * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS, 2533 * whenever a new list of devices for this bus appears. 
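 *
 * For example (illustrative), a relations packet advertising a single
 * function at wslot 0 is copied into a freshly allocated hv_dr_state
 * with device_count == 1, queued on hbus->dr_list by
 * hv_pci_start_relations_work(), and later reconciled against
 * hbus->children by pci_devices_present_work().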
2534 */ 2535 static void hv_pci_devices_present(struct hv_pcibus_device *hbus, 2536 struct pci_bus_relations *relations) 2537 { 2538 struct hv_dr_state *dr; 2539 int i; 2540 2541 dr = kzalloc(struct_size(dr, func, relations->device_count), 2542 GFP_NOWAIT); 2543 if (!dr) 2544 return; 2545 2546 dr->device_count = relations->device_count; 2547 for (i = 0; i < dr->device_count; i++) { 2548 dr->func[i].v_id = relations->func[i].v_id; 2549 dr->func[i].d_id = relations->func[i].d_id; 2550 dr->func[i].rev = relations->func[i].rev; 2551 dr->func[i].prog_intf = relations->func[i].prog_intf; 2552 dr->func[i].subclass = relations->func[i].subclass; 2553 dr->func[i].base_class = relations->func[i].base_class; 2554 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2555 dr->func[i].win_slot = relations->func[i].win_slot; 2556 dr->func[i].ser = relations->func[i].ser; 2557 } 2558 2559 if (hv_pci_start_relations_work(hbus, dr)) 2560 kfree(dr); 2561 } 2562 2563 /** 2564 * hv_pci_devices_present2() - Handle list of new children 2565 * @hbus: Root PCI bus, as understood by this driver 2566 * @relations: Packet from host listing children 2567 * 2568 * This function is the v2 version of hv_pci_devices_present() 2569 */ 2570 static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, 2571 struct pci_bus_relations2 *relations) 2572 { 2573 struct hv_dr_state *dr; 2574 int i; 2575 2576 dr = kzalloc(struct_size(dr, func, relations->device_count), 2577 GFP_NOWAIT); 2578 if (!dr) 2579 return; 2580 2581 dr->device_count = relations->device_count; 2582 for (i = 0; i < dr->device_count; i++) { 2583 dr->func[i].v_id = relations->func[i].v_id; 2584 dr->func[i].d_id = relations->func[i].d_id; 2585 dr->func[i].rev = relations->func[i].rev; 2586 dr->func[i].prog_intf = relations->func[i].prog_intf; 2587 dr->func[i].subclass = relations->func[i].subclass; 2588 dr->func[i].base_class = relations->func[i].base_class; 2589 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2590 dr->func[i].win_slot = relations->func[i].win_slot; 2591 dr->func[i].ser = relations->func[i].ser; 2592 dr->func[i].flags = relations->func[i].flags; 2593 dr->func[i].virtual_numa_node = 2594 relations->func[i].virtual_numa_node; 2595 } 2596 2597 if (hv_pci_start_relations_work(hbus, dr)) 2598 kfree(dr); 2599 } 2600 2601 /** 2602 * hv_eject_device_work() - Asynchronously handles ejection 2603 * @work: Work struct embedded in internal device struct 2604 * 2605 * This function handles ejecting a device. Windows will 2606 * attempt to gracefully eject a device, waiting 60 seconds to 2607 * hear back from the guest OS that this completed successfully. 2608 * If this timer expires, the device will be forcibly removed. 2609 */ 2610 static void hv_eject_device_work(struct work_struct *work) 2611 { 2612 struct pci_eject_response *ejct_pkt; 2613 struct hv_pcibus_device *hbus; 2614 struct hv_pci_dev *hpdev; 2615 struct pci_dev *pdev; 2616 unsigned long flags; 2617 int wslot; 2618 struct { 2619 struct pci_packet pkt; 2620 u8 buffer[sizeof(struct pci_eject_response)]; 2621 } ctxt; 2622 2623 hpdev = container_of(work, struct hv_pci_dev, wrk); 2624 hbus = hpdev->hbus; 2625 2626 WARN_ON(hpdev->state != hv_pcichild_ejecting); 2627 2628 /* 2629 * Ejection can come before or after the PCI bus has been set up, so 2630 * attempt to find it and tear down the bus state, if it exists. This 2631 * must be done without constructs like pci_domain_nr(hbus->bridge->bus) 2632 * because hbus->bridge->bus may not exist yet. 
2633 */ 2634 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot); 2635 pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot); 2636 if (pdev) { 2637 pci_lock_rescan_remove(); 2638 pci_stop_and_remove_bus_device(pdev); 2639 pci_dev_put(pdev); 2640 pci_unlock_rescan_remove(); 2641 } 2642 2643 spin_lock_irqsave(&hbus->device_list_lock, flags); 2644 list_del(&hpdev->list_entry); 2645 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2646 2647 if (hpdev->pci_slot) 2648 pci_destroy_slot(hpdev->pci_slot); 2649 2650 memset(&ctxt, 0, sizeof(ctxt)); 2651 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; 2652 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; 2653 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; 2654 vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, 2655 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt, 2656 VM_PKT_DATA_INBAND, 0); 2657 2658 /* For the get_pcichild() in hv_pci_eject_device() */ 2659 put_pcichild(hpdev); 2660 /* For the two refs got in new_pcichild_device() */ 2661 put_pcichild(hpdev); 2662 put_pcichild(hpdev); 2663 /* hpdev has been freed. Do not use it any more. */ 2664 } 2665 2666 /** 2667 * hv_pci_eject_device() - Handles device ejection 2668 * @hpdev: Internal device tracking struct 2669 * 2670 * This function is invoked when an ejection packet arrives. It 2671 * just schedules work so that we don't re-enter the packet 2672 * delivery code handling the ejection. 2673 */ 2674 static void hv_pci_eject_device(struct hv_pci_dev *hpdev) 2675 { 2676 struct hv_pcibus_device *hbus = hpdev->hbus; 2677 struct hv_device *hdev = hbus->hdev; 2678 2679 if (hbus->state == hv_pcibus_removing) { 2680 dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); 2681 return; 2682 } 2683 2684 hpdev->state = hv_pcichild_ejecting; 2685 get_pcichild(hpdev); 2686 INIT_WORK(&hpdev->wrk, hv_eject_device_work); 2687 queue_work(hbus->wq, &hpdev->wrk); 2688 } 2689 2690 /** 2691 * hv_pci_onchannelcallback() - Handles incoming packets 2692 * @context: Internal bus tracking struct 2693 * 2694 * This function is invoked whenever the host sends a packet to 2695 * this channel (which is private to this root PCI bus). 2696 */ 2697 static void hv_pci_onchannelcallback(void *context) 2698 { 2699 const int packet_size = 0x100; 2700 int ret; 2701 struct hv_pcibus_device *hbus = context; 2702 u32 bytes_recvd; 2703 u64 req_id; 2704 struct vmpacket_descriptor *desc; 2705 unsigned char *buffer; 2706 int bufferlen = packet_size; 2707 struct pci_packet *comp_packet; 2708 struct pci_response *response; 2709 struct pci_incoming_message *new_message; 2710 struct pci_bus_relations *bus_rel; 2711 struct pci_bus_relations2 *bus_rel2; 2712 struct pci_dev_inval_block *inval; 2713 struct pci_dev_incoming *dev_message; 2714 struct hv_pci_dev *hpdev; 2715 2716 buffer = kmalloc(bufferlen, GFP_ATOMIC); 2717 if (!buffer) 2718 return; 2719 2720 while (1) { 2721 ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer, 2722 bufferlen, &bytes_recvd, &req_id); 2723 2724 if (ret == -ENOBUFS) { 2725 kfree(buffer); 2726 /* Handle large packet */ 2727 bufferlen = bytes_recvd; 2728 buffer = kmalloc(bytes_recvd, GFP_ATOMIC); 2729 if (!buffer) 2730 return; 2731 continue; 2732 } 2733 2734 /* Zero length indicates there are no more packets. */ 2735 if (ret || !bytes_recvd) 2736 break; 2737 2738 /* 2739 * All incoming packets must be at least as large as a 2740 * response. 
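 * Anything smaller is malformed: the completion path below casts the
 * buffer to struct pci_response before invoking the completion
 * function, so short packets are silently skipped.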
2741 */ 2742 if (bytes_recvd <= sizeof(struct pci_response)) 2743 continue; 2744 desc = (struct vmpacket_descriptor *)buffer; 2745 2746 switch (desc->type) { 2747 case VM_PKT_COMP: 2748 2749 /* 2750 * The host is trusted, and thus it's safe to interpret 2751 * this transaction ID as a pointer. 2752 */ 2753 comp_packet = (struct pci_packet *)req_id; 2754 response = (struct pci_response *)buffer; 2755 comp_packet->completion_func(comp_packet->compl_ctxt, 2756 response, 2757 bytes_recvd); 2758 break; 2759 2760 case VM_PKT_DATA_INBAND: 2761 2762 new_message = (struct pci_incoming_message *)buffer; 2763 switch (new_message->message_type.type) { 2764 case PCI_BUS_RELATIONS: 2765 2766 bus_rel = (struct pci_bus_relations *)buffer; 2767 if (bytes_recvd < 2768 struct_size(bus_rel, func, 2769 bus_rel->device_count)) { 2770 dev_err(&hbus->hdev->device, 2771 "bus relations too small\n"); 2772 break; 2773 } 2774 2775 hv_pci_devices_present(hbus, bus_rel); 2776 break; 2777 2778 case PCI_BUS_RELATIONS2: 2779 2780 bus_rel2 = (struct pci_bus_relations2 *)buffer; 2781 if (bytes_recvd < 2782 struct_size(bus_rel2, func, 2783 bus_rel2->device_count)) { 2784 dev_err(&hbus->hdev->device, 2785 "bus relations v2 too small\n"); 2786 break; 2787 } 2788 2789 hv_pci_devices_present2(hbus, bus_rel2); 2790 break; 2791 2792 case PCI_EJECT: 2793 2794 dev_message = (struct pci_dev_incoming *)buffer; 2795 hpdev = get_pcichild_wslot(hbus, 2796 dev_message->wslot.slot); 2797 if (hpdev) { 2798 hv_pci_eject_device(hpdev); 2799 put_pcichild(hpdev); 2800 } 2801 break; 2802 2803 case PCI_INVALIDATE_BLOCK: 2804 2805 inval = (struct pci_dev_inval_block *)buffer; 2806 hpdev = get_pcichild_wslot(hbus, 2807 inval->wslot.slot); 2808 if (hpdev) { 2809 if (hpdev->block_invalidate) { 2810 hpdev->block_invalidate( 2811 hpdev->invalidate_context, 2812 inval->block_mask); 2813 } 2814 put_pcichild(hpdev); 2815 } 2816 break; 2817 2818 default: 2819 dev_warn(&hbus->hdev->device, 2820 "Unimplemented protocol message %x\n", 2821 new_message->message_type.type); 2822 break; 2823 } 2824 break; 2825 2826 default: 2827 dev_err(&hbus->hdev->device, 2828 "unhandled packet type %d, tid %llx len %d\n", 2829 desc->type, req_id, bytes_recvd); 2830 break; 2831 } 2832 } 2833 2834 kfree(buffer); 2835 } 2836 2837 /** 2838 * hv_pci_protocol_negotiation() - Set up protocol 2839 * @hdev: VMBus's tracking struct for this root PCI bus. 2840 * @version: Array of supported channel protocol versions in 2841 * the order of probing - highest go first. 2842 * @num_version: Number of elements in the version array. 2843 * 2844 * This driver is intended to support running on Windows 10 2845 * (server) and later versions. It will not run on earlier 2846 * versions, as they assume that many of the operations which 2847 * Linux needs accomplished with a spinlock held were done via 2848 * asynchronous messaging via VMBus. Windows 10 increases the 2849 * surface area of PCI emulation so that these actions can take 2850 * place by suspending a virtual processor for their duration. 2851 * 2852 * This function negotiates the channel protocol version, 2853 * failing if the host doesn't support the necessary protocol 2854 * level. 
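 *
 * For example (illustrative), probing with {1.4, 1.3, 1.2, 1.1} against
 * a host that only implements protocol 1.3 gets STATUS_REVISION_MISMATCH
 * back for 1.4, retries with 1.3 and records that as
 * hbus->protocol_version; any other failure status aborts with -EPROTO.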
2855 */ 2856 static int hv_pci_protocol_negotiation(struct hv_device *hdev, 2857 enum pci_protocol_version_t version[], 2858 int num_version) 2859 { 2860 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2861 struct pci_version_request *version_req; 2862 struct hv_pci_compl comp_pkt; 2863 struct pci_packet *pkt; 2864 int ret; 2865 int i; 2866 2867 /* 2868 * Initiate the handshake with the host and negotiate 2869 * a version that the host can support. We start with the 2870 * highest version number and go down if the host cannot 2871 * support it. 2872 */ 2873 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL); 2874 if (!pkt) 2875 return -ENOMEM; 2876 2877 init_completion(&comp_pkt.host_event); 2878 pkt->completion_func = hv_pci_generic_compl; 2879 pkt->compl_ctxt = &comp_pkt; 2880 version_req = (struct pci_version_request *)&pkt->message; 2881 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; 2882 2883 for (i = 0; i < num_version; i++) { 2884 version_req->protocol_version = version[i]; 2885 ret = vmbus_sendpacket(hdev->channel, version_req, 2886 sizeof(struct pci_version_request), 2887 (unsigned long)pkt, VM_PKT_DATA_INBAND, 2888 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2889 if (!ret) 2890 ret = wait_for_response(hdev, &comp_pkt.host_event); 2891 2892 if (ret) { 2893 dev_err(&hdev->device, 2894 "PCI Pass-through VSP failed to request version: %d", 2895 ret); 2896 goto exit; 2897 } 2898 2899 if (comp_pkt.completion_status >= 0) { 2900 hbus->protocol_version = version[i]; 2901 dev_info(&hdev->device, 2902 "PCI VMBus probing: Using version %#x\n", 2903 hbus->protocol_version); 2904 goto exit; 2905 } 2906 2907 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) { 2908 dev_err(&hdev->device, 2909 "PCI Pass-through VSP failed version request: %#x", 2910 comp_pkt.completion_status); 2911 ret = -EPROTO; 2912 goto exit; 2913 } 2914 2915 reinit_completion(&comp_pkt.host_event); 2916 } 2917 2918 dev_err(&hdev->device, 2919 "PCI pass-through VSP failed to find supported version"); 2920 ret = -EPROTO; 2921 2922 exit: 2923 kfree(pkt); 2924 return ret; 2925 } 2926 2927 /** 2928 * hv_pci_free_bridge_windows() - Release memory regions for the 2929 * bus 2930 * @hbus: Root PCI bus, as understood by this driver 2931 */ 2932 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) 2933 { 2934 /* 2935 * Set the resources back to the way they looked when they 2936 * were allocated by setting IORESOURCE_BUSY again. 2937 */ 2938 2939 if (hbus->low_mmio_space && hbus->low_mmio_res) { 2940 hbus->low_mmio_res->flags |= IORESOURCE_BUSY; 2941 vmbus_free_mmio(hbus->low_mmio_res->start, 2942 resource_size(hbus->low_mmio_res)); 2943 } 2944 2945 if (hbus->high_mmio_space && hbus->high_mmio_res) { 2946 hbus->high_mmio_res->flags |= IORESOURCE_BUSY; 2947 vmbus_free_mmio(hbus->high_mmio_res->start, 2948 resource_size(hbus->high_mmio_res)); 2949 } 2950 } 2951 2952 /** 2953 * hv_pci_allocate_bridge_windows() - Allocate memory regions 2954 * for the bus 2955 * @hbus: Root PCI bus, as understood by this driver 2956 * 2957 * This function calls vmbus_allocate_mmio(), which is itself a 2958 * bit of a compromise. Ideally, we might change the pnp layer 2959 * in the kernel such that it comprehends either PCI devices 2960 * which are "grandchildren of ACPI," with some intermediate bus 2961 * node (in this case, VMBus) or change it such that it 2962 * understands VMBus. The pnp layer, however, has been declared 2963 * deprecated, and not subject to change. 
2964 * 2965 * The workaround, implemented here, is to ask VMBus to allocate 2966 * MMIO space for this bus. VMBus itself knows which ranges are 2967 * appropriate by looking at its own ACPI objects. Then, after 2968 * these ranges are claimed, they're modified to look like they 2969 * would have looked if the ACPI and pnp code had allocated 2970 * bridge windows. These descriptors have to exist in this form 2971 * in order to satisfy the code which will get invoked when the 2972 * endpoint PCI function driver calls request_mem_region() or 2973 * request_mem_region_exclusive(). 2974 * 2975 * Return: 0 on success, -errno on failure 2976 */ 2977 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) 2978 { 2979 resource_size_t align; 2980 int ret; 2981 2982 if (hbus->low_mmio_space) { 2983 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2984 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0, 2985 (u64)(u32)0xffffffff, 2986 hbus->low_mmio_space, 2987 align, false); 2988 if (ret) { 2989 dev_err(&hbus->hdev->device, 2990 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n", 2991 hbus->low_mmio_space); 2992 return ret; 2993 } 2994 2995 /* Modify this resource to become a bridge window. */ 2996 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW; 2997 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY; 2998 pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res); 2999 } 3000 3001 if (hbus->high_mmio_space) { 3002 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space)); 3003 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev, 3004 0x100000000, -1, 3005 hbus->high_mmio_space, align, 3006 false); 3007 if (ret) { 3008 dev_err(&hbus->hdev->device, 3009 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n", 3010 hbus->high_mmio_space); 3011 goto release_low_mmio; 3012 } 3013 3014 /* Modify this resource to become a bridge window. */ 3015 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW; 3016 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY; 3017 pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res); 3018 } 3019 3020 return 0; 3021 3022 release_low_mmio: 3023 if (hbus->low_mmio_res) { 3024 vmbus_free_mmio(hbus->low_mmio_res->start, 3025 resource_size(hbus->low_mmio_res)); 3026 } 3027 3028 return ret; 3029 } 3030 3031 /** 3032 * hv_allocate_config_window() - Find MMIO space for PCI Config 3033 * @hbus: Root PCI bus, as understood by this driver 3034 * 3035 * This function claims memory-mapped I/O space for accessing 3036 * configuration space for the functions on this bus. 3037 * 3038 * Return: 0 on success, -errno on failure 3039 */ 3040 static int hv_allocate_config_window(struct hv_pcibus_device *hbus) 3041 { 3042 int ret; 3043 3044 /* 3045 * Set up a region of MMIO space to use for accessing configuration 3046 * space. 3047 */ 3048 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1, 3049 PCI_CONFIG_MMIO_LENGTH, 0x1000, false); 3050 if (ret) 3051 return ret; 3052 3053 /* 3054 * vmbus_allocate_mmio() gets used for allocating both device endpoint 3055 * resource claims (those which cannot be overlapped) and the ranges 3056 * which are valid for the children of this bus, which are intended 3057 * to be overlapped by those children. Set the flag on this claim 3058 * meaning that this region can't be overlapped. 
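 * Contrast this with hv_pci_allocate_bridge_windows(), which clears
 * IORESOURCE_BUSY on the ranges it hands to the PCI core precisely so
 * that child BARs can be placed inside them.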
3059 */ 3060 3061 hbus->mem_config->flags |= IORESOURCE_BUSY; 3062 3063 return 0; 3064 } 3065 3066 static void hv_free_config_window(struct hv_pcibus_device *hbus) 3067 { 3068 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); 3069 } 3070 3071 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); 3072 3073 /** 3074 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state 3075 * @hdev: VMBus's tracking struct for this root PCI bus 3076 * 3077 * Return: 0 on success, -errno on failure 3078 */ 3079 static int hv_pci_enter_d0(struct hv_device *hdev) 3080 { 3081 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3082 struct pci_bus_d0_entry *d0_entry; 3083 struct hv_pci_compl comp_pkt; 3084 struct pci_packet *pkt; 3085 int ret; 3086 3087 /* 3088 * Tell the host that the bus is ready to use, and moved into the 3089 * powered-on state. This includes telling the host which region 3090 * of memory-mapped I/O space has been chosen for configuration space 3091 * access. 3092 */ 3093 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL); 3094 if (!pkt) 3095 return -ENOMEM; 3096 3097 init_completion(&comp_pkt.host_event); 3098 pkt->completion_func = hv_pci_generic_compl; 3099 pkt->compl_ctxt = &comp_pkt; 3100 d0_entry = (struct pci_bus_d0_entry *)&pkt->message; 3101 d0_entry->message_type.type = PCI_BUS_D0ENTRY; 3102 d0_entry->mmio_base = hbus->mem_config->start; 3103 3104 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry), 3105 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3106 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3107 if (!ret) 3108 ret = wait_for_response(hdev, &comp_pkt.host_event); 3109 3110 if (ret) 3111 goto exit; 3112 3113 if (comp_pkt.completion_status < 0) { 3114 dev_err(&hdev->device, 3115 "PCI Pass-through VSP failed D0 Entry with status %x\n", 3116 comp_pkt.completion_status); 3117 ret = -EPROTO; 3118 goto exit; 3119 } 3120 3121 ret = 0; 3122 3123 exit: 3124 kfree(pkt); 3125 return ret; 3126 } 3127 3128 /** 3129 * hv_pci_query_relations() - Ask host to send list of child 3130 * devices 3131 * @hdev: VMBus's tracking struct for this root PCI bus 3132 * 3133 * Return: 0 on success, -errno on failure 3134 */ 3135 static int hv_pci_query_relations(struct hv_device *hdev) 3136 { 3137 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3138 struct pci_message message; 3139 struct completion comp; 3140 int ret; 3141 3142 /* Ask the host to send along the list of child devices */ 3143 init_completion(&comp); 3144 if (cmpxchg(&hbus->survey_event, NULL, &comp)) 3145 return -ENOTEMPTY; 3146 3147 memset(&message, 0, sizeof(message)); 3148 message.type = PCI_QUERY_BUS_RELATIONS; 3149 3150 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message), 3151 0, VM_PKT_DATA_INBAND, 0); 3152 if (!ret) 3153 ret = wait_for_response(hdev, &comp); 3154 3155 return ret; 3156 } 3157 3158 /** 3159 * hv_send_resources_allocated() - Report local resource choices 3160 * @hdev: VMBus's tracking struct for this root PCI bus 3161 * 3162 * The host OS is expecting to be sent a request as a message 3163 * which contains all the resources that the device will use. 3164 * The response contains those same resources, "translated" 3165 * which is to say, the values which should be used by the 3166 * hardware, when it delivers an interrupt. (MMIO resources are 3167 * used in local terms.) This is nice for Windows, and lines up 3168 * with the FDO/PDO split, which doesn't exist in Linux. 
Linux 3169 * is deeply expecting to scan an emulated PCI configuration 3170 * space. So this message is sent here only to drive the state 3171 * machine on the host forward. 3172 * 3173 * Return: 0 on success, -errno on failure 3174 */ 3175 static int hv_send_resources_allocated(struct hv_device *hdev) 3176 { 3177 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3178 struct pci_resources_assigned *res_assigned; 3179 struct pci_resources_assigned2 *res_assigned2; 3180 struct hv_pci_compl comp_pkt; 3181 struct hv_pci_dev *hpdev; 3182 struct pci_packet *pkt; 3183 size_t size_res; 3184 int wslot; 3185 int ret; 3186 3187 size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) 3188 ? sizeof(*res_assigned) : sizeof(*res_assigned2); 3189 3190 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); 3191 if (!pkt) 3192 return -ENOMEM; 3193 3194 ret = 0; 3195 3196 for (wslot = 0; wslot < 256; wslot++) { 3197 hpdev = get_pcichild_wslot(hbus, wslot); 3198 if (!hpdev) 3199 continue; 3200 3201 memset(pkt, 0, sizeof(*pkt) + size_res); 3202 init_completion(&comp_pkt.host_event); 3203 pkt->completion_func = hv_pci_generic_compl; 3204 pkt->compl_ctxt = &comp_pkt; 3205 3206 if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { 3207 res_assigned = 3208 (struct pci_resources_assigned *)&pkt->message; 3209 res_assigned->message_type.type = 3210 PCI_RESOURCES_ASSIGNED; 3211 res_assigned->wslot.slot = hpdev->desc.win_slot.slot; 3212 } else { 3213 res_assigned2 = 3214 (struct pci_resources_assigned2 *)&pkt->message; 3215 res_assigned2->message_type.type = 3216 PCI_RESOURCES_ASSIGNED2; 3217 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot; 3218 } 3219 put_pcichild(hpdev); 3220 3221 ret = vmbus_sendpacket(hdev->channel, &pkt->message, 3222 size_res, (unsigned long)pkt, 3223 VM_PKT_DATA_INBAND, 3224 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3225 if (!ret) 3226 ret = wait_for_response(hdev, &comp_pkt.host_event); 3227 if (ret) 3228 break; 3229 3230 if (comp_pkt.completion_status < 0) { 3231 ret = -EPROTO; 3232 dev_err(&hdev->device, 3233 "resource allocated returned 0x%x", 3234 comp_pkt.completion_status); 3235 break; 3236 } 3237 3238 hbus->wslot_res_allocated = wslot; 3239 } 3240 3241 kfree(pkt); 3242 return ret; 3243 } 3244 3245 /** 3246 * hv_send_resources_released() - Report local resources 3247 * released 3248 * @hdev: VMBus's tracking struct for this root PCI bus 3249 * 3250 * Return: 0 on success, -errno on failure 3251 */ 3252 static int hv_send_resources_released(struct hv_device *hdev) 3253 { 3254 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3255 struct pci_child_message pkt; 3256 struct hv_pci_dev *hpdev; 3257 int wslot; 3258 int ret; 3259 3260 for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { 3261 hpdev = get_pcichild_wslot(hbus, wslot); 3262 if (!hpdev) 3263 continue; 3264 3265 memset(&pkt, 0, sizeof(pkt)); 3266 pkt.message_type.type = PCI_RESOURCES_RELEASED; 3267 pkt.wslot.slot = hpdev->desc.win_slot.slot; 3268 3269 put_pcichild(hpdev); 3270 3271 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0, 3272 VM_PKT_DATA_INBAND, 0); 3273 if (ret) 3274 return ret; 3275 3276 hbus->wslot_res_allocated = wslot - 1; 3277 } 3278 3279 hbus->wslot_res_allocated = -1; 3280 3281 return 0; 3282 } 3283 3284 #define HVPCI_DOM_MAP_SIZE (64 * 1024) 3285 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); 3286 3287 /* 3288 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 3289 * as invalid for passthrough PCI devices of this driver. 
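 * hv_get_dom_num() below treats the requested number as a hint: if that
 * bit is already set in hvpci_dom_map it falls back to the first free
 * bit, and hv_pci_probe() logs the collision.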
3290 */ 3291 #define HVPCI_DOM_INVALID 0 3292 3293 /** 3294 * hv_get_dom_num() - Get a valid PCI domain number 3295 * Check if the PCI domain number is in use, and return another number if 3296 * it is in use. 3297 * 3298 * @dom: Requested domain number 3299 * 3300 * return: domain number on success, HVPCI_DOM_INVALID on failure 3301 */ 3302 static u16 hv_get_dom_num(u16 dom) 3303 { 3304 unsigned int i; 3305 3306 if (test_and_set_bit(dom, hvpci_dom_map) == 0) 3307 return dom; 3308 3309 for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { 3310 if (test_and_set_bit(i, hvpci_dom_map) == 0) 3311 return i; 3312 } 3313 3314 return HVPCI_DOM_INVALID; 3315 } 3316 3317 /** 3318 * hv_put_dom_num() - Mark the PCI domain number as free 3319 * @dom: Domain number to be freed 3320 */ 3321 static void hv_put_dom_num(u16 dom) 3322 { 3323 clear_bit(dom, hvpci_dom_map); 3324 } 3325 3326 /** 3327 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus 3328 * @hdev: VMBus's tracking struct for this root PCI bus 3329 * @dev_id: Identifies the device itself 3330 * 3331 * Return: 0 on success, -errno on failure 3332 */ 3333 static int hv_pci_probe(struct hv_device *hdev, 3334 const struct hv_vmbus_device_id *dev_id) 3335 { 3336 struct pci_host_bridge *bridge; 3337 struct hv_pcibus_device *hbus; 3338 u16 dom_req, dom; 3339 char *name; 3340 bool enter_d0_retry = true; 3341 int ret; 3342 3343 /* 3344 * hv_pcibus_device contains the hypercall arguments for retargeting in 3345 * hv_irq_unmask(). Those must not cross a page boundary. 3346 */ 3347 BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE); 3348 3349 bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); 3350 if (!bridge) 3351 return -ENOMEM; 3352 3353 /* 3354 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural 3355 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate 3356 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and 3357 * alignment of hbus is important because hbus's field 3358 * retarget_msi_interrupt_params must not cross a 4KB page boundary. 3359 * 3360 * Here we prefer kzalloc to get_zeroed_page(), because a buffer 3361 * allocated by the latter is not tracked and scanned by kmemleak, and 3362 * hence kmemleak reports the pointer contained in the hbus buffer 3363 * (i.e. the hpdev struct, which is created in new_pcichild_device() and 3364 * is tracked by hbus->children) as memory leak (false positive). 3365 * 3366 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be 3367 * used to allocate the hbus buffer and we can avoid the kmemleak false 3368 * positive by using kmemleak_alloc() and kmemleak_free() to ask 3369 * kmemleak to track and scan the hbus buffer. 3370 */ 3371 hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 3372 if (!hbus) 3373 return -ENOMEM; 3374 3375 hbus->bridge = bridge; 3376 hbus->state = hv_pcibus_init; 3377 hbus->wslot_res_allocated = -1; 3378 3379 /* 3380 * The PCI bus "domain" is what is called "segment" in ACPI and other 3381 * specs. Pull it from the instance ID, to get something usually 3382 * unique. In rare cases of collision, we will find out another number 3383 * not in use. 3384 * 3385 * Note that, since this code only runs in a Hyper-V VM, Hyper-V 3386 * together with this guest driver can guarantee that (1) The only 3387 * domain used by Gen1 VMs for something that looks like a physical 3388 * PCI bus (which is actually emulated by the hypervisor) is domain 0. 
(2) There will be no overlap between domains (after fixing possible 3390 * collisions) in the same VM. 3391 */ 3392 dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; 3393 dom = hv_get_dom_num(dom_req); 3394 3395 if (dom == HVPCI_DOM_INVALID) { 3396 dev_err(&hdev->device, 3397 "Unable to use dom# 0x%x or other numbers", dom_req); 3398 ret = -EINVAL; 3399 goto free_bus; 3400 } 3401 3402 if (dom != dom_req) 3403 dev_info(&hdev->device, 3404 "PCI dom# 0x%x has collision, using 0x%x", 3405 dom_req, dom); 3406 3407 hbus->bridge->domain_nr = dom; 3408 #ifdef CONFIG_X86 3409 hbus->sysdata.domain = dom; 3410 #elif defined(CONFIG_ARM64) 3411 /* 3412 * Set the PCI bus parent to be the corresponding VMbus 3413 * device. Then the VMbus device will be assigned as the 3414 * ACPI companion in pcibios_root_bridge_prepare() and 3415 * pci_dma_configure() will propagate device coherence 3416 * information to devices created on the bus. 3417 */ 3418 hbus->sysdata.parent = hdev->device.parent; 3419 #endif 3420 3421 hbus->hdev = hdev; 3422 INIT_LIST_HEAD(&hbus->children); 3423 INIT_LIST_HEAD(&hbus->dr_list); 3424 spin_lock_init(&hbus->config_lock); 3425 spin_lock_init(&hbus->device_list_lock); 3426 spin_lock_init(&hbus->retarget_msi_interrupt_lock); 3427 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0, 3428 hbus->bridge->domain_nr); 3429 if (!hbus->wq) { 3430 ret = -ENOMEM; 3431 goto free_dom; 3432 } 3433 3434 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 3435 hv_pci_onchannelcallback, hbus); 3436 if (ret) 3437 goto destroy_wq; 3438 3439 hv_set_drvdata(hdev, hbus); 3440 3441 ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, 3442 ARRAY_SIZE(pci_protocol_versions)); 3443 if (ret) 3444 goto close; 3445 3446 ret = hv_allocate_config_window(hbus); 3447 if (ret) 3448 goto close; 3449 3450 hbus->cfg_addr = ioremap(hbus->mem_config->start, 3451 PCI_CONFIG_MMIO_LENGTH); 3452 if (!hbus->cfg_addr) { 3453 dev_err(&hdev->device, 3454 "Unable to map a virtual address for config space\n"); 3455 ret = -ENOMEM; 3456 goto free_config; 3457 } 3458 3459 name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); 3460 if (!name) { 3461 ret = -ENOMEM; 3462 goto unmap; 3463 } 3464 3465 hbus->fwnode = irq_domain_alloc_named_fwnode(name); 3466 kfree(name); 3467 if (!hbus->fwnode) { 3468 ret = -ENOMEM; 3469 goto unmap; 3470 } 3471 3472 ret = hv_pcie_init_irq_domain(hbus); 3473 if (ret) 3474 goto free_fwnode; 3475 3476 retry: 3477 ret = hv_pci_query_relations(hdev); 3478 if (ret) 3479 goto free_irq_domain; 3480 3481 ret = hv_pci_enter_d0(hdev); 3482 /* 3483 * In certain cases (e.g. kdump), the PCI device of interest was 3484 * not cleanly shut down and its resources are still held on the 3485 * host side, so the host could return an invalid device status. 3486 * We need to explicitly request that the host release the 3487 * resources and then try to enter D0 again. 3488 * Since the hv_pci_bus_exit() call releases the structures of 3489 * all the child devices, we need to restart the retry from the 3490 * hv_pci_query_relations() call, asking the host to send the 3491 * synchronous child device relations message before this 3492 * information is needed by the later 3493 * hv_send_resources_allocated() call. 3494 */ 3495 if (ret == -EPROTO && enter_d0_retry) { 3496 enter_d0_retry = false; 3497 3498 dev_err(&hdev->device, "Retrying D0 Entry\n"); 3499 3500 /* 3501 * hv_pci_bus_exit() calls hv_send_resources_released() 3502 * to free up the resources of its child devices.
3503 * In the kdump kernel we need to set the 3504 * wslot_res_allocated to 255 so it scans all child 3505 * devices to release resources allocated in the 3506 * normal kernel before panic happened. 3507 */ 3508 hbus->wslot_res_allocated = 255; 3509 ret = hv_pci_bus_exit(hdev, true); 3510 3511 if (ret == 0) 3512 goto retry; 3513 3514 dev_err(&hdev->device, 3515 "Retrying D0 failed with ret %d\n", ret); 3516 } 3517 if (ret) 3518 goto free_irq_domain; 3519 3520 ret = hv_pci_allocate_bridge_windows(hbus); 3521 if (ret) 3522 goto exit_d0; 3523 3524 ret = hv_send_resources_allocated(hdev); 3525 if (ret) 3526 goto free_windows; 3527 3528 prepopulate_bars(hbus); 3529 3530 hbus->state = hv_pcibus_probed; 3531 3532 ret = create_root_hv_pci_bus(hbus); 3533 if (ret) 3534 goto free_windows; 3535 3536 return 0; 3537 3538 free_windows: 3539 hv_pci_free_bridge_windows(hbus); 3540 exit_d0: 3541 (void) hv_pci_bus_exit(hdev, true); 3542 free_irq_domain: 3543 irq_domain_remove(hbus->irq_domain); 3544 free_fwnode: 3545 irq_domain_free_fwnode(hbus->fwnode); 3546 unmap: 3547 iounmap(hbus->cfg_addr); 3548 free_config: 3549 hv_free_config_window(hbus); 3550 close: 3551 vmbus_close(hdev->channel); 3552 destroy_wq: 3553 destroy_workqueue(hbus->wq); 3554 free_dom: 3555 hv_put_dom_num(hbus->bridge->domain_nr); 3556 free_bus: 3557 kfree(hbus); 3558 return ret; 3559 } 3560 3561 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) 3562 { 3563 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3564 struct { 3565 struct pci_packet teardown_packet; 3566 u8 buffer[sizeof(struct pci_message)]; 3567 } pkt; 3568 struct hv_pci_compl comp_pkt; 3569 struct hv_pci_dev *hpdev, *tmp; 3570 unsigned long flags; 3571 int ret; 3572 3573 /* 3574 * After the host sends the RESCIND_CHANNEL message, it doesn't 3575 * access the per-channel ringbuffer any longer. 
*/ 3577 if (hdev->channel->rescind) 3578 return 0; 3579 3580 if (!keep_devs) { 3581 struct list_head removed; 3582 3583 /* Move all present children to the list on stack */ 3584 INIT_LIST_HEAD(&removed); 3585 spin_lock_irqsave(&hbus->device_list_lock, flags); 3586 list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) 3587 list_move_tail(&hpdev->list_entry, &removed); 3588 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 3589 3590 /* Remove all children in the list */ 3591 list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { 3592 list_del(&hpdev->list_entry); 3593 if (hpdev->pci_slot) 3594 pci_destroy_slot(hpdev->pci_slot); 3595 /* For the two refs got in new_pcichild_device() */ 3596 put_pcichild(hpdev); 3597 put_pcichild(hpdev); 3598 } 3599 } 3600 3601 ret = hv_send_resources_released(hdev); 3602 if (ret) { 3603 dev_err(&hdev->device, 3604 "Couldn't send resources released packet(s)\n"); 3605 return ret; 3606 } 3607 3608 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); 3609 init_completion(&comp_pkt.host_event); 3610 pkt.teardown_packet.completion_func = hv_pci_generic_compl; 3611 pkt.teardown_packet.compl_ctxt = &comp_pkt; 3612 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; 3613 3614 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message, 3615 sizeof(struct pci_message), 3616 (unsigned long)&pkt.teardown_packet, 3617 VM_PKT_DATA_INBAND, 3618 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3619 if (ret) 3620 return ret; 3621 3622 if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) 3623 return -ETIMEDOUT; 3624 3625 return 0; 3626 } 3627 3628 /** 3629 * hv_pci_remove() - Remove routine for this VMBus channel 3630 * @hdev: VMBus's tracking struct for this root PCI bus 3631 * 3632 * Return: 0 on success, -errno on failure 3633 */ 3634 static int hv_pci_remove(struct hv_device *hdev) 3635 { 3636 struct hv_pcibus_device *hbus; 3637 int ret; 3638 3639 hbus = hv_get_drvdata(hdev); 3640 if (hbus->state == hv_pcibus_installed) { 3641 tasklet_disable(&hdev->channel->callback_event); 3642 hbus->state = hv_pcibus_removing; 3643 tasklet_enable(&hdev->channel->callback_event); 3644 destroy_workqueue(hbus->wq); 3645 hbus->wq = NULL; 3646 /* 3647 * At this point, no work is running or can be scheduled 3648 * on hbus->wq. We can't race with hv_pci_devices_present() 3649 * or hv_pci_eject_device(), so it's safe to proceed. 3650 */ 3651 3652 /* Remove the bus from PCI's point of view. */ 3653 pci_lock_rescan_remove(); 3654 pci_stop_root_bus(hbus->bridge->bus); 3655 hv_pci_remove_slots(hbus); 3656 pci_remove_root_bus(hbus->bridge->bus); 3657 pci_unlock_rescan_remove(); 3658 } 3659 3660 ret = hv_pci_bus_exit(hdev, false); 3661 3662 vmbus_close(hdev->channel); 3663 3664 iounmap(hbus->cfg_addr); 3665 hv_free_config_window(hbus); 3666 hv_pci_free_bridge_windows(hbus); 3667 irq_domain_remove(hbus->irq_domain); 3668 irq_domain_free_fwnode(hbus->fwnode); 3669 3670 hv_put_dom_num(hbus->bridge->domain_nr); 3671 3672 kfree(hbus); 3673 return ret; 3674 } 3675 3676 static int hv_pci_suspend(struct hv_device *hdev) 3677 { 3678 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3679 enum hv_pcibus_state old_state; 3680 int ret; 3681 3682 /* 3683 * hv_pci_suspend() must make sure there are no pending work items 3684 * before calling vmbus_close(), since it runs in a process context 3685 * as a callback in dpm_suspend().
static int hv_pci_suspend(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	enum hv_pcibus_state old_state;
	int ret;

	/*
	 * hv_pci_suspend() must make sure there are no pending work items
	 * before calling vmbus_close(), since it runs in a process context
	 * as a callback in dpm_suspend(). When it starts to run, the channel
	 * callback hv_pci_onchannelcallback(), which runs in a tasklet
	 * context, can still be running concurrently and scheduling new work
	 * items onto hbus->wq in hv_pci_devices_present() and
	 * hv_pci_eject_device(), and the work item handlers can access the
	 * vmbus channel, which hv_pci_suspend() may be closing at the same
	 * time, e.g. the work item handler pci_devices_present_work() ->
	 * new_pcichild_device() writes to the vmbus channel.
	 *
	 * To eliminate the race, hv_pci_suspend() disables the channel
	 * callback tasklet, sets hbus->state to hv_pcibus_removing, and
	 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
	 * it knows that no new work item can be scheduled, and then it
	 * flushes hbus->wq and safely closes the vmbus channel.
	 */
	tasklet_disable(&hdev->channel->callback_event);

	/* Change the hbus state to prevent new work items. */
	old_state = hbus->state;
	if (hbus->state == hv_pcibus_installed)
		hbus->state = hv_pcibus_removing;

	tasklet_enable(&hdev->channel->callback_event);

	if (old_state != hv_pcibus_installed)
		return -EINVAL;

	flush_workqueue(hbus->wq);

	ret = hv_pci_bus_exit(hdev, true);
	if (ret)
		return ret;

	vmbus_close(hdev->channel);

	return 0;
}

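/*
 * pci_walk_bus() callback: recompose the MSI/MSI-X message for every
 * associated MSI descriptor of one device, so that the hypervisor
 * restores the corresponding interrupt mappings (see
 * hv_pci_restore_msi_state() below).
 */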
static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
{
	struct irq_data *irq_data;
	struct msi_desc *entry;
	int ret = 0;

	msi_lock_descs(&pdev->dev);
	msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) {
		irq_data = irq_get_irq_data(entry->irq);
		if (WARN_ON_ONCE(!irq_data)) {
			ret = -EINVAL;
			break;
		}

		hv_compose_msi_msg(irq_data, &entry->msg);
	}
	msi_unlock_descs(&pdev->dev);

	return ret;
}

/*
 * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
 * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg()
 * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping
 * Table entries.
 */
static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
{
	pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL);
}

static int hv_pci_resume(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	enum pci_protocol_version_t version[1];
	int ret;

	hbus->state = hv_pcibus_init;

	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
			 hv_pci_onchannelcallback, hbus);
	if (ret)
		return ret;

	/* Only use the version that was in use before hibernation. */
	version[0] = hbus->protocol_version;
	ret = hv_pci_protocol_negotiation(hdev, version, 1);
	if (ret)
		goto out;

	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto out;

	ret = hv_pci_enter_d0(hdev);
	if (ret)
		goto out;

	ret = hv_send_resources_allocated(hdev);
	if (ret)
		goto out;

	prepopulate_bars(hbus);

	hv_pci_restore_msi_state(hbus);

	hbus->state = hv_pcibus_installed;
	return 0;
out:
	vmbus_close(hdev->channel);
	return ret;
}

static const struct hv_vmbus_device_id hv_pci_id_table[] = {
	/* PCI Pass-through Class ID */
	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
	{ HV_PCIE_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);

static struct hv_driver hv_pci_drv = {
	.name		= "hv_pci",
	.id_table	= hv_pci_id_table,
	.probe		= hv_pci_probe,
	.remove		= hv_pci_remove,
	.suspend	= hv_pci_suspend,
	.resume		= hv_pci_resume,
};

static void __exit exit_hv_pci_drv(void)
{
	vmbus_driver_unregister(&hv_pci_drv);

	hvpci_block_ops.read_block = NULL;
	hvpci_block_ops.write_block = NULL;
	hvpci_block_ops.reg_blk_invalidate = NULL;
}

static int __init init_hv_pci_drv(void)
{
	int ret;

	if (!hv_is_hyperv_initialized())
		return -ENODEV;

	ret = hv_pci_irqchip_init();
	if (ret)
		return ret;

	/* Set the invalid domain number's bit, so it will not be used */
	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);

	/* Initialize PCI block r/w interface */
	hvpci_block_ops.read_block = hv_read_config_block;
	hvpci_block_ops.write_block = hv_write_config_block;
	hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;

	return vmbus_driver_register(&hv_pci_drv);
}

module_init(init_hv_pci_drv);
module_exit(exit_hv_pci_drv);

MODULE_DESCRIPTION("Hyper-V PCI");
MODULE_LICENSE("GPL v2");