1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) Microsoft Corporation. 4 * 5 * Author: 6 * Jake Oshins <jakeo@microsoft.com> 7 * 8 * This driver acts as a paravirtual front-end for PCI Express root buses. 9 * When a PCI Express function (either an entire device or an SR-IOV 10 * Virtual Function) is being passed through to the VM, this driver exposes 11 * a new bus to the guest VM. This is modeled as a root PCI bus because 12 * no bridges are being exposed to the VM. In fact, with a "Generation 2" 13 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM 14 * until a device has been exposed using this driver. 15 * 16 * Each root PCI bus has its own PCI domain, which is called "Segment" in 17 * the PCI Firmware Specifications. Thus while each device passed through 18 * to the VM using this front-end will appear at "device 0", the domain will 19 * be unique. Typically, each bus will have one PCI function on it, though 20 * this driver does support more than one. 21 * 22 * In order to map the interrupts from the device through to the guest VM, 23 * this driver also implements an IRQ Domain, which handles interrupts (either 24 * MSI or MSI-X) associated with the functions on the bus. As interrupts are 25 * set up, torn down, or reaffined, this driver communicates with the 26 * underlying hypervisor to adjust the mappings in the I/O MMU so that each 27 * interrupt will be delivered to the correct virtual processor at the right 28 * vector. This driver does not support level-triggered (line-based) 29 * interrupts, and will report that the Interrupt Line register in the 30 * function's configuration space is zero. 31 * 32 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V 33 * facilities. For instance, the configuration space of a function exposed 34 * by Hyper-V is mapped into a single page of memory space, and the 35 * read and write handlers for config space must be aware of this mechanism. 36 * Similarly, device setup and teardown involve messages sent to and from 37 * the PCI back-end driver in Hyper-V. 38 */ 39 40 #include <linux/kernel.h> 41 #include <linux/module.h> 42 #include <linux/pci.h> 43 #include <linux/pci-ecam.h> 44 #include <linux/delay.h> 45 #include <linux/semaphore.h> 46 #include <linux/irq.h> 47 #include <linux/msi.h> 48 #include <linux/hyperv.h> 49 #include <linux/refcount.h> 50 #include <linux/irqdomain.h> 51 #include <linux/acpi.h> 52 #include <asm/mshyperv.h> 53 54 /* 55 * Protocol versions. The low word is the minor version, the high word the 56 * major version. 57 */ 58 59 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor))) 60 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16) 61 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff) 62 63 enum pci_protocol_version_t { 64 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */ 65 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */ 66 PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */ 67 PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4), /* WS2022 */ 68 }; 69 70 #define CPU_AFFINITY_ALL -1ULL 71 72 /* 73 * Supported protocol versions in the order of probing - highest go 74 * first.
75 */ 76 static enum pci_protocol_version_t pci_protocol_versions[] = { 77 PCI_PROTOCOL_VERSION_1_4, 78 PCI_PROTOCOL_VERSION_1_3, 79 PCI_PROTOCOL_VERSION_1_2, 80 PCI_PROTOCOL_VERSION_1_1, 81 }; 82 83 #define PCI_CONFIG_MMIO_LENGTH 0x2000 84 #define CFG_PAGE_OFFSET 0x1000 85 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 86 87 #define MAX_SUPPORTED_MSI_MESSAGES 0x400 88 89 #define STATUS_REVISION_MISMATCH 0xC0000059 90 91 /* space for 32bit serial number as string */ 92 #define SLOT_NAME_SIZE 11 93 94 /* 95 * Message Types 96 */ 97 98 enum pci_message_type { 99 /* 100 * Version 1.1 101 */ 102 PCI_MESSAGE_BASE = 0x42490000, 103 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 104 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 105 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 106 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 107 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 108 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 109 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 110 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 111 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 112 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 113 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 114 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 115 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 116 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 117 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 118 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 119 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 120 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 121 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 122 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 123 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, 124 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, 125 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ 126 PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, 127 PCI_RESOURCES_ASSIGNED3 = PCI_MESSAGE_BASE + 0x1A, 128 PCI_CREATE_INTERRUPT_MESSAGE3 = PCI_MESSAGE_BASE + 0x1B, 129 PCI_MESSAGE_MAXIMUM 130 }; 131 132 /* 133 * Structures defining the virtual PCI Express protocol. 134 */ 135 136 union pci_version { 137 struct { 138 u16 minor_version; 139 u16 major_version; 140 } parts; 141 u32 version; 142 } __packed; 143 144 /* 145 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 146 * which is all this driver does. This representation is the one used in 147 * Windows, which is what is expected when sending this back and forth with 148 * the Hyper-V parent partition. 149 */ 150 union win_slot_encoding { 151 struct { 152 u32 dev:5; 153 u32 func:3; 154 u32 reserved:24; 155 } bits; 156 u32 slot; 157 } __packed; 158 159 /* 160 * Pretty much as defined in the PCI Specifications. 
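 * The fields below mirror the start of a standard Type 0 configuration header (vendor and device ID, revision, programming interface, sub-class and base class, subsystem ID), followed by the Windows slot encoding and a serial number (SLOT_NAME_SIZE above reserves room for that serial number formatted as a string).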
161 */ 162 struct pci_function_description { 163 u16 v_id; /* vendor ID */ 164 u16 d_id; /* device ID */ 165 u8 rev; 166 u8 prog_intf; 167 u8 subclass; 168 u8 base_class; 169 u32 subsystem_id; 170 union win_slot_encoding win_slot; 171 u32 ser; /* serial number */ 172 } __packed; 173 174 enum pci_device_description_flags { 175 HV_PCI_DEVICE_FLAG_NONE = 0x0, 176 HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1, 177 }; 178 179 struct pci_function_description2 { 180 u16 v_id; /* vendor ID */ 181 u16 d_id; /* device ID */ 182 u8 rev; 183 u8 prog_intf; 184 u8 subclass; 185 u8 base_class; 186 u32 subsystem_id; 187 union win_slot_encoding win_slot; 188 u32 ser; /* serial number */ 189 u32 flags; 190 u16 virtual_numa_node; 191 u16 reserved; 192 } __packed; 193 194 /** 195 * struct hv_msi_desc 196 * @vector: IDT entry 197 * @delivery_mode: As defined in Intel's Programmer's 198 * Reference Manual, Volume 3, Chapter 8. 199 * @vector_count: Number of contiguous entries in the 200 * Interrupt Descriptor Table that are 201 * occupied by this Message-Signaled 202 * Interrupt. For "MSI", as first defined 203 * in PCI 2.2, this can be between 1 and 204 * 32. For "MSI-X," as first defined in PCI 205 * 3.0, this must be 1, as each MSI-X table 206 * entry would have its own descriptor. 207 * @reserved: Empty space 208 * @cpu_mask: All the target virtual processors. 209 */ 210 struct hv_msi_desc { 211 u8 vector; 212 u8 delivery_mode; 213 u16 vector_count; 214 u32 reserved; 215 u64 cpu_mask; 216 } __packed; 217 218 /** 219 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc 220 * @vector: IDT entry 221 * @delivery_mode: As defined in Intel's Programmer's 222 * Reference Manual, Volume 3, Chapter 8. 223 * @vector_count: Number of contiguous entries in the 224 * Interrupt Descriptor Table that are 225 * occupied by this Message-Signaled 226 * Interrupt. For "MSI", as first defined 227 * in PCI 2.2, this can be between 1 and 228 * 32. For "MSI-X," as first defined in PCI 229 * 3.0, this must be 1, as each MSI-X table 230 * entry would have its own descriptor. 231 * @processor_count: number of bits enabled in array. 232 * @processor_array: All the target virtual processors. 233 */ 234 struct hv_msi_desc2 { 235 u8 vector; 236 u8 delivery_mode; 237 u16 vector_count; 238 u16 processor_count; 239 u16 processor_array[32]; 240 } __packed; 241 242 /* 243 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc 244 * Everything is the same as in 'hv_msi_desc2' except that the size of the 245 * 'vector' field is larger to support bigger vector values. For ex: LPI 246 * vectors on ARM. 247 */ 248 struct hv_msi_desc3 { 249 u32 vector; 250 u8 delivery_mode; 251 u8 reserved; 252 u16 vector_count; 253 u16 processor_count; 254 u16 processor_array[32]; 255 } __packed; 256 257 /** 258 * struct tran_int_desc 259 * @reserved: unused, padding 260 * @vector_count: same as in hv_msi_desc 261 * @data: This is the "data payload" value that is 262 * written by the device when it generates 263 * a message-signaled interrupt, either MSI 264 * or MSI-X. 265 * @address: This is the address to which the data 266 * payload is written on interrupt 267 * generation. 268 */ 269 struct tran_int_desc { 270 u16 reserved; 271 u16 vector_count; 272 u32 data; 273 u64 address; 274 } __packed; 275 276 /* 277 * A generic message format for virtual PCI. 278 * Specific message formats are defined later in the file. 
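 * A request is typically built as a struct pci_packet immediately followed by one of the message structures below: the sender fills in completion_func and compl_ctxt, transmits the message with vmbus_sendpacket() using the packet's address as the transaction ID, and the host's reply (a struct pci_response or a type-specific response) is handed back to completion_func. hv_read_config_block() later in this file is a representative example of the pattern.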
279 */ 280 281 struct pci_message { 282 u32 type; 283 } __packed; 284 285 struct pci_child_message { 286 struct pci_message message_type; 287 union win_slot_encoding wslot; 288 } __packed; 289 290 struct pci_incoming_message { 291 struct vmpacket_descriptor hdr; 292 struct pci_message message_type; 293 } __packed; 294 295 struct pci_response { 296 struct vmpacket_descriptor hdr; 297 s32 status; /* negative values are failures */ 298 } __packed; 299 300 struct pci_packet { 301 void (*completion_func)(void *context, struct pci_response *resp, 302 int resp_packet_size); 303 void *compl_ctxt; 304 305 struct pci_message message[]; 306 }; 307 308 /* 309 * Specific message types supporting the PCI protocol. 310 */ 311 312 /* 313 * Version negotiation message. Sent from the guest to the host. 314 * The guest is free to try different versions until the host 315 * accepts the version. 316 * 317 * pci_version: The protocol version requested. 318 * is_last_attempt: If TRUE, this is the last version guest will request. 319 * reservedz: Reserved field, set to zero. 320 */ 321 322 struct pci_version_request { 323 struct pci_message message_type; 324 u32 protocol_version; 325 } __packed; 326 327 /* 328 * Bus D0 Entry. This is sent from the guest to the host when the virtual 329 * bus (PCI Express port) is ready for action. 330 */ 331 332 struct pci_bus_d0_entry { 333 struct pci_message message_type; 334 u32 reserved; 335 u64 mmio_base; 336 } __packed; 337 338 struct pci_bus_relations { 339 struct pci_incoming_message incoming; 340 u32 device_count; 341 struct pci_function_description func[]; 342 } __packed; 343 344 struct pci_bus_relations2 { 345 struct pci_incoming_message incoming; 346 u32 device_count; 347 struct pci_function_description2 func[]; 348 } __packed; 349 350 struct pci_q_res_req_response { 351 struct vmpacket_descriptor hdr; 352 s32 status; /* negative values are failures */ 353 u32 probed_bar[PCI_STD_NUM_BARS]; 354 } __packed; 355 356 struct pci_set_power { 357 struct pci_message message_type; 358 union win_slot_encoding wslot; 359 u32 power_state; /* In Windows terms */ 360 u32 reserved; 361 } __packed; 362 363 struct pci_set_power_response { 364 struct vmpacket_descriptor hdr; 365 s32 status; /* negative values are failures */ 366 union win_slot_encoding wslot; 367 u32 resultant_state; /* In Windows terms */ 368 u32 reserved; 369 } __packed; 370 371 struct pci_resources_assigned { 372 struct pci_message message_type; 373 union win_slot_encoding wslot; 374 u8 memory_range[0x14][6]; /* not used here */ 375 u32 msi_descriptors; 376 u32 reserved[4]; 377 } __packed; 378 379 struct pci_resources_assigned2 { 380 struct pci_message message_type; 381 union win_slot_encoding wslot; 382 u8 memory_range[0x14][6]; /* not used here */ 383 u32 msi_descriptor_count; 384 u8 reserved[70]; 385 } __packed; 386 387 struct pci_create_interrupt { 388 struct pci_message message_type; 389 union win_slot_encoding wslot; 390 struct hv_msi_desc int_desc; 391 } __packed; 392 393 struct pci_create_int_response { 394 struct pci_response response; 395 u32 reserved; 396 struct tran_int_desc int_desc; 397 } __packed; 398 399 struct pci_create_interrupt2 { 400 struct pci_message message_type; 401 union win_slot_encoding wslot; 402 struct hv_msi_desc2 int_desc; 403 } __packed; 404 405 struct pci_create_interrupt3 { 406 struct pci_message message_type; 407 union win_slot_encoding wslot; 408 struct hv_msi_desc3 int_desc; 409 } __packed; 410 411 struct pci_delete_interrupt { 412 struct pci_message message_type; 413 union 
win_slot_encoding wslot; 414 struct tran_int_desc int_desc; 415 } __packed; 416 417 /* 418 * Note: the VM must pass a valid block id, wslot and bytes_requested. 419 */ 420 struct pci_read_block { 421 struct pci_message message_type; 422 u32 block_id; 423 union win_slot_encoding wslot; 424 u32 bytes_requested; 425 } __packed; 426 427 struct pci_read_block_response { 428 struct vmpacket_descriptor hdr; 429 u32 status; 430 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 431 } __packed; 432 433 /* 434 * Note: the VM must pass a valid block id, wslot and byte_count. 435 */ 436 struct pci_write_block { 437 struct pci_message message_type; 438 u32 block_id; 439 union win_slot_encoding wslot; 440 u32 byte_count; 441 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 442 } __packed; 443 444 struct pci_dev_inval_block { 445 struct pci_incoming_message incoming; 446 union win_slot_encoding wslot; 447 u64 block_mask; 448 } __packed; 449 450 struct pci_dev_incoming { 451 struct pci_incoming_message incoming; 452 union win_slot_encoding wslot; 453 } __packed; 454 455 struct pci_eject_response { 456 struct pci_message message_type; 457 union win_slot_encoding wslot; 458 u32 status; 459 } __packed; 460 461 static int pci_ring_size = (4 * PAGE_SIZE); 462 463 /* 464 * Driver specific state. 465 */ 466 467 enum hv_pcibus_state { 468 hv_pcibus_init = 0, 469 hv_pcibus_probed, 470 hv_pcibus_installed, 471 hv_pcibus_removing, 472 hv_pcibus_maximum 473 }; 474 475 struct hv_pcibus_device { 476 #ifdef CONFIG_X86 477 struct pci_sysdata sysdata; 478 #elif defined(CONFIG_ARM64) 479 struct pci_config_window sysdata; 480 #endif 481 struct pci_host_bridge *bridge; 482 struct fwnode_handle *fwnode; 483 /* Protocol version negotiated with the host */ 484 enum pci_protocol_version_t protocol_version; 485 enum hv_pcibus_state state; 486 struct hv_device *hdev; 487 resource_size_t low_mmio_space; 488 resource_size_t high_mmio_space; 489 struct resource *mem_config; 490 struct resource *low_mmio_res; 491 struct resource *high_mmio_res; 492 struct completion *survey_event; 493 struct pci_bus *pci_bus; 494 spinlock_t config_lock; /* Avoid two threads writing index page */ 495 spinlock_t device_list_lock; /* Protect lists below */ 496 void __iomem *cfg_addr; 497 498 struct list_head children; 499 struct list_head dr_list; 500 501 struct msi_domain_info msi_info; 502 struct irq_domain *irq_domain; 503 504 spinlock_t retarget_msi_interrupt_lock; 505 506 struct workqueue_struct *wq; 507 508 /* Highest slot of child device with resources allocated */ 509 int wslot_res_allocated; 510 511 /* hypercall arg, must not cross page boundary */ 512 struct hv_retarget_device_interrupt retarget_msi_interrupt_params; 513 514 /* 515 * Don't put anything here: retarget_msi_interrupt_params must be last 516 */ 517 }; 518 519 /* 520 * Tracks "Device Relations" messages from the host, which must be both 521 * processed in order and deferred so that they don't run in the context 522 * of the incoming packet callback. 
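 * Each such message is captured in a struct hv_dr_state queued on hbus->dr_list, and a struct hv_dr_work item queued on hbus->wq processes that list in order from workqueue context.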
523 */ 524 struct hv_dr_work { 525 struct work_struct wrk; 526 struct hv_pcibus_device *bus; 527 }; 528 529 struct hv_pcidev_description { 530 u16 v_id; /* vendor ID */ 531 u16 d_id; /* device ID */ 532 u8 rev; 533 u8 prog_intf; 534 u8 subclass; 535 u8 base_class; 536 u32 subsystem_id; 537 union win_slot_encoding win_slot; 538 u32 ser; /* serial number */ 539 u32 flags; 540 u16 virtual_numa_node; 541 }; 542 543 struct hv_dr_state { 544 struct list_head list_entry; 545 u32 device_count; 546 struct hv_pcidev_description func[]; 547 }; 548 549 enum hv_pcichild_state { 550 hv_pcichild_init = 0, 551 hv_pcichild_requirements, 552 hv_pcichild_resourced, 553 hv_pcichild_ejecting, 554 hv_pcichild_maximum 555 }; 556 557 struct hv_pci_dev { 558 /* List protected by pci_rescan_remove_lock */ 559 struct list_head list_entry; 560 refcount_t refs; 561 enum hv_pcichild_state state; 562 struct pci_slot *pci_slot; 563 struct hv_pcidev_description desc; 564 bool reported_missing; 565 struct hv_pcibus_device *hbus; 566 struct work_struct wrk; 567 568 void (*block_invalidate)(void *context, u64 block_mask); 569 void *invalidate_context; 570 571 /* 572 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then 573 * read it back, for each of the BAR offsets within config space. 574 */ 575 u32 probed_bar[PCI_STD_NUM_BARS]; 576 }; 577 578 struct hv_pci_compl { 579 struct completion host_event; 580 s32 completion_status; 581 }; 582 583 static void hv_pci_onchannelcallback(void *context); 584 585 #ifdef CONFIG_X86 586 #define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED 587 #define FLOW_HANDLER handle_edge_irq 588 #define FLOW_NAME "edge" 589 590 static int hv_pci_irqchip_init(void) 591 { 592 return 0; 593 } 594 595 static struct irq_domain *hv_pci_get_root_domain(void) 596 { 597 return x86_vector_domain; 598 } 599 600 static unsigned int hv_msi_get_int_vector(struct irq_data *data) 601 { 602 struct irq_cfg *cfg = irqd_cfg(data); 603 604 return cfg->vector; 605 } 606 607 static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, 608 struct msi_desc *msi_desc) 609 { 610 msi_entry->address.as_uint32 = msi_desc->msg.address_lo; 611 msi_entry->data.as_uint32 = msi_desc->msg.data; 612 } 613 614 static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, 615 int nvec, msi_alloc_info_t *info) 616 { 617 return pci_msi_prepare(domain, dev, nvec, info); 618 } 619 620 /** 621 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current 622 * affinity. 623 * @data: Describes the IRQ 624 * 625 * Build a new destination for the MSI and make a hypercall to 626 * update the Interrupt Redirection Table. "Device Logical ID" 627 * is built out of this PCI bus's instance GUID and the function 628 * number of the device.
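 * For example, with illustrative instance GUID bytes b[4] = 0x34, b[5] = 0x12, b[6] = 0xa8, b[7] = 0x56 and PCI function 2, the device ID below works out to (0x12 << 24) | (0x34 << 16) | (0x56 << 8) | (0xa8 & 0xf8) | 2 = 0x123456aa.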
629 */ 630 static void hv_arch_irq_unmask(struct irq_data *data) 631 { 632 struct msi_desc *msi_desc = irq_data_get_msi_desc(data); 633 struct hv_retarget_device_interrupt *params; 634 struct hv_pcibus_device *hbus; 635 struct cpumask *dest; 636 cpumask_var_t tmp; 637 struct pci_bus *pbus; 638 struct pci_dev *pdev; 639 unsigned long flags; 640 u32 var_size = 0; 641 int cpu, nr_bank; 642 u64 res; 643 644 dest = irq_data_get_effective_affinity_mask(data); 645 pdev = msi_desc_to_pci_dev(msi_desc); 646 pbus = pdev->bus; 647 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 648 649 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); 650 651 params = &hbus->retarget_msi_interrupt_params; 652 memset(params, 0, sizeof(*params)); 653 params->partition_id = HV_PARTITION_ID_SELF; 654 params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; 655 hv_set_msi_entry_from_desc(&params->int_entry.msi_entry, msi_desc); 656 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | 657 (hbus->hdev->dev_instance.b[4] << 16) | 658 (hbus->hdev->dev_instance.b[7] << 8) | 659 (hbus->hdev->dev_instance.b[6] & 0xf8) | 660 PCI_FUNC(pdev->devfn); 661 params->int_target.vector = hv_msi_get_int_vector(data); 662 663 /* 664 * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by 665 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a 666 * spurious interrupt storm. Not doing so does not seem to have a 667 * negative effect (yet?). 668 */ 669 670 if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { 671 /* 672 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the 673 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides 674 * with >64 VP support. 675 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED 676 * is not sufficient for this hypercall. 677 */ 678 params->int_target.flags |= 679 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; 680 681 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { 682 res = 1; 683 goto exit_unlock; 684 } 685 686 cpumask_and(tmp, dest, cpu_online_mask); 687 nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp); 688 free_cpumask_var(tmp); 689 690 if (nr_bank <= 0) { 691 res = 1; 692 goto exit_unlock; 693 } 694 695 /* 696 * var-sized hypercall, var-size starts after vp_mask (thus 697 * vp_set.format does not count, but vp_set.valid_bank_mask 698 * does). 699 */ 700 var_size = 1 + nr_bank; 701 } else { 702 for_each_cpu_and(cpu, dest, cpu_online_mask) { 703 params->int_target.vp_mask |= 704 (1ULL << hv_cpu_number_to_vp_number(cpu)); 705 } 706 } 707 708 res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17), 709 params, NULL); 710 711 exit_unlock: 712 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags); 713 714 /* 715 * During hibernation, when a CPU is offlined, the kernel tries 716 * to move the interrupt to the remaining CPUs that haven't 717 * been offlined yet. In this case, the hv_do_hypercall() above 718 * always fails since the vmbus channel has been closed: 719 * refer to cpu_disable_common() -> fixup_irqs() -> 720 * irq_migrate_all_off_this_cpu() -> migrate_one_irq(). 721 * 722 * Suppress the error message for hibernation because the failure 723 * during hibernation does not matter (at this time all the devices 724 * have been frozen).
Note: the correct affinity info is still updated 725 * into the irqdata data structure in migrate_one_irq() -> 726 * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM 727 * resumes, hv_pci_restore_msi_state() is able to correctly restore 728 * the interrupt with the correct affinity. 729 */ 730 if (!hv_result_success(res) && hbus->state != hv_pcibus_removing) 731 dev_err(&hbus->hdev->device, 732 "%s() failed: %#llx", __func__, res); 733 } 734 #elif defined(CONFIG_ARM64) 735 /* 736 * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit 737 * of room at the start to allow for SPIs to be specified through ACPI and 738 * starting with a power of two to satisfy power of 2 multi-MSI requirement. 739 */ 740 #define HV_PCI_MSI_SPI_START 64 741 #define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START) 742 #define DELIVERY_MODE 0 743 #define FLOW_HANDLER NULL 744 #define FLOW_NAME NULL 745 #define hv_msi_prepare NULL 746 747 struct hv_pci_chip_data { 748 DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR); 749 struct mutex map_lock; 750 }; 751 752 /* Hyper-V vPCI MSI GIC IRQ domain */ 753 static struct irq_domain *hv_msi_gic_irq_domain; 754 755 /* Hyper-V PCI MSI IRQ chip */ 756 static struct irq_chip hv_arm64_msi_irq_chip = { 757 .name = "MSI", 758 .irq_set_affinity = irq_chip_set_affinity_parent, 759 .irq_eoi = irq_chip_eoi_parent, 760 .irq_mask = irq_chip_mask_parent, 761 .irq_unmask = irq_chip_unmask_parent 762 }; 763 764 static unsigned int hv_msi_get_int_vector(struct irq_data *irqd) 765 { 766 return irqd->parent_data->hwirq; 767 } 768 769 static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, 770 struct msi_desc *msi_desc) 771 { 772 msi_entry->address = ((u64)msi_desc->msg.address_hi << 32) | 773 msi_desc->msg.address_lo; 774 msi_entry->data = msi_desc->msg.data; 775 } 776 777 /* 778 * @nr_bm_irqs: Indicates the number of IRQs that were allocated from 779 * the bitmap. 780 * @nr_dom_irqs: Indicates the number of IRQs that were allocated from 781 * the parent domain. 
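 * The two counts differ when a multi-vector allocation fails part way through: the error path in hv_pci_vec_irq_domain_alloc() releases the whole bitmap region it reserved but only the IRQs it had already allocated from the parent domain.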
782 */ 783 static void hv_pci_vec_irq_free(struct irq_domain *domain, 784 unsigned int virq, 785 unsigned int nr_bm_irqs, 786 unsigned int nr_dom_irqs) 787 { 788 struct hv_pci_chip_data *chip_data = domain->host_data; 789 struct irq_data *d = irq_domain_get_irq_data(domain, virq); 790 int first = d->hwirq - HV_PCI_MSI_SPI_START; 791 int i; 792 793 mutex_lock(&chip_data->map_lock); 794 bitmap_release_region(chip_data->spi_map, 795 first, 796 get_count_order(nr_bm_irqs)); 797 mutex_unlock(&chip_data->map_lock); 798 for (i = 0; i < nr_dom_irqs; i++) { 799 if (i) 800 d = irq_domain_get_irq_data(domain, virq + i); 801 irq_domain_reset_irq_data(d); 802 } 803 804 irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs); 805 } 806 807 static void hv_pci_vec_irq_domain_free(struct irq_domain *domain, 808 unsigned int virq, 809 unsigned int nr_irqs) 810 { 811 hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs); 812 } 813 814 static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain, 815 unsigned int nr_irqs, 816 irq_hw_number_t *hwirq) 817 { 818 struct hv_pci_chip_data *chip_data = domain->host_data; 819 int index; 820 821 /* Find and allocate region from the SPI bitmap */ 822 mutex_lock(&chip_data->map_lock); 823 index = bitmap_find_free_region(chip_data->spi_map, 824 HV_PCI_MSI_SPI_NR, 825 get_count_order(nr_irqs)); 826 mutex_unlock(&chip_data->map_lock); 827 if (index < 0) 828 return -ENOSPC; 829 830 *hwirq = index + HV_PCI_MSI_SPI_START; 831 832 return 0; 833 } 834 835 static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain, 836 unsigned int virq, 837 irq_hw_number_t hwirq) 838 { 839 struct irq_fwspec fwspec; 840 struct irq_data *d; 841 int ret; 842 843 fwspec.fwnode = domain->parent->fwnode; 844 fwspec.param_count = 2; 845 fwspec.param[0] = hwirq; 846 fwspec.param[1] = IRQ_TYPE_EDGE_RISING; 847 848 ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); 849 if (ret) 850 return ret; 851 852 /* 853 * Since the interrupt specifier is not coming from ACPI or DT, the 854 * trigger type will need to be set explicitly. Otherwise, it will be 855 * set to whatever is in the GIC configuration. 856 */ 857 d = irq_domain_get_irq_data(domain->parent, virq); 858 859 return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); 860 } 861 862 static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain, 863 unsigned int virq, unsigned int nr_irqs, 864 void *args) 865 { 866 irq_hw_number_t hwirq; 867 unsigned int i; 868 int ret; 869 870 ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq); 871 if (ret) 872 return ret; 873 874 for (i = 0; i < nr_irqs; i++) { 875 ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i, 876 hwirq + i); 877 if (ret) { 878 hv_pci_vec_irq_free(domain, virq, nr_irqs, i); 879 return ret; 880 } 881 882 irq_domain_set_hwirq_and_chip(domain, virq + i, 883 hwirq + i, 884 &hv_arm64_msi_irq_chip, 885 domain->host_data); 886 pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i); 887 } 888 889 return 0; 890 } 891 892 /* 893 * Pick the first cpu as the irq affinity that can be temporarily used for 894 * composing MSI from the hypervisor. GIC will eventually set the right 895 * affinity for the irq and the 'unmask' will retarget the interrupt to that 896 * cpu. 
897 */ 898 static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain, 899 struct irq_data *irqd, bool reserve) 900 { 901 int cpu = cpumask_first(cpu_present_mask); 902 903 irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); 904 905 return 0; 906 } 907 908 static const struct irq_domain_ops hv_pci_domain_ops = { 909 .alloc = hv_pci_vec_irq_domain_alloc, 910 .free = hv_pci_vec_irq_domain_free, 911 .activate = hv_pci_vec_irq_domain_activate, 912 }; 913 914 static int hv_pci_irqchip_init(void) 915 { 916 static struct hv_pci_chip_data *chip_data; 917 struct fwnode_handle *fn = NULL; 918 int ret = -ENOMEM; 919 920 chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL); 921 if (!chip_data) 922 return ret; 923 924 mutex_init(&chip_data->map_lock); 925 fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64"); 926 if (!fn) 927 goto free_chip; 928 929 /* 930 * Once enabled, the IRQ domain should not be removed, since there is no 931 * way to ensure that all the corresponding devices are also gone and 932 * no interrupts will be generated. 933 */ 934 hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR, 935 fn, &hv_pci_domain_ops, 936 chip_data); 937 938 if (!hv_msi_gic_irq_domain) { 939 pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n"); 940 goto free_chip; 941 } 942 943 return 0; 944 945 free_chip: 946 kfree(chip_data); 947 if (fn) 948 irq_domain_free_fwnode(fn); 949 950 return ret; 951 } 952 953 static struct irq_domain *hv_pci_get_root_domain(void) 954 { 955 return hv_msi_gic_irq_domain; 956 } 957 958 /* 959 * SPIs are used for interrupts of PCI devices and SPIs are managed via GICD 960 * registers which Hyper-V already supports, so no hypercall is needed. 961 */ 962 static void hv_arch_irq_unmask(struct irq_data *data) { } 963 #endif /* CONFIG_ARM64 */ 964 965 /** 966 * hv_pci_generic_compl() - Invoked for a completion packet 967 * @context: Set up by the sender of the packet. 968 * @resp: The response packet 969 * @resp_packet_size: Size in bytes of the packet 970 * 971 * This function is used to trigger an event and report status 972 * for any message for which the completion packet contains a 973 * status and nothing else. 974 */ 975 static void hv_pci_generic_compl(void *context, struct pci_response *resp, 976 int resp_packet_size) 977 { 978 struct hv_pci_compl *comp_pkt = context; 979 980 if (resp_packet_size >= offsetofend(struct pci_response, status)) 981 comp_pkt->completion_status = resp->status; 982 else 983 comp_pkt->completion_status = -1; 984 985 complete(&comp_pkt->host_event); 986 } 987 988 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 989 u32 wslot); 990 991 static void get_pcichild(struct hv_pci_dev *hpdev) 992 { 993 refcount_inc(&hpdev->refs); 994 } 995 996 static void put_pcichild(struct hv_pci_dev *hpdev) 997 { 998 if (refcount_dec_and_test(&hpdev->refs)) 999 kfree(hpdev); 1000 } 1001 1002 /* 1003 * There is no good way to get notified from vmbus_onoffer_rescind(), 1004 * so let's use polling here, since this is not a hot path.
1005 */ 1006 static int wait_for_response(struct hv_device *hdev, 1007 struct completion *comp) 1008 { 1009 while (true) { 1010 if (hdev->channel->rescind) { 1011 dev_warn_once(&hdev->device, "The device is gone.\n"); 1012 return -ENODEV; 1013 } 1014 1015 if (wait_for_completion_timeout(comp, HZ / 10)) 1016 break; 1017 } 1018 1019 return 0; 1020 } 1021 1022 /** 1023 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 1024 * @devfn: The Linux representation of PCI slot 1025 * 1026 * Windows uses a slightly different representation of PCI slot. 1027 * 1028 * Return: The Windows representation 1029 */ 1030 static u32 devfn_to_wslot(int devfn) 1031 { 1032 union win_slot_encoding wslot; 1033 1034 wslot.slot = 0; 1035 wslot.bits.dev = PCI_SLOT(devfn); 1036 wslot.bits.func = PCI_FUNC(devfn); 1037 1038 return wslot.slot; 1039 } 1040 1041 /** 1042 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 1043 * @wslot: The Windows representation of PCI slot 1044 * 1045 * Windows uses a slightly different representation of PCI slot. 1046 * 1047 * Return: The Linux representation 1048 */ 1049 static int wslot_to_devfn(u32 wslot) 1050 { 1051 union win_slot_encoding slot_no; 1052 1053 slot_no.slot = wslot; 1054 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func); 1055 } 1056 1057 /* 1058 * PCI Configuration Space for these root PCI buses is implemented as a pair 1059 * of pages in memory-mapped I/O space. Writing to the first page chooses 1060 * the PCI function being written or read. Once the first page has been 1061 * written to, the following page maps in the entire configuration space of 1062 * the function. 1063 */ 1064 1065 /** 1066 * _hv_pcifront_read_config() - Internal PCI config read 1067 * @hpdev: The PCI driver's representation of the device 1068 * @where: Offset within config space 1069 * @size: Size of the transfer 1070 * @val: Pointer to the buffer receiving the data 1071 */ 1072 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 1073 int size, u32 *val) 1074 { 1075 unsigned long flags; 1076 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 1077 1078 /* 1079 * If the attempt is to read the IDs or the ROM BAR, simulate that. 1080 */ 1081 if (where + size <= PCI_COMMAND) { 1082 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 1083 } else if (where >= PCI_CLASS_REVISION && where + size <= 1084 PCI_CACHE_LINE_SIZE) { 1085 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 1086 PCI_CLASS_REVISION, size); 1087 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 1088 PCI_ROM_ADDRESS) { 1089 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 1090 PCI_SUBSYSTEM_VENDOR_ID, size); 1091 } else if (where >= PCI_ROM_ADDRESS && where + size <= 1092 PCI_CAPABILITY_LIST) { 1093 /* ROM BARs are unimplemented */ 1094 *val = 0; 1095 } else if (where >= PCI_INTERRUPT_LINE && where + size <= 1096 PCI_INTERRUPT_PIN) { 1097 /* 1098 * Interrupt Line and Interrupt PIN are hard-wired to zero 1099 * because this front-end only supports message-signaled 1100 * interrupts. 1101 */ 1102 *val = 0; 1103 } else if (where + size <= CFG_PAGE_SIZE) { 1104 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1105 /* Choose the function to be read. (See comment above) */ 1106 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1107 /* Make sure the function was chosen before we start reading. */ 1108 mb(); 1109 /* Read from that function's config space. 
*/ 1110 switch (size) { 1111 case 1: 1112 *val = readb(addr); 1113 break; 1114 case 2: 1115 *val = readw(addr); 1116 break; 1117 default: 1118 *val = readl(addr); 1119 break; 1120 } 1121 /* 1122 * Make sure the read was done before we release the spinlock 1123 * allowing consecutive reads/writes. 1124 */ 1125 mb(); 1126 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1127 } else { 1128 dev_err(&hpdev->hbus->hdev->device, 1129 "Attempt to read beyond a function's config space.\n"); 1130 } 1131 } 1132 1133 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) 1134 { 1135 u16 ret; 1136 unsigned long flags; 1137 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + 1138 PCI_VENDOR_ID; 1139 1140 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1141 1142 /* Choose the function to be read. (See comment above) */ 1143 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1144 /* Make sure the function was chosen before we start reading. */ 1145 mb(); 1146 /* Read from that function's config space. */ 1147 ret = readw(addr); 1148 /* 1149 * mb() is not required here, because the spin_unlock_irqrestore() 1150 * is a barrier. 1151 */ 1152 1153 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1154 1155 return ret; 1156 } 1157 1158 /** 1159 * _hv_pcifront_write_config() - Internal PCI config write 1160 * @hpdev: The PCI driver's representation of the device 1161 * @where: Offset within config space 1162 * @size: Size of the transfer 1163 * @val: The data being transferred 1164 */ 1165 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 1166 int size, u32 val) 1167 { 1168 unsigned long flags; 1169 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 1170 1171 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 1172 where + size <= PCI_CAPABILITY_LIST) { 1173 /* SSIDs and ROM BARs are read-only */ 1174 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 1175 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1176 /* Choose the function to be written. (See comment above) */ 1177 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1178 /* Make sure the function was chosen before we start writing. */ 1179 wmb(); 1180 /* Write to that function's config space. */ 1181 switch (size) { 1182 case 1: 1183 writeb(val, addr); 1184 break; 1185 case 2: 1186 writew(val, addr); 1187 break; 1188 default: 1189 writel(val, addr); 1190 break; 1191 } 1192 /* 1193 * Make sure the write was done before we release the spinlock 1194 * allowing consecutive reads/writes. 
1195 */ 1196 mb(); 1197 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1198 } else { 1199 dev_err(&hpdev->hbus->hdev->device, 1200 "Attempt to write beyond a function's config space.\n"); 1201 } 1202 } 1203 1204 /** 1205 * hv_pcifront_read_config() - Read configuration space 1206 * @bus: PCI Bus structure 1207 * @devfn: Device/function 1208 * @where: Offset from base 1209 * @size: Byte/word/dword 1210 * @val: Value to be read 1211 * 1212 * Return: PCIBIOS_SUCCESSFUL on success 1213 * PCIBIOS_DEVICE_NOT_FOUND on failure 1214 */ 1215 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn, 1216 int where, int size, u32 *val) 1217 { 1218 struct hv_pcibus_device *hbus = 1219 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1220 struct hv_pci_dev *hpdev; 1221 1222 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1223 if (!hpdev) 1224 return PCIBIOS_DEVICE_NOT_FOUND; 1225 1226 _hv_pcifront_read_config(hpdev, where, size, val); 1227 1228 put_pcichild(hpdev); 1229 return PCIBIOS_SUCCESSFUL; 1230 } 1231 1232 /** 1233 * hv_pcifront_write_config() - Write configuration space 1234 * @bus: PCI Bus structure 1235 * @devfn: Device/function 1236 * @where: Offset from base 1237 * @size: Byte/word/dword 1238 * @val: Value to be written to device 1239 * 1240 * Return: PCIBIOS_SUCCESSFUL on success 1241 * PCIBIOS_DEVICE_NOT_FOUND on failure 1242 */ 1243 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn, 1244 int where, int size, u32 val) 1245 { 1246 struct hv_pcibus_device *hbus = 1247 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1248 struct hv_pci_dev *hpdev; 1249 1250 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1251 if (!hpdev) 1252 return PCIBIOS_DEVICE_NOT_FOUND; 1253 1254 _hv_pcifront_write_config(hpdev, where, size, val); 1255 1256 put_pcichild(hpdev); 1257 return PCIBIOS_SUCCESSFUL; 1258 } 1259 1260 /* PCIe operations */ 1261 static struct pci_ops hv_pcifront_ops = { 1262 .read = hv_pcifront_read_config, 1263 .write = hv_pcifront_write_config, 1264 }; 1265 1266 /* 1267 * Paravirtual backchannel 1268 * 1269 * Hyper-V SR-IOV provides a backchannel mechanism in software for 1270 * communication between a VF driver and a PF driver. These 1271 * "configuration blocks" are similar in concept to PCI configuration space, 1272 * but instead of doing reads and writes in 32-bit chunks through a very slow 1273 * path, packets of up to 128 bytes can be sent or received asynchronously. 1274 * 1275 * Nearly every SR-IOV device contains just such a communications channel in 1276 * hardware, so using this one in software is usually optional. Using the 1277 * software channel, however, allows driver implementers to leverage software 1278 * tools that fuzz the communications channel looking for vulnerabilities. 1279 * 1280 * The usage model for these packets puts the responsibility for reading or 1281 * writing on the VF driver. The VF driver sends a read or a write packet, 1282 * indicating which "block" is being referred to by number. 1283 * 1284 * If the PF driver wishes to initiate communication, it can "invalidate" one or 1285 * more of the first 64 blocks. This invalidation is delivered via a callback 1286 * supplied by the VF driver to this driver. 1287 * 1288 * No protocol is implied, except that supplied by the PF and VF drivers.
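 * As an illustrative sketch only (no particular protocol is mandated): a VF driver could register an invalidation callback with hv_register_block_invalidate(), post a request into one of the blocks with hv_write_config_block(), and, when the PF driver later invalidates that block, see the corresponding bit set in block_mask and fetch the reply with hv_read_config_block().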
1289 */ 1290 1291 struct hv_read_config_compl { 1292 struct hv_pci_compl comp_pkt; 1293 void *buf; 1294 unsigned int len; 1295 unsigned int bytes_returned; 1296 }; 1297 1298 /** 1299 * hv_pci_read_config_compl() - Invoked when a response packet 1300 * for a read config block operation arrives. 1301 * @context: Identifies the read config operation 1302 * @resp: The response packet itself 1303 * @resp_packet_size: Size in bytes of the response packet 1304 */ 1305 static void hv_pci_read_config_compl(void *context, struct pci_response *resp, 1306 int resp_packet_size) 1307 { 1308 struct hv_read_config_compl *comp = context; 1309 struct pci_read_block_response *read_resp = 1310 (struct pci_read_block_response *)resp; 1311 unsigned int data_len, hdr_len; 1312 1313 hdr_len = offsetof(struct pci_read_block_response, bytes); 1314 if (resp_packet_size < hdr_len) { 1315 comp->comp_pkt.completion_status = -1; 1316 goto out; 1317 } 1318 1319 data_len = resp_packet_size - hdr_len; 1320 if (data_len > 0 && read_resp->status == 0) { 1321 comp->bytes_returned = min(comp->len, data_len); 1322 memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); 1323 } else { 1324 comp->bytes_returned = 0; 1325 } 1326 1327 comp->comp_pkt.completion_status = read_resp->status; 1328 out: 1329 complete(&comp->comp_pkt.host_event); 1330 } 1331 1332 /** 1333 * hv_read_config_block() - Sends a read config block request to 1334 * the back-end driver running in the Hyper-V parent partition. 1335 * @pdev: The PCI driver's representation for this device. 1336 * @buf: Buffer into which the config block will be copied. 1337 * @len: Size in bytes of buf. 1338 * @block_id: Identifies the config block which has been requested. 1339 * @bytes_returned: Size which came back from the back-end driver. 
1340 * 1341 * Return: 0 on success, -errno on failure 1342 */ 1343 static int hv_read_config_block(struct pci_dev *pdev, void *buf, 1344 unsigned int len, unsigned int block_id, 1345 unsigned int *bytes_returned) 1346 { 1347 struct hv_pcibus_device *hbus = 1348 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1349 sysdata); 1350 struct { 1351 struct pci_packet pkt; 1352 char buf[sizeof(struct pci_read_block)]; 1353 } pkt; 1354 struct hv_read_config_compl comp_pkt; 1355 struct pci_read_block *read_blk; 1356 int ret; 1357 1358 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1359 return -EINVAL; 1360 1361 init_completion(&comp_pkt.comp_pkt.host_event); 1362 comp_pkt.buf = buf; 1363 comp_pkt.len = len; 1364 1365 memset(&pkt, 0, sizeof(pkt)); 1366 pkt.pkt.completion_func = hv_pci_read_config_compl; 1367 pkt.pkt.compl_ctxt = &comp_pkt; 1368 read_blk = (struct pci_read_block *)&pkt.pkt.message; 1369 read_blk->message_type.type = PCI_READ_BLOCK; 1370 read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1371 read_blk->block_id = block_id; 1372 read_blk->bytes_requested = len; 1373 1374 ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, 1375 sizeof(*read_blk), (unsigned long)&pkt.pkt, 1376 VM_PKT_DATA_INBAND, 1377 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1378 if (ret) 1379 return ret; 1380 1381 ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); 1382 if (ret) 1383 return ret; 1384 1385 if (comp_pkt.comp_pkt.completion_status != 0 || 1386 comp_pkt.bytes_returned == 0) { 1387 dev_err(&hbus->hdev->device, 1388 "Read Config Block failed: 0x%x, bytes_returned=%d\n", 1389 comp_pkt.comp_pkt.completion_status, 1390 comp_pkt.bytes_returned); 1391 return -EIO; 1392 } 1393 1394 *bytes_returned = comp_pkt.bytes_returned; 1395 return 0; 1396 } 1397 1398 /** 1399 * hv_pci_write_config_compl() - Invoked when a response packet for a write 1400 * config block operation arrives. 1401 * @context: Identifies the write config operation 1402 * @resp: The response packet itself 1403 * @resp_packet_size: Size in bytes of the response packet 1404 */ 1405 static void hv_pci_write_config_compl(void *context, struct pci_response *resp, 1406 int resp_packet_size) 1407 { 1408 struct hv_pci_compl *comp_pkt = context; 1409 1410 comp_pkt->completion_status = resp->status; 1411 complete(&comp_pkt->host_event); 1412 } 1413 1414 /** 1415 * hv_write_config_block() - Sends a write config block request to the 1416 * back-end driver running in the Hyper-V parent partition. 1417 * @pdev: The PCI driver's representation for this device. 1418 * @buf: Buffer from which the config block will be copied. 1419 * @len: Size in bytes of buf. 1420 * @block_id: Identifies the config block which is being written. 
1421 * 1422 * Return: 0 on success, -errno on failure 1423 */ 1424 static int hv_write_config_block(struct pci_dev *pdev, void *buf, 1425 unsigned int len, unsigned int block_id) 1426 { 1427 struct hv_pcibus_device *hbus = 1428 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1429 sysdata); 1430 struct { 1431 struct pci_packet pkt; 1432 char buf[sizeof(struct pci_write_block)]; 1433 u32 reserved; 1434 } pkt; 1435 struct hv_pci_compl comp_pkt; 1436 struct pci_write_block *write_blk; 1437 u32 pkt_size; 1438 int ret; 1439 1440 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1441 return -EINVAL; 1442 1443 init_completion(&comp_pkt.host_event); 1444 1445 memset(&pkt, 0, sizeof(pkt)); 1446 pkt.pkt.completion_func = hv_pci_write_config_compl; 1447 pkt.pkt.compl_ctxt = &comp_pkt; 1448 write_blk = (struct pci_write_block *)&pkt.pkt.message; 1449 write_blk->message_type.type = PCI_WRITE_BLOCK; 1450 write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1451 write_blk->block_id = block_id; 1452 write_blk->byte_count = len; 1453 memcpy(write_blk->bytes, buf, len); 1454 pkt_size = offsetof(struct pci_write_block, bytes) + len; 1455 /* 1456 * This quirk is required on some hosts shipped around 2018, because 1457 * these hosts don't check the pkt_size correctly (new hosts have been 1458 * fixed since early 2019). The quirk is also safe on very old hosts 1459 * and new hosts, because, on them, what really matters is the length 1460 * specified in write_blk->byte_count. 1461 */ 1462 pkt_size += sizeof(pkt.reserved); 1463 1464 ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, 1465 (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, 1466 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1467 if (ret) 1468 return ret; 1469 1470 ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); 1471 if (ret) 1472 return ret; 1473 1474 if (comp_pkt.completion_status != 0) { 1475 dev_err(&hbus->hdev->device, 1476 "Write Config Block failed: 0x%x\n", 1477 comp_pkt.completion_status); 1478 return -EIO; 1479 } 1480 1481 return 0; 1482 } 1483 1484 /** 1485 * hv_register_block_invalidate() - Invoked when a config block invalidation 1486 * arrives from the back-end driver. 1487 * @pdev: The PCI driver's representation for this device. 1488 * @context: Identifies the device. 1489 * @block_invalidate: Identifies all of the blocks being invalidated. 
1490 * 1491 * Return: 0 on success, -errno on failure 1492 */ 1493 static int hv_register_block_invalidate(struct pci_dev *pdev, void *context, 1494 void (*block_invalidate)(void *context, 1495 u64 block_mask)) 1496 { 1497 struct hv_pcibus_device *hbus = 1498 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1499 sysdata); 1500 struct hv_pci_dev *hpdev; 1501 1502 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1503 if (!hpdev) 1504 return -ENODEV; 1505 1506 hpdev->block_invalidate = block_invalidate; 1507 hpdev->invalidate_context = context; 1508 1509 put_pcichild(hpdev); 1510 return 0; 1511 1512 } 1513 1514 /* Interrupt management hooks */ 1515 static void hv_int_desc_free(struct hv_pci_dev *hpdev, 1516 struct tran_int_desc *int_desc) 1517 { 1518 struct pci_delete_interrupt *int_pkt; 1519 struct { 1520 struct pci_packet pkt; 1521 u8 buffer[sizeof(struct pci_delete_interrupt)]; 1522 } ctxt; 1523 1524 memset(&ctxt, 0, sizeof(ctxt)); 1525 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 1526 int_pkt->message_type.type = 1527 PCI_DELETE_INTERRUPT_MESSAGE; 1528 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 1529 int_pkt->int_desc = *int_desc; 1530 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 1531 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0); 1532 kfree(int_desc); 1533 } 1534 1535 /** 1536 * hv_msi_free() - Free the MSI. 1537 * @domain: The interrupt domain pointer 1538 * @info: Extra MSI-related context 1539 * @irq: Identifies the IRQ. 1540 * 1541 * The Hyper-V parent partition and hypervisor are tracking the 1542 * messages that are in use, keeping the interrupt redirection 1543 * table up to date. This callback sends a message that frees 1544 * the IRT entry and related tracking nonsense. 
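 * The struct tran_int_desc that the host returned when the interrupt was composed is stashed in the IRQ's chip_data; hv_int_desc_free() above sends PCI_DELETE_INTERRUPT_MESSAGE for that descriptor and then frees it.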
1545 */ 1546 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 1547 unsigned int irq) 1548 { 1549 struct hv_pcibus_device *hbus; 1550 struct hv_pci_dev *hpdev; 1551 struct pci_dev *pdev; 1552 struct tran_int_desc *int_desc; 1553 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 1554 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 1555 1556 pdev = msi_desc_to_pci_dev(msi); 1557 hbus = info->data; 1558 int_desc = irq_data_get_irq_chip_data(irq_data); 1559 if (!int_desc) 1560 return; 1561 1562 irq_data->chip_data = NULL; 1563 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1564 if (!hpdev) { 1565 kfree(int_desc); 1566 return; 1567 } 1568 1569 hv_int_desc_free(hpdev, int_desc); 1570 put_pcichild(hpdev); 1571 } 1572 1573 static void hv_irq_mask(struct irq_data *data) 1574 { 1575 pci_msi_mask_irq(data); 1576 if (data->parent_data->chip->irq_mask) 1577 irq_chip_mask_parent(data); 1578 } 1579 1580 static void hv_irq_unmask(struct irq_data *data) 1581 { 1582 hv_arch_irq_unmask(data); 1583 1584 if (data->parent_data->chip->irq_unmask) 1585 irq_chip_unmask_parent(data); 1586 pci_msi_unmask_irq(data); 1587 } 1588 1589 struct compose_comp_ctxt { 1590 struct hv_pci_compl comp_pkt; 1591 struct tran_int_desc int_desc; 1592 }; 1593 1594 static void hv_pci_compose_compl(void *context, struct pci_response *resp, 1595 int resp_packet_size) 1596 { 1597 struct compose_comp_ctxt *comp_pkt = context; 1598 struct pci_create_int_response *int_resp = 1599 (struct pci_create_int_response *)resp; 1600 1601 comp_pkt->comp_pkt.completion_status = resp->status; 1602 comp_pkt->int_desc = int_resp->int_desc; 1603 complete(&comp_pkt->comp_pkt.host_event); 1604 } 1605 1606 static u32 hv_compose_msi_req_v1( 1607 struct pci_create_interrupt *int_pkt, struct cpumask *affinity, 1608 u32 slot, u8 vector) 1609 { 1610 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; 1611 int_pkt->wslot.slot = slot; 1612 int_pkt->int_desc.vector = vector; 1613 int_pkt->int_desc.vector_count = 1; 1614 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1615 1616 /* 1617 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in 1618 * hv_irq_unmask(). 1619 */ 1620 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL; 1621 1622 return sizeof(*int_pkt); 1623 } 1624 1625 /* 1626 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten 1627 * by subsequent retarget in hv_irq_unmask(). 
1628 */ 1629 static int hv_compose_msi_req_get_cpu(struct cpumask *affinity) 1630 { 1631 return cpumask_first_and(affinity, cpu_online_mask); 1632 } 1633 1634 static u32 hv_compose_msi_req_v2( 1635 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, 1636 u32 slot, u8 vector) 1637 { 1638 int cpu; 1639 1640 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; 1641 int_pkt->wslot.slot = slot; 1642 int_pkt->int_desc.vector = vector; 1643 int_pkt->int_desc.vector_count = 1; 1644 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1645 cpu = hv_compose_msi_req_get_cpu(affinity); 1646 int_pkt->int_desc.processor_array[0] = 1647 hv_cpu_number_to_vp_number(cpu); 1648 int_pkt->int_desc.processor_count = 1; 1649 1650 return sizeof(*int_pkt); 1651 } 1652 1653 static u32 hv_compose_msi_req_v3( 1654 struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity, 1655 u32 slot, u32 vector) 1656 { 1657 int cpu; 1658 1659 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3; 1660 int_pkt->wslot.slot = slot; 1661 int_pkt->int_desc.vector = vector; 1662 int_pkt->int_desc.reserved = 0; 1663 int_pkt->int_desc.vector_count = 1; 1664 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1665 cpu = hv_compose_msi_req_get_cpu(affinity); 1666 int_pkt->int_desc.processor_array[0] = 1667 hv_cpu_number_to_vp_number(cpu); 1668 int_pkt->int_desc.processor_count = 1; 1669 1670 return sizeof(*int_pkt); 1671 } 1672 1673 /** 1674 * hv_compose_msi_msg() - Supplies a valid MSI address/data 1675 * @data: Everything about this MSI 1676 * @msg: Buffer that is filled in by this function 1677 * 1678 * This function unpacks the IRQ looking for target CPU set, IDT 1679 * vector and mode and sends a message to the parent partition 1680 * asking for a mapping for that tuple in this partition. The 1681 * response supplies a data value and address to which that data 1682 * should be written to trigger that interrupt. 1683 */ 1684 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 1685 { 1686 struct hv_pcibus_device *hbus; 1687 struct vmbus_channel *channel; 1688 struct hv_pci_dev *hpdev; 1689 struct pci_bus *pbus; 1690 struct pci_dev *pdev; 1691 struct cpumask *dest; 1692 struct compose_comp_ctxt comp; 1693 struct tran_int_desc *int_desc; 1694 struct { 1695 struct pci_packet pci_pkt; 1696 union { 1697 struct pci_create_interrupt v1; 1698 struct pci_create_interrupt2 v2; 1699 struct pci_create_interrupt3 v3; 1700 } int_pkts; 1701 } __packed ctxt; 1702 1703 u32 size; 1704 int ret; 1705 1706 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); 1707 dest = irq_data_get_effective_affinity_mask(data); 1708 pbus = pdev->bus; 1709 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1710 channel = hbus->hdev->channel; 1711 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1712 if (!hpdev) 1713 goto return_null_message; 1714 1715 /* Free any previous message that might have already been composed. 
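 * This can happen because hv_compose_msi_msg() may run more than once for the same interrupt, e.g. when the message is recomposed after the IRQ's affinity changes.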
*/ 1716 if (data->chip_data) { 1717 int_desc = data->chip_data; 1718 data->chip_data = NULL; 1719 hv_int_desc_free(hpdev, int_desc); 1720 } 1721 1722 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); 1723 if (!int_desc) 1724 goto drop_reference; 1725 1726 memset(&ctxt, 0, sizeof(ctxt)); 1727 init_completion(&comp.comp_pkt.host_event); 1728 ctxt.pci_pkt.completion_func = hv_pci_compose_compl; 1729 ctxt.pci_pkt.compl_ctxt = &comp; 1730 1731 switch (hbus->protocol_version) { 1732 case PCI_PROTOCOL_VERSION_1_1: 1733 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, 1734 dest, 1735 hpdev->desc.win_slot.slot, 1736 hv_msi_get_int_vector(data)); 1737 break; 1738 1739 case PCI_PROTOCOL_VERSION_1_2: 1740 case PCI_PROTOCOL_VERSION_1_3: 1741 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, 1742 dest, 1743 hpdev->desc.win_slot.slot, 1744 hv_msi_get_int_vector(data)); 1745 break; 1746 1747 case PCI_PROTOCOL_VERSION_1_4: 1748 size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3, 1749 dest, 1750 hpdev->desc.win_slot.slot, 1751 hv_msi_get_int_vector(data)); 1752 break; 1753 1754 default: 1755 /* As we only negotiate protocol versions known to this driver, 1756 * this path should never be hit. However, this is not a hot 1757 * path so we print a message to aid future updates. 1758 */ 1759 dev_err(&hbus->hdev->device, 1760 "Unexpected vPCI protocol, update driver."); 1761 goto free_int_desc; 1762 } 1763 1764 ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts, 1765 size, (unsigned long)&ctxt.pci_pkt, 1766 VM_PKT_DATA_INBAND, 1767 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1768 if (ret) { 1769 dev_err(&hbus->hdev->device, 1770 "Sending request for interrupt failed: 0x%x", 1771 comp.comp_pkt.completion_status); 1772 goto free_int_desc; 1773 } 1774 1775 /* 1776 * Prevents hv_pci_onchannelcallback() from running concurrently 1777 * in the tasklet. 1778 */ 1779 tasklet_disable_in_atomic(&channel->callback_event); 1780 1781 /* 1782 * Since this function is called with IRQ locks held, can't 1783 * do normal wait for completion; instead poll. 1784 */ 1785 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { 1786 unsigned long flags; 1787 1788 /* 0xFFFF means an invalid PCI VENDOR ID. */ 1789 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { 1790 dev_err_once(&hbus->hdev->device, 1791 "the device has gone\n"); 1792 goto enable_tasklet; 1793 } 1794 1795 /* 1796 * Make sure that the ring buffer data structure doesn't get 1797 * freed while we dereference the ring buffer pointer. Test 1798 * for the channel's onchannel_callback being NULL within a 1799 * sched_lock critical section. See also the inline comments 1800 * in vmbus_reset_channel_cb(). 1801 */ 1802 spin_lock_irqsave(&channel->sched_lock, flags); 1803 if (unlikely(channel->onchannel_callback == NULL)) { 1804 spin_unlock_irqrestore(&channel->sched_lock, flags); 1805 goto enable_tasklet; 1806 } 1807 hv_pci_onchannelcallback(hbus); 1808 spin_unlock_irqrestore(&channel->sched_lock, flags); 1809 1810 if (hpdev->state == hv_pcichild_ejecting) { 1811 dev_err_once(&hbus->hdev->device, 1812 "the device is being ejected\n"); 1813 goto enable_tasklet; 1814 } 1815 1816 udelay(100); 1817 } 1818 1819 tasklet_enable(&channel->callback_event); 1820 1821 if (comp.comp_pkt.completion_status < 0) { 1822 dev_err(&hbus->hdev->device, 1823 "Request for interrupt failed: 0x%x", 1824 comp.comp_pkt.completion_status); 1825 goto free_int_desc; 1826 } 1827 1828 /* 1829 * Record the assignment so that this can be unwound later.
Using 1830 * irq_set_chip_data() here would be appropriate, but the lock it takes 1831 * is already held. 1832 */ 1833 *int_desc = comp.int_desc; 1834 data->chip_data = int_desc; 1835 1836 /* Pass up the result. */ 1837 msg->address_hi = comp.int_desc.address >> 32; 1838 msg->address_lo = comp.int_desc.address & 0xffffffff; 1839 msg->data = comp.int_desc.data; 1840 1841 put_pcichild(hpdev); 1842 return; 1843 1844 enable_tasklet: 1845 tasklet_enable(&channel->callback_event); 1846 free_int_desc: 1847 kfree(int_desc); 1848 drop_reference: 1849 put_pcichild(hpdev); 1850 return_null_message: 1851 msg->address_hi = 0; 1852 msg->address_lo = 0; 1853 msg->data = 0; 1854 } 1855 1856 /* HW Interrupt Chip Descriptor */ 1857 static struct irq_chip hv_msi_irq_chip = { 1858 .name = "Hyper-V PCIe MSI", 1859 .irq_compose_msi_msg = hv_compose_msi_msg, 1860 .irq_set_affinity = irq_chip_set_affinity_parent, 1861 #ifdef CONFIG_X86 1862 .irq_ack = irq_chip_ack_parent, 1863 #elif defined(CONFIG_ARM64) 1864 .irq_eoi = irq_chip_eoi_parent, 1865 #endif 1866 .irq_mask = hv_irq_mask, 1867 .irq_unmask = hv_irq_unmask, 1868 }; 1869 1870 static struct msi_domain_ops hv_msi_ops = { 1871 .msi_prepare = hv_msi_prepare, 1872 .msi_free = hv_msi_free, 1873 }; 1874 1875 /** 1876 * hv_pcie_init_irq_domain() - Initialize IRQ domain 1877 * @hbus: The root PCI bus 1878 * 1879 * This function creates an IRQ domain which will be used for 1880 * interrupts from devices that have been passed through. These 1881 * devices only support MSI and MSI-X, not line-based interrupts 1882 * or simulations of line-based interrupts through PCIe's 1883 * fabric-layer messages. Because interrupts are remapped, we 1884 * can support multi-message MSI here. 1885 * 1886 * Return: '0' on success and error value on failure 1887 */ 1888 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) 1889 { 1890 hbus->msi_info.chip = &hv_msi_irq_chip; 1891 hbus->msi_info.ops = &hv_msi_ops; 1892 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | 1893 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | 1894 MSI_FLAG_PCI_MSIX); 1895 hbus->msi_info.handler = FLOW_HANDLER; 1896 hbus->msi_info.handler_name = FLOW_NAME; 1897 hbus->msi_info.data = hbus; 1898 hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode, 1899 &hbus->msi_info, 1900 hv_pci_get_root_domain()); 1901 if (!hbus->irq_domain) { 1902 dev_err(&hbus->hdev->device, 1903 "Failed to build an MSI IRQ domain\n"); 1904 return -ENODEV; 1905 } 1906 1907 dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain); 1908 1909 return 0; 1910 } 1911 1912 /** 1913 * get_bar_size() - Get the address space consumed by a BAR 1914 * @bar_val: Value that a BAR returned after -1 was written 1915 * to it. 1916 * 1917 * This function returns the size of the BAR, rounded up to 1 1918 * page. It has to be rounded up because the hypervisor's page 1919 * table entry that maps the BAR into the VM can't specify an 1920 * offset within a page. The invariant is that the hypervisor 1921 * must place any BARs of smaller than page length at the 1922 * beginning of a page. 1923 * 1924 * Return: Size in bytes of the consumed MMIO space. 
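* Worked example, assuming a 4KB PAGE_SIZE: for a bar_val whose address bits read back as 0xFFFFFFFFFFFFF000, 1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK) = 0x1000, so the BAR consumes one 4KB page.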
1925 */ 1926 static u64 get_bar_size(u64 bar_val) 1927 { 1928 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)), 1929 PAGE_SIZE); 1930 } 1931 1932 /** 1933 * survey_child_resources() - Total all MMIO requirements 1934 * @hbus: Root PCI bus, as understood by this driver 1935 */ 1936 static void survey_child_resources(struct hv_pcibus_device *hbus) 1937 { 1938 struct hv_pci_dev *hpdev; 1939 resource_size_t bar_size = 0; 1940 unsigned long flags; 1941 struct completion *event; 1942 u64 bar_val; 1943 int i; 1944 1945 /* If nobody is waiting on the answer, don't compute it. */ 1946 event = xchg(&hbus->survey_event, NULL); 1947 if (!event) 1948 return; 1949 1950 /* If the answer has already been computed, go with it. */ 1951 if (hbus->low_mmio_space || hbus->high_mmio_space) { 1952 complete(event); 1953 return; 1954 } 1955 1956 spin_lock_irqsave(&hbus->device_list_lock, flags); 1957 1958 /* 1959 * Due to an interesting quirk of the PCI spec, all memory regions 1960 * for a child device are a power of 2 in size and aligned in memory, 1961 * so it's sufficient to just add them up without tracking alignment. 1962 */ 1963 list_for_each_entry(hpdev, &hbus->children, list_entry) { 1964 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 1965 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) 1966 dev_err(&hbus->hdev->device, 1967 "There's an I/O BAR in this list!\n"); 1968 1969 if (hpdev->probed_bar[i] != 0) { 1970 /* 1971 * A probed BAR has all the upper bits set that 1972 * can be changed. 1973 */ 1974 1975 bar_val = hpdev->probed_bar[i]; 1976 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1977 bar_val |= 1978 ((u64)hpdev->probed_bar[++i] << 32); 1979 else 1980 bar_val |= 0xffffffff00000000ULL; 1981 1982 bar_size = get_bar_size(bar_val); 1983 1984 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 1985 hbus->high_mmio_space += bar_size; 1986 else 1987 hbus->low_mmio_space += bar_size; 1988 } 1989 } 1990 } 1991 1992 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 1993 complete(event); 1994 } 1995 1996 /** 1997 * prepopulate_bars() - Fill in BARs with defaults 1998 * @hbus: Root PCI bus, as understood by this driver 1999 * 2000 * The core PCI driver code seems much, much happier if the BARs 2001 * for a device have values upon first scan. So fill them in. 2002 * The algorithm below works down from large sizes to small, 2003 * attempting to pack the assignments optimally. The assumption, 2004 * enforced in other parts of the code, is that the beginning of 2005 * the memory-mapped I/O space will be aligned on the largest 2006 * BAR size. 2007 */ 2008 static void prepopulate_bars(struct hv_pcibus_device *hbus) 2009 { 2010 resource_size_t high_size = 0; 2011 resource_size_t low_size = 0; 2012 resource_size_t high_base = 0; 2013 resource_size_t low_base = 0; 2014 resource_size_t bar_size; 2015 struct hv_pci_dev *hpdev; 2016 unsigned long flags; 2017 u64 bar_val; 2018 u32 command; 2019 bool high; 2020 int i; 2021 2022 if (hbus->low_mmio_space) { 2023 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2024 low_base = hbus->low_mmio_res->start; 2025 } 2026 2027 if (hbus->high_mmio_space) { 2028 high_size = 1ULL << 2029 (63 - __builtin_clzll(hbus->high_mmio_space)); 2030 high_base = hbus->high_mmio_res->start; 2031 } 2032 2033 spin_lock_irqsave(&hbus->device_list_lock, flags); 2034 2035 /* 2036 * Clear the memory enable bit, in case it's already set. 
This occurs 2037 * in the suspend path of hibernation, where the device is suspended, 2038 * resumed and suspended again: see hibernation_snapshot() and 2039 * hibernation_platform_enter(). 2040 * 2041 * If the memory enable bit is already set, Hyper-V silently ignores 2042 * the below BAR updates, and the related PCI device driver can not 2043 * work, because reading from the device register(s) always returns 2044 * 0xFFFFFFFF (PCI_ERROR_RESPONSE). 2045 */ 2046 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2047 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); 2048 command &= ~PCI_COMMAND_MEMORY; 2049 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); 2050 } 2051 2052 /* Pick addresses for the BARs. */ 2053 do { 2054 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2055 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2056 bar_val = hpdev->probed_bar[i]; 2057 if (bar_val == 0) 2058 continue; 2059 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64; 2060 if (high) { 2061 bar_val |= 2062 ((u64)hpdev->probed_bar[i + 1] 2063 << 32); 2064 } else { 2065 bar_val |= 0xffffffffULL << 32; 2066 } 2067 bar_size = get_bar_size(bar_val); 2068 if (high) { 2069 if (high_size != bar_size) { 2070 i++; 2071 continue; 2072 } 2073 _hv_pcifront_write_config(hpdev, 2074 PCI_BASE_ADDRESS_0 + (4 * i), 2075 4, 2076 (u32)(high_base & 0xffffff00)); 2077 i++; 2078 _hv_pcifront_write_config(hpdev, 2079 PCI_BASE_ADDRESS_0 + (4 * i), 2080 4, (u32)(high_base >> 32)); 2081 high_base += bar_size; 2082 } else { 2083 if (low_size != bar_size) 2084 continue; 2085 _hv_pcifront_write_config(hpdev, 2086 PCI_BASE_ADDRESS_0 + (4 * i), 2087 4, 2088 (u32)(low_base & 0xffffff00)); 2089 low_base += bar_size; 2090 } 2091 } 2092 if (high_size <= 1 && low_size <= 1) { 2093 /* Set the memory enable bit. */ 2094 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, 2095 &command); 2096 command |= PCI_COMMAND_MEMORY; 2097 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, 2098 command); 2099 break; 2100 } 2101 } 2102 2103 high_size >>= 1; 2104 low_size >>= 1; 2105 } while (high_size || low_size); 2106 2107 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2108 } 2109 2110 /* 2111 * Assign entries in sysfs pci slot directory. 2112 * 2113 * Note that this function does not need to lock the children list 2114 * because it is called from pci_devices_present_work which 2115 * is serialized with hv_eject_device_work because they are on the 2116 * same ordered workqueue. Therefore hbus->children list will not change 2117 * even when pci_create_slot sleeps. 2118 */ 2119 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) 2120 { 2121 struct hv_pci_dev *hpdev; 2122 char name[SLOT_NAME_SIZE]; 2123 int slot_nr; 2124 2125 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2126 if (hpdev->pci_slot) 2127 continue; 2128 2129 slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot)); 2130 snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); 2131 hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr, 2132 name, NULL); 2133 if (IS_ERR(hpdev->pci_slot)) { 2134 pr_warn("pci_create slot %s failed\n", name); 2135 hpdev->pci_slot = NULL; 2136 } 2137 } 2138 } 2139 2140 /* 2141 * Remove entries in sysfs pci slot directory. 
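* These are the slots created by hv_pci_assign_slots(); each child's pci_slot pointer is cleared so that a later eject or removal path does not try to destroy the same slot twice.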
2142 */ 2143 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) 2144 { 2145 struct hv_pci_dev *hpdev; 2146 2147 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2148 if (!hpdev->pci_slot) 2149 continue; 2150 pci_destroy_slot(hpdev->pci_slot); 2151 hpdev->pci_slot = NULL; 2152 } 2153 } 2154 2155 /* 2156 * Set NUMA node for the devices on the bus 2157 */ 2158 static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) 2159 { 2160 struct pci_dev *dev; 2161 struct pci_bus *bus = hbus->bridge->bus; 2162 struct hv_pci_dev *hv_dev; 2163 2164 list_for_each_entry(dev, &bus->devices, bus_list) { 2165 hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn)); 2166 if (!hv_dev) 2167 continue; 2168 2169 if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && 2170 hv_dev->desc.virtual_numa_node < num_possible_nodes()) 2171 /* 2172 * The kernel may boot with some NUMA nodes offline 2173 * (e.g. in a KDUMP kernel) or with NUMA disabled via 2174 * "numa=off". In those cases, adjust the host provided 2175 * NUMA node to a valid NUMA node used by the kernel. 2176 */ 2177 set_dev_node(&dev->dev, 2178 numa_map_to_online_node( 2179 hv_dev->desc.virtual_numa_node)); 2180 2181 put_pcichild(hv_dev); 2182 } 2183 } 2184 2185 /** 2186 * create_root_hv_pci_bus() - Expose a new root PCI bus 2187 * @hbus: Root PCI bus, as understood by this driver 2188 * 2189 * Return: 0 on success, -errno on failure 2190 */ 2191 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) 2192 { 2193 int error; 2194 struct pci_host_bridge *bridge = hbus->bridge; 2195 2196 bridge->dev.parent = &hbus->hdev->device; 2197 bridge->sysdata = &hbus->sysdata; 2198 bridge->ops = &hv_pcifront_ops; 2199 2200 error = pci_scan_root_bus_bridge(bridge); 2201 if (error) 2202 return error; 2203 2204 pci_lock_rescan_remove(); 2205 hv_pci_assign_numa_node(hbus); 2206 pci_bus_assign_resources(bridge->bus); 2207 hv_pci_assign_slots(hbus); 2208 pci_bus_add_devices(bridge->bus); 2209 pci_unlock_rescan_remove(); 2210 hbus->state = hv_pcibus_installed; 2211 return 0; 2212 } 2213 2214 struct q_res_req_compl { 2215 struct completion host_event; 2216 struct hv_pci_dev *hpdev; 2217 }; 2218 2219 /** 2220 * q_resource_requirements() - Query Resource Requirements 2221 * @context: The completion context. 2222 * @resp: The response that came from the host. 2223 * @resp_packet_size: The size in bytes of resp. 2224 * 2225 * This function is invoked on completion of a Query Resource 2226 * Requirements packet. 2227 */ 2228 static void q_resource_requirements(void *context, struct pci_response *resp, 2229 int resp_packet_size) 2230 { 2231 struct q_res_req_compl *completion = context; 2232 struct pci_q_res_req_response *q_res_req = 2233 (struct pci_q_res_req_response *)resp; 2234 int i; 2235 2236 if (resp->status < 0) { 2237 dev_err(&completion->hpdev->hbus->hdev->device, 2238 "query resource requirements failed: %x\n", 2239 resp->status); 2240 } else { 2241 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2242 completion->hpdev->probed_bar[i] = 2243 q_res_req->probed_bar[i]; 2244 } 2245 } 2246 2247 complete(&completion->host_event); 2248 } 2249 2250 /** 2251 * new_pcichild_device() - Create a new child device 2252 * @hbus: The internal struct tracking this root PCI bus. 2253 * @desc: The information supplied so far from the host 2254 * about the device. 2255 * 2256 * This function creates the tracking structure for a new child 2257 * device and kicks off the process of figuring out what it is. 
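* It sends a PCI_QUERY_RESOURCE_REQUIREMENTS message for the new slot and waits for the host's reply; the q_resource_requirements() completion records the probed BAR values in the new tracking structure.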
2258 * 2259 * Return: Pointer to the new tracking struct 2260 */ 2261 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, 2262 struct hv_pcidev_description *desc) 2263 { 2264 struct hv_pci_dev *hpdev; 2265 struct pci_child_message *res_req; 2266 struct q_res_req_compl comp_pkt; 2267 struct { 2268 struct pci_packet init_packet; 2269 u8 buffer[sizeof(struct pci_child_message)]; 2270 } pkt; 2271 unsigned long flags; 2272 int ret; 2273 2274 hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL); 2275 if (!hpdev) 2276 return NULL; 2277 2278 hpdev->hbus = hbus; 2279 2280 memset(&pkt, 0, sizeof(pkt)); 2281 init_completion(&comp_pkt.host_event); 2282 comp_pkt.hpdev = hpdev; 2283 pkt.init_packet.compl_ctxt = &comp_pkt; 2284 pkt.init_packet.completion_func = q_resource_requirements; 2285 res_req = (struct pci_child_message *)&pkt.init_packet.message; 2286 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; 2287 res_req->wslot.slot = desc->win_slot.slot; 2288 2289 ret = vmbus_sendpacket(hbus->hdev->channel, res_req, 2290 sizeof(struct pci_child_message), 2291 (unsigned long)&pkt.init_packet, 2292 VM_PKT_DATA_INBAND, 2293 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2294 if (ret) 2295 goto error; 2296 2297 if (wait_for_response(hbus->hdev, &comp_pkt.host_event)) 2298 goto error; 2299 2300 hpdev->desc = *desc; 2301 refcount_set(&hpdev->refs, 1); 2302 get_pcichild(hpdev); 2303 spin_lock_irqsave(&hbus->device_list_lock, flags); 2304 2305 list_add_tail(&hpdev->list_entry, &hbus->children); 2306 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2307 return hpdev; 2308 2309 error: 2310 kfree(hpdev); 2311 return NULL; 2312 } 2313 2314 /** 2315 * get_pcichild_wslot() - Find device from slot 2316 * @hbus: Root PCI bus, as understood by this driver 2317 * @wslot: Location on the bus 2318 * 2319 * This function looks up a PCI device and returns the internal 2320 * representation of it. It acquires a reference on it, so that 2321 * the device won't be deleted while somebody is using it. The 2322 * caller is responsible for calling put_pcichild() to release 2323 * this reference. 2324 * 2325 * Return: Internal representation of a PCI device 2326 */ 2327 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 2328 u32 wslot) 2329 { 2330 unsigned long flags; 2331 struct hv_pci_dev *iter, *hpdev = NULL; 2332 2333 spin_lock_irqsave(&hbus->device_list_lock, flags); 2334 list_for_each_entry(iter, &hbus->children, list_entry) { 2335 if (iter->desc.win_slot.slot == wslot) { 2336 hpdev = iter; 2337 get_pcichild(hpdev); 2338 break; 2339 } 2340 } 2341 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2342 2343 return hpdev; 2344 } 2345 2346 /** 2347 * pci_devices_present_work() - Handle new list of child devices 2348 * @work: Work struct embedded in struct hv_dr_work 2349 * 2350 * "Bus Relations" is the Windows term for "children of this 2351 * bus." The terminology is preserved here for people trying to 2352 * debug the interaction between Hyper-V and Linux. This 2353 * function is called when the parent partition reports a list 2354 * of functions that should be observed under this PCI Express 2355 * port (bus). 2356 * 2357 * This function updates the list, and must tolerate being 2358 * called multiple times with the same information. The typical 2359 * number of child devices is one, with very atypical cases 2360 * involving three or four, so the algorithms used here can be 2361 * simple and inefficient. 
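* The update is done mark-and-sweep style: every existing child is first marked as missing, children named in the new list are unmarked (or created if they are new), and anything still marked afterwards is torn down.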
2362 * 2363 * It must also treat the omission of a previously observed device as 2364 * notification that the device no longer exists. 2365 * 2366 * Note that this function is serialized with hv_eject_device_work(), 2367 * because both are pushed to the ordered workqueue hbus->wq. 2368 */ 2369 static void pci_devices_present_work(struct work_struct *work) 2370 { 2371 u32 child_no; 2372 bool found; 2373 struct hv_pcidev_description *new_desc; 2374 struct hv_pci_dev *hpdev; 2375 struct hv_pcibus_device *hbus; 2376 struct list_head removed; 2377 struct hv_dr_work *dr_wrk; 2378 struct hv_dr_state *dr = NULL; 2379 unsigned long flags; 2380 2381 dr_wrk = container_of(work, struct hv_dr_work, wrk); 2382 hbus = dr_wrk->bus; 2383 kfree(dr_wrk); 2384 2385 INIT_LIST_HEAD(&removed); 2386 2387 /* Pull this off the queue and process it if it was the last one. */ 2388 spin_lock_irqsave(&hbus->device_list_lock, flags); 2389 while (!list_empty(&hbus->dr_list)) { 2390 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state, 2391 list_entry); 2392 list_del(&dr->list_entry); 2393 2394 /* Throw this away if the list still has stuff in it. */ 2395 if (!list_empty(&hbus->dr_list)) { 2396 kfree(dr); 2397 continue; 2398 } 2399 } 2400 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2401 2402 if (!dr) 2403 return; 2404 2405 /* First, mark all existing children as reported missing. */ 2406 spin_lock_irqsave(&hbus->device_list_lock, flags); 2407 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2408 hpdev->reported_missing = true; 2409 } 2410 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2411 2412 /* Next, add back any reported devices. */ 2413 for (child_no = 0; child_no < dr->device_count; child_no++) { 2414 found = false; 2415 new_desc = &dr->func[child_no]; 2416 2417 spin_lock_irqsave(&hbus->device_list_lock, flags); 2418 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2419 if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) && 2420 (hpdev->desc.v_id == new_desc->v_id) && 2421 (hpdev->desc.d_id == new_desc->d_id) && 2422 (hpdev->desc.ser == new_desc->ser)) { 2423 hpdev->reported_missing = false; 2424 found = true; 2425 } 2426 } 2427 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2428 2429 if (!found) { 2430 hpdev = new_pcichild_device(hbus, new_desc); 2431 if (!hpdev) 2432 dev_err(&hbus->hdev->device, 2433 "couldn't record a child device.\n"); 2434 } 2435 } 2436 2437 /* Move missing children to a list on the stack. */ 2438 spin_lock_irqsave(&hbus->device_list_lock, flags); 2439 do { 2440 found = false; 2441 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2442 if (hpdev->reported_missing) { 2443 found = true; 2444 put_pcichild(hpdev); 2445 list_move_tail(&hpdev->list_entry, &removed); 2446 break; 2447 } 2448 } 2449 } while (found); 2450 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2451 2452 /* Delete everything that should no longer exist. */ 2453 while (!list_empty(&removed)) { 2454 hpdev = list_first_entry(&removed, struct hv_pci_dev, 2455 list_entry); 2456 list_del(&hpdev->list_entry); 2457 2458 if (hpdev->pci_slot) 2459 pci_destroy_slot(hpdev->pci_slot); 2460 2461 put_pcichild(hpdev); 2462 } 2463 2464 switch (hbus->state) { 2465 case hv_pcibus_installed: 2466 /* 2467 * Tell the core to rescan bus 2468 * because there may have been changes. 
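* The rescan picks up any children created by new_pcichild_device() above; NUMA node and slot assignment is then redone so the newly added functions are covered.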
2469 */ 2470 pci_lock_rescan_remove(); 2471 pci_scan_child_bus(hbus->bridge->bus); 2472 hv_pci_assign_numa_node(hbus); 2473 hv_pci_assign_slots(hbus); 2474 pci_unlock_rescan_remove(); 2475 break; 2476 2477 case hv_pcibus_init: 2478 case hv_pcibus_probed: 2479 survey_child_resources(hbus); 2480 break; 2481 2482 default: 2483 break; 2484 } 2485 2486 kfree(dr); 2487 } 2488 2489 /** 2490 * hv_pci_start_relations_work() - Queue work to start device discovery 2491 * @hbus: Root PCI bus, as understood by this driver 2492 * @dr: The list of children returned from host 2493 * 2494 * Return: 0 on success, -errno on failure 2495 */ 2496 static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus, 2497 struct hv_dr_state *dr) 2498 { 2499 struct hv_dr_work *dr_wrk; 2500 unsigned long flags; 2501 bool pending_dr; 2502 2503 if (hbus->state == hv_pcibus_removing) { 2504 dev_info(&hbus->hdev->device, 2505 "PCI VMBus BUS_RELATIONS: ignored\n"); 2506 return -ENOENT; 2507 } 2508 2509 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); 2510 if (!dr_wrk) 2511 return -ENOMEM; 2512 2513 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); 2514 dr_wrk->bus = hbus; 2515 2516 spin_lock_irqsave(&hbus->device_list_lock, flags); 2517 /* 2518 * If pending_dr is true, we have already queued a work, 2519 * which will see the new dr. Otherwise, we need to 2520 * queue a new work. 2521 */ 2522 pending_dr = !list_empty(&hbus->dr_list); 2523 list_add_tail(&dr->list_entry, &hbus->dr_list); 2524 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2525 2526 if (pending_dr) 2527 kfree(dr_wrk); 2528 else 2529 queue_work(hbus->wq, &dr_wrk->wrk); 2530 2531 return 0; 2532 } 2533 2534 /** 2535 * hv_pci_devices_present() - Handle list of new children 2536 * @hbus: Root PCI bus, as understood by this driver 2537 * @relations: Packet from host listing children 2538 * 2539 * Process a new list of devices on the bus. The list of devices is 2540 * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS, 2541 * whenever a new list of devices for this bus appears. 
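* The entries are copied out of the incoming packet into a freshly allocated struct hv_dr_state, since the packet buffer is only valid for the duration of the channel callback; the actual processing happens later in pci_devices_present_work().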
2542 */ 2543 static void hv_pci_devices_present(struct hv_pcibus_device *hbus, 2544 struct pci_bus_relations *relations) 2545 { 2546 struct hv_dr_state *dr; 2547 int i; 2548 2549 dr = kzalloc(struct_size(dr, func, relations->device_count), 2550 GFP_NOWAIT); 2551 if (!dr) 2552 return; 2553 2554 dr->device_count = relations->device_count; 2555 for (i = 0; i < dr->device_count; i++) { 2556 dr->func[i].v_id = relations->func[i].v_id; 2557 dr->func[i].d_id = relations->func[i].d_id; 2558 dr->func[i].rev = relations->func[i].rev; 2559 dr->func[i].prog_intf = relations->func[i].prog_intf; 2560 dr->func[i].subclass = relations->func[i].subclass; 2561 dr->func[i].base_class = relations->func[i].base_class; 2562 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2563 dr->func[i].win_slot = relations->func[i].win_slot; 2564 dr->func[i].ser = relations->func[i].ser; 2565 } 2566 2567 if (hv_pci_start_relations_work(hbus, dr)) 2568 kfree(dr); 2569 } 2570 2571 /** 2572 * hv_pci_devices_present2() - Handle list of new children 2573 * @hbus: Root PCI bus, as understood by this driver 2574 * @relations: Packet from host listing children 2575 * 2576 * This function is the v2 version of hv_pci_devices_present() 2577 */ 2578 static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, 2579 struct pci_bus_relations2 *relations) 2580 { 2581 struct hv_dr_state *dr; 2582 int i; 2583 2584 dr = kzalloc(struct_size(dr, func, relations->device_count), 2585 GFP_NOWAIT); 2586 if (!dr) 2587 return; 2588 2589 dr->device_count = relations->device_count; 2590 for (i = 0; i < dr->device_count; i++) { 2591 dr->func[i].v_id = relations->func[i].v_id; 2592 dr->func[i].d_id = relations->func[i].d_id; 2593 dr->func[i].rev = relations->func[i].rev; 2594 dr->func[i].prog_intf = relations->func[i].prog_intf; 2595 dr->func[i].subclass = relations->func[i].subclass; 2596 dr->func[i].base_class = relations->func[i].base_class; 2597 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2598 dr->func[i].win_slot = relations->func[i].win_slot; 2599 dr->func[i].ser = relations->func[i].ser; 2600 dr->func[i].flags = relations->func[i].flags; 2601 dr->func[i].virtual_numa_node = 2602 relations->func[i].virtual_numa_node; 2603 } 2604 2605 if (hv_pci_start_relations_work(hbus, dr)) 2606 kfree(dr); 2607 } 2608 2609 /** 2610 * hv_eject_device_work() - Asynchronously handles ejection 2611 * @work: Work struct embedded in internal device struct 2612 * 2613 * This function handles ejecting a device. Windows will 2614 * attempt to gracefully eject a device, waiting 60 seconds to 2615 * hear back from the guest OS that this completed successfully. 2616 * If this timer expires, the device will be forcibly removed. 2617 */ 2618 static void hv_eject_device_work(struct work_struct *work) 2619 { 2620 struct pci_eject_response *ejct_pkt; 2621 struct hv_pcibus_device *hbus; 2622 struct hv_pci_dev *hpdev; 2623 struct pci_dev *pdev; 2624 unsigned long flags; 2625 int wslot; 2626 struct { 2627 struct pci_packet pkt; 2628 u8 buffer[sizeof(struct pci_eject_response)]; 2629 } ctxt; 2630 2631 hpdev = container_of(work, struct hv_pci_dev, wrk); 2632 hbus = hpdev->hbus; 2633 2634 WARN_ON(hpdev->state != hv_pcichild_ejecting); 2635 2636 /* 2637 * Ejection can come before or after the PCI bus has been set up, so 2638 * attempt to find it and tear down the bus state, if it exists. This 2639 * must be done without constructs like pci_domain_nr(hbus->bridge->bus) 2640 * because hbus->bridge->bus may not exist yet. 
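* The lookup below therefore goes through pci_get_domain_bus_and_slot() with the stored domain number, and the PCI-level teardown is simply skipped when no pci_dev is found.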
2641 */ 2642 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot); 2643 pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot); 2644 if (pdev) { 2645 pci_lock_rescan_remove(); 2646 pci_stop_and_remove_bus_device(pdev); 2647 pci_dev_put(pdev); 2648 pci_unlock_rescan_remove(); 2649 } 2650 2651 spin_lock_irqsave(&hbus->device_list_lock, flags); 2652 list_del(&hpdev->list_entry); 2653 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2654 2655 if (hpdev->pci_slot) 2656 pci_destroy_slot(hpdev->pci_slot); 2657 2658 memset(&ctxt, 0, sizeof(ctxt)); 2659 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; 2660 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; 2661 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; 2662 vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, 2663 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt, 2664 VM_PKT_DATA_INBAND, 0); 2665 2666 /* For the get_pcichild() in hv_pci_eject_device() */ 2667 put_pcichild(hpdev); 2668 /* For the two refs got in new_pcichild_device() */ 2669 put_pcichild(hpdev); 2670 put_pcichild(hpdev); 2671 /* hpdev has been freed. Do not use it any more. */ 2672 } 2673 2674 /** 2675 * hv_pci_eject_device() - Handles device ejection 2676 * @hpdev: Internal device tracking struct 2677 * 2678 * This function is invoked when an ejection packet arrives. It 2679 * just schedules work so that we don't re-enter the packet 2680 * delivery code handling the ejection. 2681 */ 2682 static void hv_pci_eject_device(struct hv_pci_dev *hpdev) 2683 { 2684 struct hv_pcibus_device *hbus = hpdev->hbus; 2685 struct hv_device *hdev = hbus->hdev; 2686 2687 if (hbus->state == hv_pcibus_removing) { 2688 dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); 2689 return; 2690 } 2691 2692 hpdev->state = hv_pcichild_ejecting; 2693 get_pcichild(hpdev); 2694 INIT_WORK(&hpdev->wrk, hv_eject_device_work); 2695 queue_work(hbus->wq, &hpdev->wrk); 2696 } 2697 2698 /** 2699 * hv_pci_onchannelcallback() - Handles incoming packets 2700 * @context: Internal bus tracking struct 2701 * 2702 * This function is invoked whenever the host sends a packet to 2703 * this channel (which is private to this root PCI bus). 2704 */ 2705 static void hv_pci_onchannelcallback(void *context) 2706 { 2707 const int packet_size = 0x100; 2708 int ret; 2709 struct hv_pcibus_device *hbus = context; 2710 u32 bytes_recvd; 2711 u64 req_id; 2712 struct vmpacket_descriptor *desc; 2713 unsigned char *buffer; 2714 int bufferlen = packet_size; 2715 struct pci_packet *comp_packet; 2716 struct pci_response *response; 2717 struct pci_incoming_message *new_message; 2718 struct pci_bus_relations *bus_rel; 2719 struct pci_bus_relations2 *bus_rel2; 2720 struct pci_dev_inval_block *inval; 2721 struct pci_dev_incoming *dev_message; 2722 struct hv_pci_dev *hpdev; 2723 2724 buffer = kmalloc(bufferlen, GFP_ATOMIC); 2725 if (!buffer) 2726 return; 2727 2728 while (1) { 2729 ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer, 2730 bufferlen, &bytes_recvd, &req_id); 2731 2732 if (ret == -ENOBUFS) { 2733 kfree(buffer); 2734 /* Handle large packet */ 2735 bufferlen = bytes_recvd; 2736 buffer = kmalloc(bytes_recvd, GFP_ATOMIC); 2737 if (!buffer) 2738 return; 2739 continue; 2740 } 2741 2742 /* Zero length indicates there are no more packets. */ 2743 if (ret || !bytes_recvd) 2744 break; 2745 2746 /* 2747 * All incoming packets must be at least as large as a 2748 * response. 
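* Anything shorter is skipped, which keeps the casts to struct pci_response and friends below within the received data.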
2749 */ 2750 if (bytes_recvd <= sizeof(struct pci_response)) 2751 continue; 2752 desc = (struct vmpacket_descriptor *)buffer; 2753 2754 switch (desc->type) { 2755 case VM_PKT_COMP: 2756 2757 /* 2758 * The host is trusted, and thus it's safe to interpret 2759 * this transaction ID as a pointer. 2760 */ 2761 comp_packet = (struct pci_packet *)req_id; 2762 response = (struct pci_response *)buffer; 2763 comp_packet->completion_func(comp_packet->compl_ctxt, 2764 response, 2765 bytes_recvd); 2766 break; 2767 2768 case VM_PKT_DATA_INBAND: 2769 2770 new_message = (struct pci_incoming_message *)buffer; 2771 switch (new_message->message_type.type) { 2772 case PCI_BUS_RELATIONS: 2773 2774 bus_rel = (struct pci_bus_relations *)buffer; 2775 if (bytes_recvd < 2776 struct_size(bus_rel, func, 2777 bus_rel->device_count)) { 2778 dev_err(&hbus->hdev->device, 2779 "bus relations too small\n"); 2780 break; 2781 } 2782 2783 hv_pci_devices_present(hbus, bus_rel); 2784 break; 2785 2786 case PCI_BUS_RELATIONS2: 2787 2788 bus_rel2 = (struct pci_bus_relations2 *)buffer; 2789 if (bytes_recvd < 2790 struct_size(bus_rel2, func, 2791 bus_rel2->device_count)) { 2792 dev_err(&hbus->hdev->device, 2793 "bus relations v2 too small\n"); 2794 break; 2795 } 2796 2797 hv_pci_devices_present2(hbus, bus_rel2); 2798 break; 2799 2800 case PCI_EJECT: 2801 2802 dev_message = (struct pci_dev_incoming *)buffer; 2803 hpdev = get_pcichild_wslot(hbus, 2804 dev_message->wslot.slot); 2805 if (hpdev) { 2806 hv_pci_eject_device(hpdev); 2807 put_pcichild(hpdev); 2808 } 2809 break; 2810 2811 case PCI_INVALIDATE_BLOCK: 2812 2813 inval = (struct pci_dev_inval_block *)buffer; 2814 hpdev = get_pcichild_wslot(hbus, 2815 inval->wslot.slot); 2816 if (hpdev) { 2817 if (hpdev->block_invalidate) { 2818 hpdev->block_invalidate( 2819 hpdev->invalidate_context, 2820 inval->block_mask); 2821 } 2822 put_pcichild(hpdev); 2823 } 2824 break; 2825 2826 default: 2827 dev_warn(&hbus->hdev->device, 2828 "Unimplemented protocol message %x\n", 2829 new_message->message_type.type); 2830 break; 2831 } 2832 break; 2833 2834 default: 2835 dev_err(&hbus->hdev->device, 2836 "unhandled packet type %d, tid %llx len %d\n", 2837 desc->type, req_id, bytes_recvd); 2838 break; 2839 } 2840 } 2841 2842 kfree(buffer); 2843 } 2844 2845 /** 2846 * hv_pci_protocol_negotiation() - Set up protocol 2847 * @hdev: VMBus's tracking struct for this root PCI bus. 2848 * @version: Array of supported channel protocol versions in 2849 * the order of probing - highest go first. 2850 * @num_version: Number of elements in the version array. 2851 * 2852 * This driver is intended to support running on Windows 10 2853 * (server) and later versions. It will not run on earlier 2854 * versions, as they assume that many of the operations which 2855 * Linux needs accomplished with a spinlock held were done via 2856 * asynchronous messaging via VMBus. Windows 10 increases the 2857 * surface area of PCI emulation so that these actions can take 2858 * place by suspending a virtual processor for their duration. 2859 * 2860 * This function negotiates the channel protocol version, 2861 * failing if the host doesn't support the necessary protocol 2862 * level. 
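* Each candidate version is proposed with a PCI_QUERY_PROTOCOL_VERSION request; a completion status of STATUS_REVISION_MISMATCH means the host rejected that version and the next (lower) entry in the array is tried.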
2863 */ 2864 static int hv_pci_protocol_negotiation(struct hv_device *hdev, 2865 enum pci_protocol_version_t version[], 2866 int num_version) 2867 { 2868 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 2869 struct pci_version_request *version_req; 2870 struct hv_pci_compl comp_pkt; 2871 struct pci_packet *pkt; 2872 int ret; 2873 int i; 2874 2875 /* 2876 * Initiate the handshake with the host and negotiate 2877 * a version that the host can support. We start with the 2878 * highest version number and go down if the host cannot 2879 * support it. 2880 */ 2881 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL); 2882 if (!pkt) 2883 return -ENOMEM; 2884 2885 init_completion(&comp_pkt.host_event); 2886 pkt->completion_func = hv_pci_generic_compl; 2887 pkt->compl_ctxt = &comp_pkt; 2888 version_req = (struct pci_version_request *)&pkt->message; 2889 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; 2890 2891 for (i = 0; i < num_version; i++) { 2892 version_req->protocol_version = version[i]; 2893 ret = vmbus_sendpacket(hdev->channel, version_req, 2894 sizeof(struct pci_version_request), 2895 (unsigned long)pkt, VM_PKT_DATA_INBAND, 2896 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2897 if (!ret) 2898 ret = wait_for_response(hdev, &comp_pkt.host_event); 2899 2900 if (ret) { 2901 dev_err(&hdev->device, 2902 "PCI Pass-through VSP failed to request version: %d", 2903 ret); 2904 goto exit; 2905 } 2906 2907 if (comp_pkt.completion_status >= 0) { 2908 hbus->protocol_version = version[i]; 2909 dev_info(&hdev->device, 2910 "PCI VMBus probing: Using version %#x\n", 2911 hbus->protocol_version); 2912 goto exit; 2913 } 2914 2915 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) { 2916 dev_err(&hdev->device, 2917 "PCI Pass-through VSP failed version request: %#x", 2918 comp_pkt.completion_status); 2919 ret = -EPROTO; 2920 goto exit; 2921 } 2922 2923 reinit_completion(&comp_pkt.host_event); 2924 } 2925 2926 dev_err(&hdev->device, 2927 "PCI pass-through VSP failed to find supported version"); 2928 ret = -EPROTO; 2929 2930 exit: 2931 kfree(pkt); 2932 return ret; 2933 } 2934 2935 /** 2936 * hv_pci_free_bridge_windows() - Release memory regions for the 2937 * bus 2938 * @hbus: Root PCI bus, as understood by this driver 2939 */ 2940 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) 2941 { 2942 /* 2943 * Set the resources back to the way they looked when they 2944 * were allocated by setting IORESOURCE_BUSY again. 2945 */ 2946 2947 if (hbus->low_mmio_space && hbus->low_mmio_res) { 2948 hbus->low_mmio_res->flags |= IORESOURCE_BUSY; 2949 vmbus_free_mmio(hbus->low_mmio_res->start, 2950 resource_size(hbus->low_mmio_res)); 2951 } 2952 2953 if (hbus->high_mmio_space && hbus->high_mmio_res) { 2954 hbus->high_mmio_res->flags |= IORESOURCE_BUSY; 2955 vmbus_free_mmio(hbus->high_mmio_res->start, 2956 resource_size(hbus->high_mmio_res)); 2957 } 2958 } 2959 2960 /** 2961 * hv_pci_allocate_bridge_windows() - Allocate memory regions 2962 * for the bus 2963 * @hbus: Root PCI bus, as understood by this driver 2964 * 2965 * This function calls vmbus_allocate_mmio(), which is itself a 2966 * bit of a compromise. Ideally, we might change the pnp layer 2967 * in the kernel such that it comprehends either PCI devices 2968 * which are "grandchildren of ACPI," with some intermediate bus 2969 * node (in this case, VMBus) or change it such that it 2970 * understands VMBus. The pnp layer, however, has been declared 2971 * deprecated, and not subject to change. 
2972 * 2973 * The workaround, implemented here, is to ask VMBus to allocate 2974 * MMIO space for this bus. VMBus itself knows which ranges are 2975 * appropriate by looking at its own ACPI objects. Then, after 2976 * these ranges are claimed, they're modified to look like they 2977 * would have looked if the ACPI and pnp code had allocated 2978 * bridge windows. These descriptors have to exist in this form 2979 * in order to satisfy the code which will get invoked when the 2980 * endpoint PCI function driver calls request_mem_region() or 2981 * request_mem_region_exclusive(). 2982 * 2983 * Return: 0 on success, -errno on failure 2984 */ 2985 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) 2986 { 2987 resource_size_t align; 2988 int ret; 2989 2990 if (hbus->low_mmio_space) { 2991 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2992 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0, 2993 (u64)(u32)0xffffffff, 2994 hbus->low_mmio_space, 2995 align, false); 2996 if (ret) { 2997 dev_err(&hbus->hdev->device, 2998 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n", 2999 hbus->low_mmio_space); 3000 return ret; 3001 } 3002 3003 /* Modify this resource to become a bridge window. */ 3004 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW; 3005 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY; 3006 pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res); 3007 } 3008 3009 if (hbus->high_mmio_space) { 3010 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space)); 3011 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev, 3012 0x100000000, -1, 3013 hbus->high_mmio_space, align, 3014 false); 3015 if (ret) { 3016 dev_err(&hbus->hdev->device, 3017 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n", 3018 hbus->high_mmio_space); 3019 goto release_low_mmio; 3020 } 3021 3022 /* Modify this resource to become a bridge window. */ 3023 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW; 3024 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY; 3025 pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res); 3026 } 3027 3028 return 0; 3029 3030 release_low_mmio: 3031 if (hbus->low_mmio_res) { 3032 vmbus_free_mmio(hbus->low_mmio_res->start, 3033 resource_size(hbus->low_mmio_res)); 3034 } 3035 3036 return ret; 3037 } 3038 3039 /** 3040 * hv_allocate_config_window() - Find MMIO space for PCI Config 3041 * @hbus: Root PCI bus, as understood by this driver 3042 * 3043 * This function claims memory-mapped I/O space for accessing 3044 * configuration space for the functions on this bus. 3045 * 3046 * Return: 0 on success, -errno on failure 3047 */ 3048 static int hv_allocate_config_window(struct hv_pcibus_device *hbus) 3049 { 3050 int ret; 3051 3052 /* 3053 * Set up a region of MMIO space to use for accessing configuration 3054 * space. 3055 */ 3056 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1, 3057 PCI_CONFIG_MMIO_LENGTH, 0x1000, false); 3058 if (ret) 3059 return ret; 3060 3061 /* 3062 * vmbus_allocate_mmio() gets used for allocating both device endpoint 3063 * resource claims (those which cannot be overlapped) and the ranges 3064 * which are valid for the children of this bus, which are intended 3065 * to be overlapped by those children. Set the flag on this claim 3066 * meaning that this region can't be overlapped. 
3067 */ 3068 3069 hbus->mem_config->flags |= IORESOURCE_BUSY; 3070 3071 return 0; 3072 } 3073 3074 static void hv_free_config_window(struct hv_pcibus_device *hbus) 3075 { 3076 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); 3077 } 3078 3079 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); 3080 3081 /** 3082 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state 3083 * @hdev: VMBus's tracking struct for this root PCI bus 3084 * 3085 * Return: 0 on success, -errno on failure 3086 */ 3087 static int hv_pci_enter_d0(struct hv_device *hdev) 3088 { 3089 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3090 struct pci_bus_d0_entry *d0_entry; 3091 struct hv_pci_compl comp_pkt; 3092 struct pci_packet *pkt; 3093 int ret; 3094 3095 /* 3096 * Tell the host that the bus is ready to use, and moved into the 3097 * powered-on state. This includes telling the host which region 3098 * of memory-mapped I/O space has been chosen for configuration space 3099 * access. 3100 */ 3101 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL); 3102 if (!pkt) 3103 return -ENOMEM; 3104 3105 init_completion(&comp_pkt.host_event); 3106 pkt->completion_func = hv_pci_generic_compl; 3107 pkt->compl_ctxt = &comp_pkt; 3108 d0_entry = (struct pci_bus_d0_entry *)&pkt->message; 3109 d0_entry->message_type.type = PCI_BUS_D0ENTRY; 3110 d0_entry->mmio_base = hbus->mem_config->start; 3111 3112 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry), 3113 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3114 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3115 if (!ret) 3116 ret = wait_for_response(hdev, &comp_pkt.host_event); 3117 3118 if (ret) 3119 goto exit; 3120 3121 if (comp_pkt.completion_status < 0) { 3122 dev_err(&hdev->device, 3123 "PCI Pass-through VSP failed D0 Entry with status %x\n", 3124 comp_pkt.completion_status); 3125 ret = -EPROTO; 3126 goto exit; 3127 } 3128 3129 ret = 0; 3130 3131 exit: 3132 kfree(pkt); 3133 return ret; 3134 } 3135 3136 /** 3137 * hv_pci_query_relations() - Ask host to send list of child 3138 * devices 3139 * @hdev: VMBus's tracking struct for this root PCI bus 3140 * 3141 * Return: 0 on success, -errno on failure 3142 */ 3143 static int hv_pci_query_relations(struct hv_device *hdev) 3144 { 3145 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3146 struct pci_message message; 3147 struct completion comp; 3148 int ret; 3149 3150 /* Ask the host to send along the list of child devices */ 3151 init_completion(&comp); 3152 if (cmpxchg(&hbus->survey_event, NULL, &comp)) 3153 return -ENOTEMPTY; 3154 3155 memset(&message, 0, sizeof(message)); 3156 message.type = PCI_QUERY_BUS_RELATIONS; 3157 3158 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message), 3159 0, VM_PKT_DATA_INBAND, 0); 3160 if (!ret) 3161 ret = wait_for_response(hdev, &comp); 3162 3163 return ret; 3164 } 3165 3166 /** 3167 * hv_send_resources_allocated() - Report local resource choices 3168 * @hdev: VMBus's tracking struct for this root PCI bus 3169 * 3170 * The host OS is expecting to be sent a request as a message 3171 * which contains all the resources that the device will use. 3172 * The response contains those same resources, "translated" 3173 * which is to say, the values which should be used by the 3174 * hardware, when it delivers an interrupt. (MMIO resources are 3175 * used in local terms.) This is nice for Windows, and lines up 3176 * with the FDO/PDO split, which doesn't exist in Linux. 
Linux 3177 * is deeply expecting to scan an emulated PCI configuration 3178 * space. So this message is sent here only to drive the state 3179 * machine on the host forward. 3180 * 3181 * Return: 0 on success, -errno on failure 3182 */ 3183 static int hv_send_resources_allocated(struct hv_device *hdev) 3184 { 3185 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3186 struct pci_resources_assigned *res_assigned; 3187 struct pci_resources_assigned2 *res_assigned2; 3188 struct hv_pci_compl comp_pkt; 3189 struct hv_pci_dev *hpdev; 3190 struct pci_packet *pkt; 3191 size_t size_res; 3192 int wslot; 3193 int ret; 3194 3195 size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) 3196 ? sizeof(*res_assigned) : sizeof(*res_assigned2); 3197 3198 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); 3199 if (!pkt) 3200 return -ENOMEM; 3201 3202 ret = 0; 3203 3204 for (wslot = 0; wslot < 256; wslot++) { 3205 hpdev = get_pcichild_wslot(hbus, wslot); 3206 if (!hpdev) 3207 continue; 3208 3209 memset(pkt, 0, sizeof(*pkt) + size_res); 3210 init_completion(&comp_pkt.host_event); 3211 pkt->completion_func = hv_pci_generic_compl; 3212 pkt->compl_ctxt = &comp_pkt; 3213 3214 if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { 3215 res_assigned = 3216 (struct pci_resources_assigned *)&pkt->message; 3217 res_assigned->message_type.type = 3218 PCI_RESOURCES_ASSIGNED; 3219 res_assigned->wslot.slot = hpdev->desc.win_slot.slot; 3220 } else { 3221 res_assigned2 = 3222 (struct pci_resources_assigned2 *)&pkt->message; 3223 res_assigned2->message_type.type = 3224 PCI_RESOURCES_ASSIGNED2; 3225 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot; 3226 } 3227 put_pcichild(hpdev); 3228 3229 ret = vmbus_sendpacket(hdev->channel, &pkt->message, 3230 size_res, (unsigned long)pkt, 3231 VM_PKT_DATA_INBAND, 3232 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3233 if (!ret) 3234 ret = wait_for_response(hdev, &comp_pkt.host_event); 3235 if (ret) 3236 break; 3237 3238 if (comp_pkt.completion_status < 0) { 3239 ret = -EPROTO; 3240 dev_err(&hdev->device, 3241 "resource allocated returned 0x%x", 3242 comp_pkt.completion_status); 3243 break; 3244 } 3245 3246 hbus->wslot_res_allocated = wslot; 3247 } 3248 3249 kfree(pkt); 3250 return ret; 3251 } 3252 3253 /** 3254 * hv_send_resources_released() - Report local resources 3255 * released 3256 * @hdev: VMBus's tracking struct for this root PCI bus 3257 * 3258 * Return: 0 on success, -errno on failure 3259 */ 3260 static int hv_send_resources_released(struct hv_device *hdev) 3261 { 3262 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3263 struct pci_child_message pkt; 3264 struct hv_pci_dev *hpdev; 3265 int wslot; 3266 int ret; 3267 3268 for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { 3269 hpdev = get_pcichild_wslot(hbus, wslot); 3270 if (!hpdev) 3271 continue; 3272 3273 memset(&pkt, 0, sizeof(pkt)); 3274 pkt.message_type.type = PCI_RESOURCES_RELEASED; 3275 pkt.wslot.slot = hpdev->desc.win_slot.slot; 3276 3277 put_pcichild(hpdev); 3278 3279 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0, 3280 VM_PKT_DATA_INBAND, 0); 3281 if (ret) 3282 return ret; 3283 3284 hbus->wslot_res_allocated = wslot - 1; 3285 } 3286 3287 hbus->wslot_res_allocated = -1; 3288 3289 return 0; 3290 } 3291 3292 #define HVPCI_DOM_MAP_SIZE (64 * 1024) 3293 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); 3294 3295 /* 3296 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 3297 * as invalid for passthrough PCI devices of this driver. 
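* The hvpci_dom_map bitmap above tracks which domain numbers are in use; bit 0 is set at module init so this invalid value can never be handed out by hv_get_dom_num().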
3298 */ 3299 #define HVPCI_DOM_INVALID 0 3300 3301 /** 3302 * hv_get_dom_num() - Get a valid PCI domain number 3303 * Check if the PCI domain number is in use, and return another number if 3304 * it is in use. 3305 * 3306 * @dom: Requested domain number 3307 * 3308 * return: domain number on success, HVPCI_DOM_INVALID on failure 3309 */ 3310 static u16 hv_get_dom_num(u16 dom) 3311 { 3312 unsigned int i; 3313 3314 if (test_and_set_bit(dom, hvpci_dom_map) == 0) 3315 return dom; 3316 3317 for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { 3318 if (test_and_set_bit(i, hvpci_dom_map) == 0) 3319 return i; 3320 } 3321 3322 return HVPCI_DOM_INVALID; 3323 } 3324 3325 /** 3326 * hv_put_dom_num() - Mark the PCI domain number as free 3327 * @dom: Domain number to be freed 3328 */ 3329 static void hv_put_dom_num(u16 dom) 3330 { 3331 clear_bit(dom, hvpci_dom_map); 3332 } 3333 3334 /** 3335 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus 3336 * @hdev: VMBus's tracking struct for this root PCI bus 3337 * @dev_id: Identifies the device itself 3338 * 3339 * Return: 0 on success, -errno on failure 3340 */ 3341 static int hv_pci_probe(struct hv_device *hdev, 3342 const struct hv_vmbus_device_id *dev_id) 3343 { 3344 struct pci_host_bridge *bridge; 3345 struct hv_pcibus_device *hbus; 3346 u16 dom_req, dom; 3347 char *name; 3348 bool enter_d0_retry = true; 3349 int ret; 3350 3351 /* 3352 * hv_pcibus_device contains the hypercall arguments for retargeting in 3353 * hv_irq_unmask(). Those must not cross a page boundary. 3354 */ 3355 BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE); 3356 3357 bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); 3358 if (!bridge) 3359 return -ENOMEM; 3360 3361 /* 3362 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural 3363 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate 3364 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and 3365 * alignment of hbus is important because hbus's field 3366 * retarget_msi_interrupt_params must not cross a 4KB page boundary. 3367 * 3368 * Here we prefer kzalloc to get_zeroed_page(), because a buffer 3369 * allocated by the latter is not tracked and scanned by kmemleak, and 3370 * hence kmemleak reports the pointer contained in the hbus buffer 3371 * (i.e. the hpdev struct, which is created in new_pcichild_device() and 3372 * is tracked by hbus->children) as memory leak (false positive). 3373 * 3374 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be 3375 * used to allocate the hbus buffer and we can avoid the kmemleak false 3376 * positive by using kmemleak_alloc() and kmemleak_free() to ask 3377 * kmemleak to track and scan the hbus buffer. 3378 */ 3379 hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 3380 if (!hbus) 3381 return -ENOMEM; 3382 3383 hbus->bridge = bridge; 3384 hbus->state = hv_pcibus_init; 3385 hbus->wslot_res_allocated = -1; 3386 3387 /* 3388 * The PCI bus "domain" is what is called "segment" in ACPI and other 3389 * specs. Pull it from the instance ID, to get something usually 3390 * unique. In rare cases of collision, we will find out another number 3391 * not in use. 3392 * 3393 * Note that, since this code only runs in a Hyper-V VM, Hyper-V 3394 * together with this guest driver can guarantee that (1) The only 3395 * domain used by Gen1 VMs for something that looks like a physical 3396 * PCI bus (which is actually emulated by the hypervisor) is domain 0. 
3397 * (2) There will be no overlap between domains (after fixing possible 3398 * collisions) in the same VM. 3399 */ 3400 dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; 3401 dom = hv_get_dom_num(dom_req); 3402 3403 if (dom == HVPCI_DOM_INVALID) { 3404 dev_err(&hdev->device, 3405 "Unable to use dom# 0x%x or other numbers", dom_req); 3406 ret = -EINVAL; 3407 goto free_bus; 3408 } 3409 3410 if (dom != dom_req) 3411 dev_info(&hdev->device, 3412 "PCI dom# 0x%x has collision, using 0x%x", 3413 dom_req, dom); 3414 3415 hbus->bridge->domain_nr = dom; 3416 #ifdef CONFIG_X86 3417 hbus->sysdata.domain = dom; 3418 #endif 3419 3420 hbus->hdev = hdev; 3421 INIT_LIST_HEAD(&hbus->children); 3422 INIT_LIST_HEAD(&hbus->dr_list); 3423 spin_lock_init(&hbus->config_lock); 3424 spin_lock_init(&hbus->device_list_lock); 3425 spin_lock_init(&hbus->retarget_msi_interrupt_lock); 3426 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0, 3427 hbus->bridge->domain_nr); 3428 if (!hbus->wq) { 3429 ret = -ENOMEM; 3430 goto free_dom; 3431 } 3432 3433 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 3434 hv_pci_onchannelcallback, hbus); 3435 if (ret) 3436 goto destroy_wq; 3437 3438 hv_set_drvdata(hdev, hbus); 3439 3440 ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, 3441 ARRAY_SIZE(pci_protocol_versions)); 3442 if (ret) 3443 goto close; 3444 3445 ret = hv_allocate_config_window(hbus); 3446 if (ret) 3447 goto close; 3448 3449 hbus->cfg_addr = ioremap(hbus->mem_config->start, 3450 PCI_CONFIG_MMIO_LENGTH); 3451 if (!hbus->cfg_addr) { 3452 dev_err(&hdev->device, 3453 "Unable to map a virtual address for config space\n"); 3454 ret = -ENOMEM; 3455 goto free_config; 3456 } 3457 3458 name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); 3459 if (!name) { 3460 ret = -ENOMEM; 3461 goto unmap; 3462 } 3463 3464 hbus->fwnode = irq_domain_alloc_named_fwnode(name); 3465 kfree(name); 3466 if (!hbus->fwnode) { 3467 ret = -ENOMEM; 3468 goto unmap; 3469 } 3470 3471 ret = hv_pcie_init_irq_domain(hbus); 3472 if (ret) 3473 goto free_fwnode; 3474 3475 retry: 3476 ret = hv_pci_query_relations(hdev); 3477 if (ret) 3478 goto free_irq_domain; 3479 3480 ret = hv_pci_enter_d0(hdev); 3481 /* 3482 * In certain case (Kdump) the pci device of interest was 3483 * not cleanly shut down and resource is still held on host 3484 * side, the host could return invalid device status. 3485 * We need to explicitly request host to release the resource 3486 * and try to enter D0 again. 3487 * Since the hv_pci_bus_exit() call releases structures 3488 * of all its child devices, we need to start the retry from 3489 * hv_pci_query_relations() call, requesting host to send 3490 * the synchronous child device relations message before this 3491 * information is needed in hv_send_resources_allocated() 3492 * call later. 3493 */ 3494 if (ret == -EPROTO && enter_d0_retry) { 3495 enter_d0_retry = false; 3496 3497 dev_err(&hdev->device, "Retrying D0 Entry\n"); 3498 3499 /* 3500 * Hv_pci_bus_exit() calls hv_send_resources_released() 3501 * to free up resources of its child devices. 3502 * In the kdump kernel we need to set the 3503 * wslot_res_allocated to 255 so it scans all child 3504 * devices to release resources allocated in the 3505 * normal kernel before panic happened. 
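* 255 is the highest possible wslot, so hv_send_resources_released() will walk down through every slot number.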
3506 */ 3507 hbus->wslot_res_allocated = 255; 3508 ret = hv_pci_bus_exit(hdev, true); 3509 3510 if (ret == 0) 3511 goto retry; 3512 3513 dev_err(&hdev->device, 3514 "Retrying D0 failed with ret %d\n", ret); 3515 } 3516 if (ret) 3517 goto free_irq_domain; 3518 3519 ret = hv_pci_allocate_bridge_windows(hbus); 3520 if (ret) 3521 goto exit_d0; 3522 3523 ret = hv_send_resources_allocated(hdev); 3524 if (ret) 3525 goto free_windows; 3526 3527 prepopulate_bars(hbus); 3528 3529 hbus->state = hv_pcibus_probed; 3530 3531 ret = create_root_hv_pci_bus(hbus); 3532 if (ret) 3533 goto free_windows; 3534 3535 return 0; 3536 3537 free_windows: 3538 hv_pci_free_bridge_windows(hbus); 3539 exit_d0: 3540 (void) hv_pci_bus_exit(hdev, true); 3541 free_irq_domain: 3542 irq_domain_remove(hbus->irq_domain); 3543 free_fwnode: 3544 irq_domain_free_fwnode(hbus->fwnode); 3545 unmap: 3546 iounmap(hbus->cfg_addr); 3547 free_config: 3548 hv_free_config_window(hbus); 3549 close: 3550 vmbus_close(hdev->channel); 3551 destroy_wq: 3552 destroy_workqueue(hbus->wq); 3553 free_dom: 3554 hv_put_dom_num(hbus->bridge->domain_nr); 3555 free_bus: 3556 kfree(hbus); 3557 return ret; 3558 } 3559 3560 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) 3561 { 3562 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3563 struct { 3564 struct pci_packet teardown_packet; 3565 u8 buffer[sizeof(struct pci_message)]; 3566 } pkt; 3567 struct hv_pci_compl comp_pkt; 3568 struct hv_pci_dev *hpdev, *tmp; 3569 unsigned long flags; 3570 int ret; 3571 3572 /* 3573 * After the host sends the RESCIND_CHANNEL message, it doesn't 3574 * access the per-channel ringbuffer any longer. 3575 */ 3576 if (hdev->channel->rescind) 3577 return 0; 3578 3579 if (!keep_devs) { 3580 struct list_head removed; 3581 3582 /* Move all present children to the list on stack */ 3583 INIT_LIST_HEAD(&removed); 3584 spin_lock_irqsave(&hbus->device_list_lock, flags); 3585 list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) 3586 list_move_tail(&hpdev->list_entry, &removed); 3587 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 3588 3589 /* Remove all children in the list */ 3590 list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { 3591 list_del(&hpdev->list_entry); 3592 if (hpdev->pci_slot) 3593 pci_destroy_slot(hpdev->pci_slot); 3594 /* For the two refs got in new_pcichild_device() */ 3595 put_pcichild(hpdev); 3596 put_pcichild(hpdev); 3597 } 3598 } 3599 3600 ret = hv_send_resources_released(hdev); 3601 if (ret) { 3602 dev_err(&hdev->device, 3603 "Couldn't send resources released packet(s)\n"); 3604 return ret; 3605 } 3606 3607 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); 3608 init_completion(&comp_pkt.host_event); 3609 pkt.teardown_packet.completion_func = hv_pci_generic_compl; 3610 pkt.teardown_packet.compl_ctxt = &comp_pkt; 3611 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; 3612 3613 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message, 3614 sizeof(struct pci_message), 3615 (unsigned long)&pkt.teardown_packet, 3616 VM_PKT_DATA_INBAND, 3617 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3618 if (ret) 3619 return ret; 3620 3621 if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) 3622 return -ETIMEDOUT; 3623 3624 return 0; 3625 } 3626 3627 /** 3628 * hv_pci_remove() - Remove routine for this VMBus channel 3629 * @hdev: VMBus's tracking struct for this root PCI bus 3630 * 3631 * Return: 0 on success, -errno on failure 3632 */ 3633 static int hv_pci_remove(struct hv_device *hdev) 
3634 { 3635 struct hv_pcibus_device *hbus; 3636 int ret; 3637 3638 hbus = hv_get_drvdata(hdev); 3639 if (hbus->state == hv_pcibus_installed) { 3640 tasklet_disable(&hdev->channel->callback_event); 3641 hbus->state = hv_pcibus_removing; 3642 tasklet_enable(&hdev->channel->callback_event); 3643 destroy_workqueue(hbus->wq); 3644 hbus->wq = NULL; 3645 /* 3646 * At this point, no work is running or can be scheduled 3647 * on hbus-wq. We can't race with hv_pci_devices_present() 3648 * or hv_pci_eject_device(), it's safe to proceed. 3649 */ 3650 3651 /* Remove the bus from PCI's point of view. */ 3652 pci_lock_rescan_remove(); 3653 pci_stop_root_bus(hbus->bridge->bus); 3654 hv_pci_remove_slots(hbus); 3655 pci_remove_root_bus(hbus->bridge->bus); 3656 pci_unlock_rescan_remove(); 3657 } 3658 3659 ret = hv_pci_bus_exit(hdev, false); 3660 3661 vmbus_close(hdev->channel); 3662 3663 iounmap(hbus->cfg_addr); 3664 hv_free_config_window(hbus); 3665 hv_pci_free_bridge_windows(hbus); 3666 irq_domain_remove(hbus->irq_domain); 3667 irq_domain_free_fwnode(hbus->fwnode); 3668 3669 hv_put_dom_num(hbus->bridge->domain_nr); 3670 3671 kfree(hbus); 3672 return ret; 3673 } 3674 3675 static int hv_pci_suspend(struct hv_device *hdev) 3676 { 3677 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3678 enum hv_pcibus_state old_state; 3679 int ret; 3680 3681 /* 3682 * hv_pci_suspend() must make sure there are no pending work items 3683 * before calling vmbus_close(), since it runs in a process context 3684 * as a callback in dpm_suspend(). When it starts to run, the channel 3685 * callback hv_pci_onchannelcallback(), which runs in a tasklet 3686 * context, can be still running concurrently and scheduling new work 3687 * items onto hbus->wq in hv_pci_devices_present() and 3688 * hv_pci_eject_device(), and the work item handlers can access the 3689 * vmbus channel, which can be being closed by hv_pci_suspend(), e.g. 3690 * the work item handler pci_devices_present_work() -> 3691 * new_pcichild_device() writes to the vmbus channel. 3692 * 3693 * To eliminate the race, hv_pci_suspend() disables the channel 3694 * callback tasklet, sets hbus->state to hv_pcibus_removing, and 3695 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds, 3696 * it knows that no new work item can be scheduled, and then it flushes 3697 * hbus->wq and safely closes the vmbus channel. 3698 */ 3699 tasklet_disable(&hdev->channel->callback_event); 3700 3701 /* Change the hbus state to prevent new work items. */ 3702 old_state = hbus->state; 3703 if (hbus->state == hv_pcibus_installed) 3704 hbus->state = hv_pcibus_removing; 3705 3706 tasklet_enable(&hdev->channel->callback_event); 3707 3708 if (old_state != hv_pcibus_installed) 3709 return -EINVAL; 3710 3711 flush_workqueue(hbus->wq); 3712 3713 ret = hv_pci_bus_exit(hdev, true); 3714 if (ret) 3715 return ret; 3716 3717 vmbus_close(hdev->channel); 3718 3719 return 0; 3720 } 3721 3722 static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) 3723 { 3724 struct irq_data *irq_data; 3725 struct msi_desc *entry; 3726 int ret = 0; 3727 3728 msi_lock_descs(&pdev->dev); 3729 msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) { 3730 irq_data = irq_get_irq_data(entry->irq); 3731 if (WARN_ON_ONCE(!irq_data)) { 3732 ret = -EINVAL; 3733 break; 3734 } 3735 3736 hv_compose_msi_msg(irq_data, &entry->msg); 3737 } 3738 msi_unlock_descs(&pdev->dev); 3739 3740 return ret; 3741 } 3742 3743 /* 3744 * Upon resume, pci_restore_msi_state() -> ... 
-> __pci_write_msi_msg() 3745 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V 3746 * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg() 3747 * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping 3748 * Table entries. 3749 */ 3750 static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus) 3751 { 3752 pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL); 3753 } 3754 3755 static int hv_pci_resume(struct hv_device *hdev) 3756 { 3757 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3758 enum pci_protocol_version_t version[1]; 3759 int ret; 3760 3761 hbus->state = hv_pcibus_init; 3762 3763 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 3764 hv_pci_onchannelcallback, hbus); 3765 if (ret) 3766 return ret; 3767 3768 /* Only use the version that was in use before hibernation. */ 3769 version[0] = hbus->protocol_version; 3770 ret = hv_pci_protocol_negotiation(hdev, version, 1); 3771 if (ret) 3772 goto out; 3773 3774 ret = hv_pci_query_relations(hdev); 3775 if (ret) 3776 goto out; 3777 3778 ret = hv_pci_enter_d0(hdev); 3779 if (ret) 3780 goto out; 3781 3782 ret = hv_send_resources_allocated(hdev); 3783 if (ret) 3784 goto out; 3785 3786 prepopulate_bars(hbus); 3787 3788 hv_pci_restore_msi_state(hbus); 3789 3790 hbus->state = hv_pcibus_installed; 3791 return 0; 3792 out: 3793 vmbus_close(hdev->channel); 3794 return ret; 3795 } 3796 3797 static const struct hv_vmbus_device_id hv_pci_id_table[] = { 3798 /* PCI Pass-through Class ID */ 3799 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */ 3800 { HV_PCIE_GUID, }, 3801 { }, 3802 }; 3803 3804 MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table); 3805 3806 static struct hv_driver hv_pci_drv = { 3807 .name = "hv_pci", 3808 .id_table = hv_pci_id_table, 3809 .probe = hv_pci_probe, 3810 .remove = hv_pci_remove, 3811 .suspend = hv_pci_suspend, 3812 .resume = hv_pci_resume, 3813 }; 3814 3815 static void __exit exit_hv_pci_drv(void) 3816 { 3817 vmbus_driver_unregister(&hv_pci_drv); 3818 3819 hvpci_block_ops.read_block = NULL; 3820 hvpci_block_ops.write_block = NULL; 3821 hvpci_block_ops.reg_blk_invalidate = NULL; 3822 } 3823 3824 static int __init init_hv_pci_drv(void) 3825 { 3826 int ret; 3827 3828 if (!hv_is_hyperv_initialized()) 3829 return -ENODEV; 3830 3831 ret = hv_pci_irqchip_init(); 3832 if (ret) 3833 return ret; 3834 3835 /* Set the invalid domain number's bit, so it will not be used */ 3836 set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); 3837 3838 /* Initialize PCI block r/w interface */ 3839 hvpci_block_ops.read_block = hv_read_config_block; 3840 hvpci_block_ops.write_block = hv_write_config_block; 3841 hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate; 3842 3843 return vmbus_driver_register(&hv_pci_drv); 3844 } 3845 3846 module_init(init_hv_pci_drv); 3847 module_exit(exit_hv_pci_drv); 3848 3849 MODULE_DESCRIPTION("Hyper-V PCI"); 3850 MODULE_LICENSE("GPL v2"); 3851