xref: /openbmc/linux/drivers/pci/controller/pci-hyperv.c (revision 22a41e9a5044bf3519f05b4a00e99af34bfeb40c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) Microsoft Corporation.
4  *
5  * Author:
6  *   Jake Oshins <jakeo@microsoft.com>
7  *
8  * This driver acts as a paravirtual front-end for PCI Express root buses.
9  * When a PCI Express function (either an entire device or an SR-IOV
10  * Virtual Function) is being passed through to the VM, this driver exposes
11  * a new bus to the guest VM.  This is modeled as a root PCI bus because
12  * no bridges are being exposed to the VM.  In fact, with a "Generation 2"
13  * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 14  * until a device has been exposed using this driver.
15  *
16  * Each root PCI bus has its own PCI domain, which is called "Segment" in
17  * the PCI Firmware Specifications.  Thus while each device passed through
18  * to the VM using this front-end will appear at "device 0", the domain will
19  * be unique.  Typically, each bus will have one PCI function on it, though
20  * this driver does support more than one.
21  *
22  * In order to map the interrupts from the device through to the guest VM,
23  * this driver also implements an IRQ Domain, which handles interrupts (either
24  * MSI or MSI-X) associated with the functions on the bus.  As interrupts are
25  * set up, torn down, or reaffined, this driver communicates with the
26  * underlying hypervisor to adjust the mappings in the I/O MMU so that each
27  * interrupt will be delivered to the correct virtual processor at the right
28  * vector.  This driver does not support level-triggered (line-based)
29  * interrupts, and will report that the Interrupt Line register in the
30  * function's configuration space is zero.
31  *
32  * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
33  * facilities.  For instance, the configuration space of a function exposed
34  * by Hyper-V is mapped into a single page of memory space, and the
35  * read and write handlers for config space must be aware of this mechanism.
36  * Similarly, device setup and teardown involves messages sent to and from
37  * the PCI back-end driver in Hyper-V.
38  */
39 
40 #include <linux/kernel.h>
41 #include <linux/module.h>
42 #include <linux/pci.h>
43 #include <linux/pci-ecam.h>
44 #include <linux/delay.h>
45 #include <linux/semaphore.h>
46 #include <linux/irq.h>
47 #include <linux/msi.h>
48 #include <linux/hyperv.h>
49 #include <linux/refcount.h>
50 #include <linux/irqdomain.h>
51 #include <linux/acpi.h>
52 #include <asm/mshyperv.h>
53 
54 /*
55  * Protocol versions. The low word is the minor version, the high word the
56  * major version.
57  */
58 
59 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
60 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
61 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
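
/*
 * For example, PCI_MAKE_VERSION(1, 4) evaluates to 0x00010004, from which
 * PCI_MAJOR_VERSION() recovers 1 and PCI_MINOR_VERSION() recovers 4.
 */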
62 
63 enum pci_protocol_version_t {
64 	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),	/* Win10 */
65 	PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2),	/* RS1 */
66 	PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3),	/* Vibranium */
67 	PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4),	/* WS2022 */
68 };
69 
70 #define CPU_AFFINITY_ALL	-1ULL
71 
72 /*
73  * Supported protocol versions in the order of probing - highest go
74  * first.
75  */
76 static enum pci_protocol_version_t pci_protocol_versions[] = {
77 	PCI_PROTOCOL_VERSION_1_4,
78 	PCI_PROTOCOL_VERSION_1_3,
79 	PCI_PROTOCOL_VERSION_1_2,
80 	PCI_PROTOCOL_VERSION_1_1,
81 };
82 
83 #define PCI_CONFIG_MMIO_LENGTH	0x2000
84 #define CFG_PAGE_OFFSET 0x1000
85 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
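
/*
 * With the values above, the config MMIO region for a bus is two pages
 * (see also the comment ahead of _hv_pcifront_read_config()):
 *
 *   0x0000: function-select page; writing a win_slot value here chooses
 *           which function the following page refers to
 *   0x1000: CFG_PAGE_SIZE bytes mapping the chosen function's standard
 *           configuration space
 */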
86 
87 #define MAX_SUPPORTED_MSI_MESSAGES 0x400
88 
89 #define STATUS_REVISION_MISMATCH 0xC0000059
90 
91 /* space for 32bit serial number as string */
92 #define SLOT_NAME_SIZE 11
93 
94 /*
95  * Message Types
96  */
97 
98 enum pci_message_type {
99 	/*
100 	 * Version 1.1
101 	 */
102 	PCI_MESSAGE_BASE                = 0x42490000,
103 	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
104 	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
105 	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
106 	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
107 	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
108 	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
109 	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
110 	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
111 	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
112 	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
113 	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
114 	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
115 	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
116 	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
117 	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
118 	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
119 	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
120 	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
121 	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
122 	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
123 	PCI_RESOURCES_ASSIGNED2		= PCI_MESSAGE_BASE + 0x16,
124 	PCI_CREATE_INTERRUPT_MESSAGE2	= PCI_MESSAGE_BASE + 0x17,
125 	PCI_DELETE_INTERRUPT_MESSAGE2	= PCI_MESSAGE_BASE + 0x18, /* unused */
126 	PCI_BUS_RELATIONS2		= PCI_MESSAGE_BASE + 0x19,
127 	PCI_RESOURCES_ASSIGNED3         = PCI_MESSAGE_BASE + 0x1A,
128 	PCI_CREATE_INTERRUPT_MESSAGE3   = PCI_MESSAGE_BASE + 0x1B,
129 	PCI_MESSAGE_MAXIMUM
130 };
131 
132 /*
133  * Structures defining the virtual PCI Express protocol.
134  */
135 
136 union pci_version {
137 	struct {
138 		u16 minor_version;
139 		u16 major_version;
140 	} parts;
141 	u32 version;
142 } __packed;
143 
144 /*
 145  * Function numbers are 8 bits wide on Express, as interpreted through ARI,
146  * which is all this driver does.  This representation is the one used in
147  * Windows, which is what is expected when sending this back and forth with
148  * the Hyper-V parent partition.
149  */
150 union win_slot_encoding {
151 	struct {
152 		u32	dev:5;
153 		u32	func:3;
154 		u32	reserved:24;
155 	} bits;
156 	u32 slot;
157 } __packed;
158 
159 /*
160  * Pretty much as defined in the PCI Specifications.
161  */
162 struct pci_function_description {
163 	u16	v_id;	/* vendor ID */
164 	u16	d_id;	/* device ID */
165 	u8	rev;
166 	u8	prog_intf;
167 	u8	subclass;
168 	u8	base_class;
169 	u32	subsystem_id;
170 	union win_slot_encoding win_slot;
171 	u32	ser;	/* serial number */
172 } __packed;
173 
174 enum pci_device_description_flags {
175 	HV_PCI_DEVICE_FLAG_NONE			= 0x0,
176 	HV_PCI_DEVICE_FLAG_NUMA_AFFINITY	= 0x1,
177 };
178 
179 struct pci_function_description2 {
180 	u16	v_id;	/* vendor ID */
181 	u16	d_id;	/* device ID */
182 	u8	rev;
183 	u8	prog_intf;
184 	u8	subclass;
185 	u8	base_class;
186 	u32	subsystem_id;
187 	union	win_slot_encoding win_slot;
188 	u32	ser;	/* serial number */
189 	u32	flags;
190 	u16	virtual_numa_node;
191 	u16	reserved;
192 } __packed;
193 
194 /**
195  * struct hv_msi_desc
196  * @vector:		IDT entry
197  * @delivery_mode:	As defined in Intel's Programmer's
198  *			Reference Manual, Volume 3, Chapter 8.
199  * @vector_count:	Number of contiguous entries in the
200  *			Interrupt Descriptor Table that are
201  *			occupied by this Message-Signaled
202  *			Interrupt. For "MSI", as first defined
203  *			in PCI 2.2, this can be between 1 and
204  *			32. For "MSI-X," as first defined in PCI
205  *			3.0, this must be 1, as each MSI-X table
206  *			entry would have its own descriptor.
207  * @reserved:		Empty space
208  * @cpu_mask:		All the target virtual processors.
209  */
210 struct hv_msi_desc {
211 	u8	vector;
212 	u8	delivery_mode;
213 	u16	vector_count;
214 	u32	reserved;
215 	u64	cpu_mask;
216 } __packed;
217 
218 /**
219  * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
220  * @vector:		IDT entry
221  * @delivery_mode:	As defined in Intel's Programmer's
222  *			Reference Manual, Volume 3, Chapter 8.
223  * @vector_count:	Number of contiguous entries in the
224  *			Interrupt Descriptor Table that are
225  *			occupied by this Message-Signaled
226  *			Interrupt. For "MSI", as first defined
227  *			in PCI 2.2, this can be between 1 and
228  *			32. For "MSI-X," as first defined in PCI
229  *			3.0, this must be 1, as each MSI-X table
230  *			entry would have its own descriptor.
231  * @processor_count:	number of bits enabled in array.
232  * @processor_array:	All the target virtual processors.
233  */
234 struct hv_msi_desc2 {
235 	u8	vector;
236 	u8	delivery_mode;
237 	u16	vector_count;
238 	u16	processor_count;
239 	u16	processor_array[32];
240 } __packed;
241 
242 /*
243  * struct hv_msi_desc3 - 1.3 version of hv_msi_desc
244  *	Everything is the same as in 'hv_msi_desc2' except that the size of the
 245  *	'vector' field is larger to support bigger vector values, e.g. LPI
 246  *	vectors on ARM.
247  */
248 struct hv_msi_desc3 {
249 	u32	vector;
250 	u8	delivery_mode;
251 	u8	reserved;
252 	u16	vector_count;
253 	u16	processor_count;
254 	u16	processor_array[32];
255 } __packed;
256 
257 /**
258  * struct tran_int_desc
259  * @reserved:		unused, padding
260  * @vector_count:	same as in hv_msi_desc
261  * @data:		This is the "data payload" value that is
262  *			written by the device when it generates
263  *			a message-signaled interrupt, either MSI
264  *			or MSI-X.
265  * @address:		This is the address to which the data
266  *			payload is written on interrupt
267  *			generation.
268  */
269 struct tran_int_desc {
270 	u16	reserved;
271 	u16	vector_count;
272 	u32	data;
273 	u64	address;
274 } __packed;
275 
276 /*
277  * A generic message format for virtual PCI.
278  * Specific message formats are defined later in the file.
279  */
280 
281 struct pci_message {
282 	u32 type;
283 } __packed;
284 
285 struct pci_child_message {
286 	struct pci_message message_type;
287 	union win_slot_encoding wslot;
288 } __packed;
289 
290 struct pci_incoming_message {
291 	struct vmpacket_descriptor hdr;
292 	struct pci_message message_type;
293 } __packed;
294 
295 struct pci_response {
296 	struct vmpacket_descriptor hdr;
297 	s32 status;			/* negative values are failures */
298 } __packed;
299 
300 struct pci_packet {
301 	void (*completion_func)(void *context, struct pci_response *resp,
302 				int resp_packet_size);
303 	void *compl_ctxt;
304 
305 	struct pci_message message[];
306 };
307 
308 /*
309  * Specific message types supporting the PCI protocol.
310  */
311 
312 /*
313  * Version negotiation message. Sent from the guest to the host.
314  * The guest is free to try different versions until the host
315  * accepts the version.
316  *
317  * pci_version: The protocol version requested.
318  * is_last_attempt: If TRUE, this is the last version guest will request.
319  * reservedz: Reserved field, set to zero.
320  */
321 
322 struct pci_version_request {
323 	struct pci_message message_type;
324 	u32 protocol_version;
325 } __packed;
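
/*
 * Negotiation sketch (the negotiation itself is implemented later in this
 * file): the guest walks pci_protocol_versions[] from newest to oldest,
 * sending a pci_version_request with message type PCI_QUERY_PROTOCOL_VERSION
 * for each entry, and settles on the first version that the host does not
 * reject with STATUS_REVISION_MISMATCH.
 */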
326 
327 /*
328  * Bus D0 Entry.  This is sent from the guest to the host when the virtual
329  * bus (PCI Express port) is ready for action.
330  */
331 
332 struct pci_bus_d0_entry {
333 	struct pci_message message_type;
334 	u32 reserved;
335 	u64 mmio_base;
336 } __packed;
337 
338 struct pci_bus_relations {
339 	struct pci_incoming_message incoming;
340 	u32 device_count;
341 	struct pci_function_description func[];
342 } __packed;
343 
344 struct pci_bus_relations2 {
345 	struct pci_incoming_message incoming;
346 	u32 device_count;
347 	struct pci_function_description2 func[];
348 } __packed;
349 
350 struct pci_q_res_req_response {
351 	struct vmpacket_descriptor hdr;
352 	s32 status;			/* negative values are failures */
353 	u32 probed_bar[PCI_STD_NUM_BARS];
354 } __packed;
355 
356 struct pci_set_power {
357 	struct pci_message message_type;
358 	union win_slot_encoding wslot;
359 	u32 power_state;		/* In Windows terms */
360 	u32 reserved;
361 } __packed;
362 
363 struct pci_set_power_response {
364 	struct vmpacket_descriptor hdr;
365 	s32 status;			/* negative values are failures */
366 	union win_slot_encoding wslot;
367 	u32 resultant_state;		/* In Windows terms */
368 	u32 reserved;
369 } __packed;
370 
371 struct pci_resources_assigned {
372 	struct pci_message message_type;
373 	union win_slot_encoding wslot;
374 	u8 memory_range[0x14][6];	/* not used here */
375 	u32 msi_descriptors;
376 	u32 reserved[4];
377 } __packed;
378 
379 struct pci_resources_assigned2 {
380 	struct pci_message message_type;
381 	union win_slot_encoding wslot;
382 	u8 memory_range[0x14][6];	/* not used here */
383 	u32 msi_descriptor_count;
384 	u8 reserved[70];
385 } __packed;
386 
387 struct pci_create_interrupt {
388 	struct pci_message message_type;
389 	union win_slot_encoding wslot;
390 	struct hv_msi_desc int_desc;
391 } __packed;
392 
393 struct pci_create_int_response {
394 	struct pci_response response;
395 	u32 reserved;
396 	struct tran_int_desc int_desc;
397 } __packed;
398 
399 struct pci_create_interrupt2 {
400 	struct pci_message message_type;
401 	union win_slot_encoding wslot;
402 	struct hv_msi_desc2 int_desc;
403 } __packed;
404 
405 struct pci_create_interrupt3 {
406 	struct pci_message message_type;
407 	union win_slot_encoding wslot;
408 	struct hv_msi_desc3 int_desc;
409 } __packed;
410 
411 struct pci_delete_interrupt {
412 	struct pci_message message_type;
413 	union win_slot_encoding wslot;
414 	struct tran_int_desc int_desc;
415 } __packed;
416 
417 /*
418  * Note: the VM must pass a valid block id, wslot and bytes_requested.
419  */
420 struct pci_read_block {
421 	struct pci_message message_type;
422 	u32 block_id;
423 	union win_slot_encoding wslot;
424 	u32 bytes_requested;
425 } __packed;
426 
427 struct pci_read_block_response {
428 	struct vmpacket_descriptor hdr;
429 	u32 status;
430 	u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
431 } __packed;
432 
433 /*
434  * Note: the VM must pass a valid block id, wslot and byte_count.
435  */
436 struct pci_write_block {
437 	struct pci_message message_type;
438 	u32 block_id;
439 	union win_slot_encoding wslot;
440 	u32 byte_count;
441 	u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
442 } __packed;
443 
444 struct pci_dev_inval_block {
445 	struct pci_incoming_message incoming;
446 	union win_slot_encoding wslot;
447 	u64 block_mask;
448 } __packed;
449 
450 struct pci_dev_incoming {
451 	struct pci_incoming_message incoming;
452 	union win_slot_encoding wslot;
453 } __packed;
454 
455 struct pci_eject_response {
456 	struct pci_message message_type;
457 	union win_slot_encoding wslot;
458 	u32 status;
459 } __packed;
460 
461 static int pci_ring_size = (4 * PAGE_SIZE);
462 
463 /*
464  * Driver specific state.
465  */
466 
467 enum hv_pcibus_state {
468 	hv_pcibus_init = 0,
469 	hv_pcibus_probed,
470 	hv_pcibus_installed,
471 	hv_pcibus_removing,
472 	hv_pcibus_maximum
473 };
474 
475 struct hv_pcibus_device {
476 #ifdef CONFIG_X86
477 	struct pci_sysdata sysdata;
478 #elif defined(CONFIG_ARM64)
479 	struct pci_config_window sysdata;
480 #endif
481 	struct pci_host_bridge *bridge;
482 	struct fwnode_handle *fwnode;
483 	/* Protocol version negotiated with the host */
484 	enum pci_protocol_version_t protocol_version;
485 	enum hv_pcibus_state state;
486 	struct hv_device *hdev;
487 	resource_size_t low_mmio_space;
488 	resource_size_t high_mmio_space;
489 	struct resource *mem_config;
490 	struct resource *low_mmio_res;
491 	struct resource *high_mmio_res;
492 	struct completion *survey_event;
493 	struct pci_bus *pci_bus;
494 	spinlock_t config_lock;	/* Avoid two threads writing index page */
495 	spinlock_t device_list_lock;	/* Protect lists below */
496 	void __iomem *cfg_addr;
497 
498 	struct list_head children;
499 	struct list_head dr_list;
500 
501 	struct msi_domain_info msi_info;
502 	struct irq_domain *irq_domain;
503 
504 	spinlock_t retarget_msi_interrupt_lock;
505 
506 	struct workqueue_struct *wq;
507 
508 	/* Highest slot of child device with resources allocated */
509 	int wslot_res_allocated;
510 
511 	/* hypercall arg, must not cross page boundary */
512 	struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
513 
514 	/*
515 	 * Don't put anything here: retarget_msi_interrupt_params must be last
516 	 */
517 };
518 
519 /*
520  * Tracks "Device Relations" messages from the host, which must be both
521  * processed in order and deferred so that they don't run in the context
522  * of the incoming packet callback.
523  */
524 struct hv_dr_work {
525 	struct work_struct wrk;
526 	struct hv_pcibus_device *bus;
527 };
528 
529 struct hv_pcidev_description {
530 	u16	v_id;	/* vendor ID */
531 	u16	d_id;	/* device ID */
532 	u8	rev;
533 	u8	prog_intf;
534 	u8	subclass;
535 	u8	base_class;
536 	u32	subsystem_id;
537 	union	win_slot_encoding win_slot;
538 	u32	ser;	/* serial number */
539 	u32	flags;
540 	u16	virtual_numa_node;
541 };
542 
543 struct hv_dr_state {
544 	struct list_head list_entry;
545 	u32 device_count;
546 	struct hv_pcidev_description func[];
547 };
548 
549 enum hv_pcichild_state {
550 	hv_pcichild_init = 0,
551 	hv_pcichild_requirements,
552 	hv_pcichild_resourced,
553 	hv_pcichild_ejecting,
554 	hv_pcichild_maximum
555 };
556 
557 struct hv_pci_dev {
558 	/* List protected by pci_rescan_remove_lock */
559 	struct list_head list_entry;
560 	refcount_t refs;
561 	enum hv_pcichild_state state;
562 	struct pci_slot *pci_slot;
563 	struct hv_pcidev_description desc;
564 	bool reported_missing;
565 	struct hv_pcibus_device *hbus;
566 	struct work_struct wrk;
567 
568 	void (*block_invalidate)(void *context, u64 block_mask);
569 	void *invalidate_context;
570 
571 	/*
572 	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
573 	 * read it back, for each of the BAR offsets within config space.
574 	 */
575 	u32 probed_bar[PCI_STD_NUM_BARS];
576 };
577 
578 struct hv_pci_compl {
579 	struct completion host_event;
580 	s32 completion_status;
581 };
582 
583 static void hv_pci_onchannelcallback(void *context);
584 
585 #ifdef CONFIG_X86
586 #define DELIVERY_MODE	APIC_DELIVERY_MODE_FIXED
587 #define FLOW_HANDLER	handle_edge_irq
588 #define FLOW_NAME	"edge"
589 
590 static int hv_pci_irqchip_init(void)
591 {
592 	return 0;
593 }
594 
595 static struct irq_domain *hv_pci_get_root_domain(void)
596 {
597 	return x86_vector_domain;
598 }
599 
600 static unsigned int hv_msi_get_int_vector(struct irq_data *data)
601 {
602 	struct irq_cfg *cfg = irqd_cfg(data);
603 
604 	return cfg->vector;
605 }
606 
607 static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
608 				       struct msi_desc *msi_desc)
609 {
610 	msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
611 	msi_entry->data.as_uint32 = msi_desc->msg.data;
612 }
613 
614 static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
615 			  int nvec, msi_alloc_info_t *info)
616 {
617 	return pci_msi_prepare(domain, dev, nvec, info);
618 }
619 
620 /**
621  * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
622  * affinity.
623  * @data:	Describes the IRQ
624  *
 625  * Build a new destination for the MSI and make a hypercall to
626  * update the Interrupt Redirection Table. "Device Logical ID"
627  * is built out of this PCI bus's instance GUID and the function
628  * number of the device.
629  */
630 static void hv_arch_irq_unmask(struct irq_data *data)
631 {
632 	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
633 	struct hv_retarget_device_interrupt *params;
634 	struct hv_pcibus_device *hbus;
635 	struct cpumask *dest;
636 	cpumask_var_t tmp;
637 	struct pci_bus *pbus;
638 	struct pci_dev *pdev;
639 	unsigned long flags;
640 	u32 var_size = 0;
641 	int cpu, nr_bank;
642 	u64 res;
643 
644 	dest = irq_data_get_effective_affinity_mask(data);
645 	pdev = msi_desc_to_pci_dev(msi_desc);
646 	pbus = pdev->bus;
647 	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
648 
649 	spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
650 
651 	params = &hbus->retarget_msi_interrupt_params;
652 	memset(params, 0, sizeof(*params));
653 	params->partition_id = HV_PARTITION_ID_SELF;
654 	params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
655 	hv_set_msi_entry_from_desc(&params->int_entry.msi_entry, msi_desc);
656 	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
657 			   (hbus->hdev->dev_instance.b[4] << 16) |
658 			   (hbus->hdev->dev_instance.b[7] << 8) |
659 			   (hbus->hdev->dev_instance.b[6] & 0xf8) |
660 			   PCI_FUNC(pdev->devfn);
661 	params->int_target.vector = hv_msi_get_int_vector(data);
662 
663 	/*
664 	 * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by
665 	 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
666 	 * spurious interrupt storm. Not doing so does not seem to have a
667 	 * negative effect (yet?).
668 	 */
669 
670 	if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
671 		/*
672 		 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
673 		 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
674 		 * with >64 VP support.
675 		 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
676 		 * is not sufficient for this hypercall.
677 		 */
678 		params->int_target.flags |=
679 			HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
680 
681 		if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
682 			res = 1;
683 			goto exit_unlock;
684 		}
685 
686 		cpumask_and(tmp, dest, cpu_online_mask);
687 		nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
688 		free_cpumask_var(tmp);
689 
690 		if (nr_bank <= 0) {
691 			res = 1;
692 			goto exit_unlock;
693 		}
694 
695 		/*
696 		 * var-sized hypercall, var-size starts after vp_mask (thus
697 		 * vp_set.format does not count, but vp_set.valid_bank_mask
698 		 * does).
699 		 */
700 		var_size = 1 + nr_bank;
701 	} else {
702 		for_each_cpu_and(cpu, dest, cpu_online_mask) {
703 			params->int_target.vp_mask |=
704 				(1ULL << hv_cpu_number_to_vp_number(cpu));
705 		}
706 	}
707 
708 	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
709 			      params, NULL);
710 
711 exit_unlock:
712 	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
713 
714 	/*
715 	 * During hibernation, when a CPU is offlined, the kernel tries
716 	 * to move the interrupt to the remaining CPUs that haven't
717 	 * been offlined yet. In this case, the below hv_do_hypercall()
718 	 * always fails since the vmbus channel has been closed:
719 	 * refer to cpu_disable_common() -> fixup_irqs() ->
720 	 * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
721 	 *
722 	 * Suppress the error message for hibernation because the failure
723 	 * during hibernation does not matter (at this time all the devices
724 	 * have been frozen). Note: the correct affinity info is still updated
725 	 * into the irqdata data structure in migrate_one_irq() ->
726 	 * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
727 	 * resumes, hv_pci_restore_msi_state() is able to correctly restore
728 	 * the interrupt with the correct affinity.
729 	 */
730 	if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
731 		dev_err(&hbus->hdev->device,
732 			"%s() failed: %#llx", __func__, res);
733 }
734 #elif defined(CONFIG_ARM64)
735 /*
 736  * SPI vectors to use for vPCI; the arch SPI range is [32, 1019], but leave a bit
 737  * of room at the start to allow SPIs to be specified through ACPI, and start at
 738  * a power of two to satisfy the power-of-2 multi-MSI requirement.
739  */
740 #define HV_PCI_MSI_SPI_START	64
741 #define HV_PCI_MSI_SPI_NR	(1020 - HV_PCI_MSI_SPI_START)
742 #define DELIVERY_MODE		0
743 #define FLOW_HANDLER		NULL
744 #define FLOW_NAME		NULL
745 #define hv_msi_prepare		NULL
746 
747 struct hv_pci_chip_data {
748 	DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR);
749 	struct mutex	map_lock;
750 };
751 
752 /* Hyper-V vPCI MSI GIC IRQ domain */
753 static struct irq_domain *hv_msi_gic_irq_domain;
754 
755 /* Hyper-V PCI MSI IRQ chip */
756 static struct irq_chip hv_arm64_msi_irq_chip = {
757 	.name = "MSI",
758 	.irq_set_affinity = irq_chip_set_affinity_parent,
759 	.irq_eoi = irq_chip_eoi_parent,
760 	.irq_mask = irq_chip_mask_parent,
761 	.irq_unmask = irq_chip_unmask_parent
762 };
763 
764 static unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
765 {
766 	return irqd->parent_data->hwirq;
767 }
768 
769 static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
770 				       struct msi_desc *msi_desc)
771 {
772 	msi_entry->address = ((u64)msi_desc->msg.address_hi << 32) |
773 			      msi_desc->msg.address_lo;
774 	msi_entry->data = msi_desc->msg.data;
775 }
776 
777 /*
778  * @nr_bm_irqs:		Indicates the number of IRQs that were allocated from
779  *			the bitmap.
780  * @nr_dom_irqs:	Indicates the number of IRQs that were allocated from
781  *			the parent domain.
782  */
783 static void hv_pci_vec_irq_free(struct irq_domain *domain,
784 				unsigned int virq,
785 				unsigned int nr_bm_irqs,
786 				unsigned int nr_dom_irqs)
787 {
788 	struct hv_pci_chip_data *chip_data = domain->host_data;
789 	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
790 	int first = d->hwirq - HV_PCI_MSI_SPI_START;
791 	int i;
792 
793 	mutex_lock(&chip_data->map_lock);
794 	bitmap_release_region(chip_data->spi_map,
795 			      first,
796 			      get_count_order(nr_bm_irqs));
797 	mutex_unlock(&chip_data->map_lock);
798 	for (i = 0; i < nr_dom_irqs; i++) {
799 		if (i)
800 			d = irq_domain_get_irq_data(domain, virq + i);
801 		irq_domain_reset_irq_data(d);
802 	}
803 
804 	irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs);
805 }
806 
807 static void hv_pci_vec_irq_domain_free(struct irq_domain *domain,
808 				       unsigned int virq,
809 				       unsigned int nr_irqs)
810 {
811 	hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs);
812 }
813 
814 static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain,
815 				       unsigned int nr_irqs,
816 				       irq_hw_number_t *hwirq)
817 {
818 	struct hv_pci_chip_data *chip_data = domain->host_data;
819 	int index;
820 
821 	/* Find and allocate region from the SPI bitmap */
822 	mutex_lock(&chip_data->map_lock);
823 	index = bitmap_find_free_region(chip_data->spi_map,
824 					HV_PCI_MSI_SPI_NR,
825 					get_count_order(nr_irqs));
826 	mutex_unlock(&chip_data->map_lock);
827 	if (index < 0)
828 		return -ENOSPC;
829 
830 	*hwirq = index + HV_PCI_MSI_SPI_START;
831 
832 	return 0;
833 }
834 
835 static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
836 					   unsigned int virq,
837 					   irq_hw_number_t hwirq)
838 {
839 	struct irq_fwspec fwspec;
840 	struct irq_data *d;
841 	int ret;
842 
843 	fwspec.fwnode = domain->parent->fwnode;
844 	fwspec.param_count = 2;
845 	fwspec.param[0] = hwirq;
846 	fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
847 
848 	ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
849 	if (ret)
850 		return ret;
851 
852 	/*
853 	 * Since the interrupt specifier is not coming from ACPI or DT, the
854 	 * trigger type will need to be set explicitly. Otherwise, it will be
855 	 * set to whatever is in the GIC configuration.
856 	 */
857 	d = irq_domain_get_irq_data(domain->parent, virq);
858 
859 	return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
860 }
861 
862 static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain,
863 				       unsigned int virq, unsigned int nr_irqs,
864 				       void *args)
865 {
866 	irq_hw_number_t hwirq;
867 	unsigned int i;
868 	int ret;
869 
870 	ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq);
871 	if (ret)
872 		return ret;
873 
874 	for (i = 0; i < nr_irqs; i++) {
875 		ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i,
876 						      hwirq + i);
877 		if (ret) {
878 			hv_pci_vec_irq_free(domain, virq, nr_irqs, i);
879 			return ret;
880 		}
881 
882 		irq_domain_set_hwirq_and_chip(domain, virq + i,
883 					      hwirq + i,
884 					      &hv_arm64_msi_irq_chip,
885 					      domain->host_data);
886 		pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i);
887 	}
888 
889 	return 0;
890 }
891 
892 /*
893  * Pick the first cpu as the irq affinity that can be temporarily used for
894  * composing MSI from the hypervisor. GIC will eventually set the right
895  * affinity for the irq and the 'unmask' will retarget the interrupt to that
896  * cpu.
897  */
898 static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
899 					  struct irq_data *irqd, bool reserve)
900 {
901 	int cpu = cpumask_first(cpu_present_mask);
902 
903 	irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
904 
905 	return 0;
906 }
907 
908 static const struct irq_domain_ops hv_pci_domain_ops = {
909 	.alloc	= hv_pci_vec_irq_domain_alloc,
910 	.free	= hv_pci_vec_irq_domain_free,
911 	.activate = hv_pci_vec_irq_domain_activate,
912 };
913 
914 static int hv_pci_irqchip_init(void)
915 {
916 	static struct hv_pci_chip_data *chip_data;
917 	struct fwnode_handle *fn = NULL;
918 	int ret = -ENOMEM;
919 
920 	chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
921 	if (!chip_data)
922 		return ret;
923 
924 	mutex_init(&chip_data->map_lock);
925 	fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64");
926 	if (!fn)
927 		goto free_chip;
928 
929 	/*
 930 	 * Once enabled, the IRQ domain should not be removed, since there is
 931 	 * no way to ensure that all of the corresponding devices are also gone
 932 	 * and that no more interrupts will be generated.
933 	 */
934 	hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
935 							  fn, &hv_pci_domain_ops,
936 							  chip_data);
937 
938 	if (!hv_msi_gic_irq_domain) {
939 		pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
940 		goto free_chip;
941 	}
942 
943 	return 0;
944 
945 free_chip:
946 	kfree(chip_data);
947 	if (fn)
948 		irq_domain_free_fwnode(fn);
949 
950 	return ret;
951 }
952 
953 static struct irq_domain *hv_pci_get_root_domain(void)
954 {
955 	return hv_msi_gic_irq_domain;
956 }
957 
958 /*
 959  * SPIs are used for the interrupts of PCI devices, and SPIs are managed via
 960  * GICD registers, which Hyper-V already supports, so no hypercall is needed.
961  */
962 static void hv_arch_irq_unmask(struct irq_data *data) { }
963 #endif /* CONFIG_ARM64 */
964 
965 /**
966  * hv_pci_generic_compl() - Invoked for a completion packet
967  * @context:		Set up by the sender of the packet.
968  * @resp:		The response packet
969  * @resp_packet_size:	Size in bytes of the packet
970  *
971  * This function is used to trigger an event and report status
972  * for any message for which the completion packet contains a
973  * status and nothing else.
974  */
975 static void hv_pci_generic_compl(void *context, struct pci_response *resp,
976 				 int resp_packet_size)
977 {
978 	struct hv_pci_compl *comp_pkt = context;
979 
980 	if (resp_packet_size >= offsetofend(struct pci_response, status))
981 		comp_pkt->completion_status = resp->status;
982 	else
983 		comp_pkt->completion_status = -1;
984 
985 	complete(&comp_pkt->host_event);
986 }
987 
988 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
989 						u32 wslot);
990 
991 static void get_pcichild(struct hv_pci_dev *hpdev)
992 {
993 	refcount_inc(&hpdev->refs);
994 }
995 
996 static void put_pcichild(struct hv_pci_dev *hpdev)
997 {
998 	if (refcount_dec_and_test(&hpdev->refs))
999 		kfree(hpdev);
1000 }
1001 
1002 /*
1003  * There is no good way to get notified from vmbus_onoffer_rescind(),
1004  * so let's use polling here, since this is not a hot path.
1005  */
1006 static int wait_for_response(struct hv_device *hdev,
1007 			     struct completion *comp)
1008 {
1009 	while (true) {
1010 		if (hdev->channel->rescind) {
1011 			dev_warn_once(&hdev->device, "The device is gone.\n");
1012 			return -ENODEV;
1013 		}
1014 
1015 		if (wait_for_completion_timeout(comp, HZ / 10))
1016 			break;
1017 	}
1018 
1019 	return 0;
1020 }
1021 
1022 /**
1023  * devfn_to_wslot() - Convert from Linux PCI slot to Windows
1024  * @devfn:	The Linux representation of PCI slot
1025  *
1026  * Windows uses a slightly different representation of PCI slot.
1027  *
1028  * Return: The Windows representation
1029  */
1030 static u32 devfn_to_wslot(int devfn)
1031 {
1032 	union win_slot_encoding wslot;
1033 
1034 	wslot.slot = 0;
1035 	wslot.bits.dev = PCI_SLOT(devfn);
1036 	wslot.bits.func = PCI_FUNC(devfn);
1037 
1038 	return wslot.slot;
1039 }
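
/*
 * Example: Linux devfn 0x0a (device 1, function 2) becomes wslot 0x41,
 * since the Windows encoding keeps the device number in the low five bits
 * and the function number in the three bits above it (with the bit-field
 * layout used on x86/arm64).
 */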
1040 
1041 /**
1042  * wslot_to_devfn() - Convert from Windows PCI slot to Linux
1043  * @wslot:	The Windows representation of PCI slot
1044  *
1045  * Windows uses a slightly different representation of PCI slot.
1046  *
1047  * Return: The Linux representation
1048  */
1049 static int wslot_to_devfn(u32 wslot)
1050 {
1051 	union win_slot_encoding slot_no;
1052 
1053 	slot_no.slot = wslot;
1054 	return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
1055 }
1056 
1057 /*
1058  * PCI Configuration Space for these root PCI buses is implemented as a pair
1059  * of pages in memory-mapped I/O space.  Writing to the first page chooses
1060  * the PCI function being written or read.  Once the first page has been
1061  * written to, the following page maps in the entire configuration space of
1062  * the function.
1063  */
1064 
1065 /**
1066  * _hv_pcifront_read_config() - Internal PCI config read
1067  * @hpdev:	The PCI driver's representation of the device
1068  * @where:	Offset within config space
1069  * @size:	Size of the transfer
1070  * @val:	Pointer to the buffer receiving the data
1071  */
1072 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
1073 				     int size, u32 *val)
1074 {
1075 	unsigned long flags;
1076 	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
1077 
1078 	/*
1079 	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
1080 	 */
1081 	if (where + size <= PCI_COMMAND) {
1082 		memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
1083 	} else if (where >= PCI_CLASS_REVISION && where + size <=
1084 		   PCI_CACHE_LINE_SIZE) {
1085 		memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
1086 		       PCI_CLASS_REVISION, size);
1087 	} else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
1088 		   PCI_ROM_ADDRESS) {
1089 		memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
1090 		       PCI_SUBSYSTEM_VENDOR_ID, size);
1091 	} else if (where >= PCI_ROM_ADDRESS && where + size <=
1092 		   PCI_CAPABILITY_LIST) {
1093 		/* ROM BARs are unimplemented */
1094 		*val = 0;
1095 	} else if (where >= PCI_INTERRUPT_LINE && where + size <=
1096 		   PCI_INTERRUPT_PIN) {
1097 		/*
1098 		 * Interrupt Line and Interrupt PIN are hard-wired to zero
1099 		 * because this front-end only supports message-signaled
1100 		 * interrupts.
1101 		 */
1102 		*val = 0;
1103 	} else if (where + size <= CFG_PAGE_SIZE) {
1104 		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
1105 		/* Choose the function to be read. (See comment above) */
1106 		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
1107 		/* Make sure the function was chosen before we start reading. */
1108 		mb();
1109 		/* Read from that function's config space. */
1110 		switch (size) {
1111 		case 1:
1112 			*val = readb(addr);
1113 			break;
1114 		case 2:
1115 			*val = readw(addr);
1116 			break;
1117 		default:
1118 			*val = readl(addr);
1119 			break;
1120 		}
1121 		/*
1122 		 * Make sure the read was done before we release the spinlock
1123 		 * allowing consecutive reads/writes.
1124 		 */
1125 		mb();
1126 		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
1127 	} else {
1128 		dev_err(&hpdev->hbus->hdev->device,
1129 			"Attempt to read beyond a function's config space.\n");
1130 	}
1131 }
1132 
1133 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
1134 {
1135 	u16 ret;
1136 	unsigned long flags;
1137 	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
1138 			     PCI_VENDOR_ID;
1139 
1140 	spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
1141 
1142 	/* Choose the function to be read. (See comment above) */
1143 	writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
1144 	/* Make sure the function was chosen before we start reading. */
1145 	mb();
1146 	/* Read from that function's config space. */
1147 	ret = readw(addr);
1148 	/*
1149 	 * mb() is not required here, because the spin_unlock_irqrestore()
1150 	 * is a barrier.
1151 	 */
1152 
1153 	spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
1154 
1155 	return ret;
1156 }
1157 
1158 /**
1159  * _hv_pcifront_write_config() - Internal PCI config write
1160  * @hpdev:	The PCI driver's representation of the device
1161  * @where:	Offset within config space
1162  * @size:	Size of the transfer
1163  * @val:	The data being transferred
1164  */
1165 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
1166 				      int size, u32 val)
1167 {
1168 	unsigned long flags;
1169 	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
1170 
1171 	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
1172 	    where + size <= PCI_CAPABILITY_LIST) {
1173 		/* SSIDs and ROM BARs are read-only */
1174 	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
1175 		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
1176 		/* Choose the function to be written. (See comment above) */
1177 		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
1178 		/* Make sure the function was chosen before we start writing. */
1179 		wmb();
1180 		/* Write to that function's config space. */
1181 		switch (size) {
1182 		case 1:
1183 			writeb(val, addr);
1184 			break;
1185 		case 2:
1186 			writew(val, addr);
1187 			break;
1188 		default:
1189 			writel(val, addr);
1190 			break;
1191 		}
1192 		/*
1193 		 * Make sure the write was done before we release the spinlock
1194 		 * allowing consecutive reads/writes.
1195 		 */
1196 		mb();
1197 		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
1198 	} else {
1199 		dev_err(&hpdev->hbus->hdev->device,
1200 			"Attempt to write beyond a function's config space.\n");
1201 	}
1202 }
1203 
1204 /**
1205  * hv_pcifront_read_config() - Read configuration space
1206  * @bus: PCI Bus structure
1207  * @devfn: Device/function
1208  * @where: Offset from base
1209  * @size: Byte/word/dword
1210  * @val: Value to be read
1211  *
1212  * Return: PCIBIOS_SUCCESSFUL on success
1213  *	   PCIBIOS_DEVICE_NOT_FOUND on failure
1214  */
1215 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
1216 				   int where, int size, u32 *val)
1217 {
1218 	struct hv_pcibus_device *hbus =
1219 		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
1220 	struct hv_pci_dev *hpdev;
1221 
1222 	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
1223 	if (!hpdev)
1224 		return PCIBIOS_DEVICE_NOT_FOUND;
1225 
1226 	_hv_pcifront_read_config(hpdev, where, size, val);
1227 
1228 	put_pcichild(hpdev);
1229 	return PCIBIOS_SUCCESSFUL;
1230 }
1231 
1232 /**
1233  * hv_pcifront_write_config() - Write configuration space
1234  * @bus: PCI Bus structure
1235  * @devfn: Device/function
1236  * @where: Offset from base
1237  * @size: Byte/word/dword
1238  * @val: Value to be written to device
1239  *
1240  * Return: PCIBIOS_SUCCESSFUL on success
1241  *	   PCIBIOS_DEVICE_NOT_FOUND on failure
1242  */
1243 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
1244 				    int where, int size, u32 val)
1245 {
1246 	struct hv_pcibus_device *hbus =
1247 	    container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
1248 	struct hv_pci_dev *hpdev;
1249 
1250 	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
1251 	if (!hpdev)
1252 		return PCIBIOS_DEVICE_NOT_FOUND;
1253 
1254 	_hv_pcifront_write_config(hpdev, where, size, val);
1255 
1256 	put_pcichild(hpdev);
1257 	return PCIBIOS_SUCCESSFUL;
1258 }
1259 
1260 /* PCIe operations */
1261 static struct pci_ops hv_pcifront_ops = {
1262 	.read  = hv_pcifront_read_config,
1263 	.write = hv_pcifront_write_config,
1264 };
1265 
1266 /*
1267  * Paravirtual backchannel
1268  *
1269  * Hyper-V SR-IOV provides a backchannel mechanism in software for
1270  * communication between a VF driver and a PF driver.  These
1271  * "configuration blocks" are similar in concept to PCI configuration space,
1272  * but instead of doing reads and writes in 32-bit chunks through a very slow
1273  * path, packets of up to 128 bytes can be sent or received asynchronously.
1274  *
1275  * Nearly every SR-IOV device contains just such a communications channel in
1276  * hardware, so using this one in software is usually optional.  Using the
1277  * software channel, however, allows driver implementers to leverage software
1278  * tools that fuzz the communications channel looking for vulnerabilities.
1279  *
1280  * The usage model for these packets puts the responsibility for reading or
1281  * writing on the VF driver.  The VF driver sends a read or a write packet,
1282  * indicating which "block" is being referred to by number.
1283  *
1284  * If the PF driver wishes to initiate communication, it can "invalidate" one or
1285  * more of the first 64 blocks.  This invalidation is delivered via a callback
 1286  * supplied to this driver by the VF driver.
1287  *
1288  * No protocol is implied, except that supplied by the PF and VF drivers.
1289  */
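
/*
 * Minimal usage sketch from a VF driver's perspective (illustrative only;
 * "my_ctx" and "my_invalidate_cb" are hypothetical names, and real callers
 * typically reach these helpers through the ops interface this driver
 * exports rather than calling the static functions below directly):
 *
 *	u8 buf[HV_CONFIG_BLOCK_SIZE_MAX];
 *	unsigned int bytes_returned;
 *
 *	hv_register_block_invalidate(pdev, my_ctx, my_invalidate_cb);
 *	hv_read_config_block(pdev, buf, sizeof(buf), 0, &bytes_returned);
 *	hv_write_config_block(pdev, buf, bytes_returned, 0);
 */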
1290 
1291 struct hv_read_config_compl {
1292 	struct hv_pci_compl comp_pkt;
1293 	void *buf;
1294 	unsigned int len;
1295 	unsigned int bytes_returned;
1296 };
1297 
1298 /**
1299  * hv_pci_read_config_compl() - Invoked when a response packet
1300  * for a read config block operation arrives.
1301  * @context:		Identifies the read config operation
1302  * @resp:		The response packet itself
1303  * @resp_packet_size:	Size in bytes of the response packet
1304  */
1305 static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
1306 				     int resp_packet_size)
1307 {
1308 	struct hv_read_config_compl *comp = context;
1309 	struct pci_read_block_response *read_resp =
1310 		(struct pci_read_block_response *)resp;
1311 	unsigned int data_len, hdr_len;
1312 
1313 	hdr_len = offsetof(struct pci_read_block_response, bytes);
1314 	if (resp_packet_size < hdr_len) {
1315 		comp->comp_pkt.completion_status = -1;
1316 		goto out;
1317 	}
1318 
1319 	data_len = resp_packet_size - hdr_len;
1320 	if (data_len > 0 && read_resp->status == 0) {
1321 		comp->bytes_returned = min(comp->len, data_len);
1322 		memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
1323 	} else {
1324 		comp->bytes_returned = 0;
1325 	}
1326 
1327 	comp->comp_pkt.completion_status = read_resp->status;
1328 out:
1329 	complete(&comp->comp_pkt.host_event);
1330 }
1331 
1332 /**
1333  * hv_read_config_block() - Sends a read config block request to
1334  * the back-end driver running in the Hyper-V parent partition.
1335  * @pdev:		The PCI driver's representation for this device.
1336  * @buf:		Buffer into which the config block will be copied.
1337  * @len:		Size in bytes of buf.
1338  * @block_id:		Identifies the config block which has been requested.
1339  * @bytes_returned:	Size which came back from the back-end driver.
1340  *
1341  * Return: 0 on success, -errno on failure
1342  */
1343 static int hv_read_config_block(struct pci_dev *pdev, void *buf,
1344 				unsigned int len, unsigned int block_id,
1345 				unsigned int *bytes_returned)
1346 {
1347 	struct hv_pcibus_device *hbus =
1348 		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1349 			     sysdata);
1350 	struct {
1351 		struct pci_packet pkt;
1352 		char buf[sizeof(struct pci_read_block)];
1353 	} pkt;
1354 	struct hv_read_config_compl comp_pkt;
1355 	struct pci_read_block *read_blk;
1356 	int ret;
1357 
1358 	if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
1359 		return -EINVAL;
1360 
1361 	init_completion(&comp_pkt.comp_pkt.host_event);
1362 	comp_pkt.buf = buf;
1363 	comp_pkt.len = len;
1364 
1365 	memset(&pkt, 0, sizeof(pkt));
1366 	pkt.pkt.completion_func = hv_pci_read_config_compl;
1367 	pkt.pkt.compl_ctxt = &comp_pkt;
1368 	read_blk = (struct pci_read_block *)&pkt.pkt.message;
1369 	read_blk->message_type.type = PCI_READ_BLOCK;
1370 	read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
1371 	read_blk->block_id = block_id;
1372 	read_blk->bytes_requested = len;
1373 
1374 	ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
1375 			       sizeof(*read_blk), (unsigned long)&pkt.pkt,
1376 			       VM_PKT_DATA_INBAND,
1377 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1378 	if (ret)
1379 		return ret;
1380 
1381 	ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
1382 	if (ret)
1383 		return ret;
1384 
1385 	if (comp_pkt.comp_pkt.completion_status != 0 ||
1386 	    comp_pkt.bytes_returned == 0) {
1387 		dev_err(&hbus->hdev->device,
1388 			"Read Config Block failed: 0x%x, bytes_returned=%d\n",
1389 			comp_pkt.comp_pkt.completion_status,
1390 			comp_pkt.bytes_returned);
1391 		return -EIO;
1392 	}
1393 
1394 	*bytes_returned = comp_pkt.bytes_returned;
1395 	return 0;
1396 }
1397 
1398 /**
1399  * hv_pci_write_config_compl() - Invoked when a response packet for a write
1400  * config block operation arrives.
1401  * @context:		Identifies the write config operation
1402  * @resp:		The response packet itself
1403  * @resp_packet_size:	Size in bytes of the response packet
1404  */
1405 static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
1406 				      int resp_packet_size)
1407 {
1408 	struct hv_pci_compl *comp_pkt = context;
1409 
1410 	comp_pkt->completion_status = resp->status;
1411 	complete(&comp_pkt->host_event);
1412 }
1413 
1414 /**
1415  * hv_write_config_block() - Sends a write config block request to the
1416  * back-end driver running in the Hyper-V parent partition.
1417  * @pdev:		The PCI driver's representation for this device.
 1418  * @buf:		Buffer from which the config block will be copied.
1419  * @len:		Size in bytes of buf.
1420  * @block_id:		Identifies the config block which is being written.
1421  *
1422  * Return: 0 on success, -errno on failure
1423  */
1424 static int hv_write_config_block(struct pci_dev *pdev, void *buf,
1425 				unsigned int len, unsigned int block_id)
1426 {
1427 	struct hv_pcibus_device *hbus =
1428 		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1429 			     sysdata);
1430 	struct {
1431 		struct pci_packet pkt;
1432 		char buf[sizeof(struct pci_write_block)];
1433 		u32 reserved;
1434 	} pkt;
1435 	struct hv_pci_compl comp_pkt;
1436 	struct pci_write_block *write_blk;
1437 	u32 pkt_size;
1438 	int ret;
1439 
1440 	if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
1441 		return -EINVAL;
1442 
1443 	init_completion(&comp_pkt.host_event);
1444 
1445 	memset(&pkt, 0, sizeof(pkt));
1446 	pkt.pkt.completion_func = hv_pci_write_config_compl;
1447 	pkt.pkt.compl_ctxt = &comp_pkt;
1448 	write_blk = (struct pci_write_block *)&pkt.pkt.message;
1449 	write_blk->message_type.type = PCI_WRITE_BLOCK;
1450 	write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
1451 	write_blk->block_id = block_id;
1452 	write_blk->byte_count = len;
1453 	memcpy(write_blk->bytes, buf, len);
1454 	pkt_size = offsetof(struct pci_write_block, bytes) + len;
1455 	/*
1456 	 * This quirk is required on some hosts shipped around 2018, because
1457 	 * these hosts don't check the pkt_size correctly (new hosts have been
1458 	 * fixed since early 2019). The quirk is also safe on very old hosts
1459 	 * and new hosts, because, on them, what really matters is the length
1460 	 * specified in write_blk->byte_count.
1461 	 */
1462 	pkt_size += sizeof(pkt.reserved);
1463 
1464 	ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
1465 			       (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
1466 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1467 	if (ret)
1468 		return ret;
1469 
1470 	ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
1471 	if (ret)
1472 		return ret;
1473 
1474 	if (comp_pkt.completion_status != 0) {
1475 		dev_err(&hbus->hdev->device,
1476 			"Write Config Block failed: 0x%x\n",
1477 			comp_pkt.completion_status);
1478 		return -EIO;
1479 	}
1480 
1481 	return 0;
1482 }
1483 
1484 /**
1485  * hv_register_block_invalidate() - Invoked when a config block invalidation
1486  * arrives from the back-end driver.
1487  * @pdev:		The PCI driver's representation for this device.
1488  * @context:		Identifies the device.
1489  * @block_invalidate:	Identifies all of the blocks being invalidated.
1490  *
1491  * Return: 0 on success, -errno on failure
1492  */
1493 static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
1494 					void (*block_invalidate)(void *context,
1495 								 u64 block_mask))
1496 {
1497 	struct hv_pcibus_device *hbus =
1498 		container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1499 			     sysdata);
1500 	struct hv_pci_dev *hpdev;
1501 
1502 	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1503 	if (!hpdev)
1504 		return -ENODEV;
1505 
1506 	hpdev->block_invalidate = block_invalidate;
1507 	hpdev->invalidate_context = context;
1508 
1509 	put_pcichild(hpdev);
1510 	return 0;
1511 
1512 }
1513 
1514 /* Interrupt management hooks */
1515 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
1516 			     struct tran_int_desc *int_desc)
1517 {
1518 	struct pci_delete_interrupt *int_pkt;
1519 	struct {
1520 		struct pci_packet pkt;
1521 		u8 buffer[sizeof(struct pci_delete_interrupt)];
1522 	} ctxt;
1523 
1524 	memset(&ctxt, 0, sizeof(ctxt));
1525 	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
1526 	int_pkt->message_type.type =
1527 		PCI_DELETE_INTERRUPT_MESSAGE;
1528 	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1529 	int_pkt->int_desc = *int_desc;
1530 	vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
1531 			 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
1532 	kfree(int_desc);
1533 }
1534 
1535 /**
1536  * hv_msi_free() - Free the MSI.
1537  * @domain:	The interrupt domain pointer
1538  * @info:	Extra MSI-related context
1539  * @irq:	Identifies the IRQ.
1540  *
1541  * The Hyper-V parent partition and hypervisor are tracking the
1542  * messages that are in use, keeping the interrupt redirection
1543  * table up to date.  This callback sends a message that frees
1544  * the IRT entry and related tracking nonsense.
1545  */
1546 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
1547 			unsigned int irq)
1548 {
1549 	struct hv_pcibus_device *hbus;
1550 	struct hv_pci_dev *hpdev;
1551 	struct pci_dev *pdev;
1552 	struct tran_int_desc *int_desc;
1553 	struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
1554 	struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
1555 
1556 	pdev = msi_desc_to_pci_dev(msi);
1557 	hbus = info->data;
1558 	int_desc = irq_data_get_irq_chip_data(irq_data);
1559 	if (!int_desc)
1560 		return;
1561 
1562 	irq_data->chip_data = NULL;
1563 	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1564 	if (!hpdev) {
1565 		kfree(int_desc);
1566 		return;
1567 	}
1568 
1569 	hv_int_desc_free(hpdev, int_desc);
1570 	put_pcichild(hpdev);
1571 }
1572 
1573 static void hv_irq_mask(struct irq_data *data)
1574 {
1575 	pci_msi_mask_irq(data);
1576 	if (data->parent_data->chip->irq_mask)
1577 		irq_chip_mask_parent(data);
1578 }
1579 
1580 static void hv_irq_unmask(struct irq_data *data)
1581 {
1582 	hv_arch_irq_unmask(data);
1583 
1584 	if (data->parent_data->chip->irq_unmask)
1585 		irq_chip_unmask_parent(data);
1586 	pci_msi_unmask_irq(data);
1587 }
1588 
1589 struct compose_comp_ctxt {
1590 	struct hv_pci_compl comp_pkt;
1591 	struct tran_int_desc int_desc;
1592 };
1593 
1594 static void hv_pci_compose_compl(void *context, struct pci_response *resp,
1595 				 int resp_packet_size)
1596 {
1597 	struct compose_comp_ctxt *comp_pkt = context;
1598 	struct pci_create_int_response *int_resp =
1599 		(struct pci_create_int_response *)resp;
1600 
1601 	comp_pkt->comp_pkt.completion_status = resp->status;
1602 	comp_pkt->int_desc = int_resp->int_desc;
1603 	complete(&comp_pkt->comp_pkt.host_event);
1604 }
1605 
1606 static u32 hv_compose_msi_req_v1(
1607 	struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
1608 	u32 slot, u8 vector)
1609 {
1610 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1611 	int_pkt->wslot.slot = slot;
1612 	int_pkt->int_desc.vector = vector;
1613 	int_pkt->int_desc.vector_count = 1;
1614 	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
1615 
1616 	/*
1617 	 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
1618 	 * hv_irq_unmask().
1619 	 */
1620 	int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
1621 
1622 	return sizeof(*int_pkt);
1623 }
1624 
1625 /*
1626  * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
1627  * by subsequent retarget in hv_irq_unmask().
1628  */
1629 static int hv_compose_msi_req_get_cpu(struct cpumask *affinity)
1630 {
1631 	return cpumask_first_and(affinity, cpu_online_mask);
1632 }
1633 
1634 static u32 hv_compose_msi_req_v2(
1635 	struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
1636 	u32 slot, u8 vector)
1637 {
1638 	int cpu;
1639 
1640 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
1641 	int_pkt->wslot.slot = slot;
1642 	int_pkt->int_desc.vector = vector;
1643 	int_pkt->int_desc.vector_count = 1;
1644 	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
1645 	cpu = hv_compose_msi_req_get_cpu(affinity);
1646 	int_pkt->int_desc.processor_array[0] =
1647 		hv_cpu_number_to_vp_number(cpu);
1648 	int_pkt->int_desc.processor_count = 1;
1649 
1650 	return sizeof(*int_pkt);
1651 }
1652 
1653 static u32 hv_compose_msi_req_v3(
1654 	struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity,
1655 	u32 slot, u32 vector)
1656 {
1657 	int cpu;
1658 
1659 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
1660 	int_pkt->wslot.slot = slot;
1661 	int_pkt->int_desc.vector = vector;
1662 	int_pkt->int_desc.reserved = 0;
1663 	int_pkt->int_desc.vector_count = 1;
1664 	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
1665 	cpu = hv_compose_msi_req_get_cpu(affinity);
1666 	int_pkt->int_desc.processor_array[0] =
1667 		hv_cpu_number_to_vp_number(cpu);
1668 	int_pkt->int_desc.processor_count = 1;
1669 
1670 	return sizeof(*int_pkt);
1671 }
1672 
1673 /**
1674  * hv_compose_msi_msg() - Supplies a valid MSI address/data
1675  * @data:	Everything about this MSI
1676  * @msg:	Buffer that is filled in by this function
1677  *
1678  * This function unpacks the IRQ looking for target CPU set, IDT
1679  * vector and mode and sends a message to the parent partition
1680  * asking for a mapping for that tuple in this partition.  The
1681  * response supplies a data value and address to which that data
1682  * should be written to trigger that interrupt.
1683  */
1684 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1685 {
1686 	struct hv_pcibus_device *hbus;
1687 	struct vmbus_channel *channel;
1688 	struct hv_pci_dev *hpdev;
1689 	struct pci_bus *pbus;
1690 	struct pci_dev *pdev;
1691 	struct cpumask *dest;
1692 	struct compose_comp_ctxt comp;
1693 	struct tran_int_desc *int_desc;
1694 	struct {
1695 		struct pci_packet pci_pkt;
1696 		union {
1697 			struct pci_create_interrupt v1;
1698 			struct pci_create_interrupt2 v2;
1699 			struct pci_create_interrupt3 v3;
1700 		} int_pkts;
1701 	} __packed ctxt;
1702 
1703 	u32 size;
1704 	int ret;
1705 
1706 	pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
1707 	dest = irq_data_get_effective_affinity_mask(data);
1708 	pbus = pdev->bus;
1709 	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1710 	channel = hbus->hdev->channel;
1711 	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1712 	if (!hpdev)
1713 		goto return_null_message;
1714 
1715 	/* Free any previous message that might have already been composed. */
1716 	if (data->chip_data) {
1717 		int_desc = data->chip_data;
1718 		data->chip_data = NULL;
1719 		hv_int_desc_free(hpdev, int_desc);
1720 	}
1721 
1722 	int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
1723 	if (!int_desc)
1724 		goto drop_reference;
1725 
1726 	memset(&ctxt, 0, sizeof(ctxt));
1727 	init_completion(&comp.comp_pkt.host_event);
1728 	ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
1729 	ctxt.pci_pkt.compl_ctxt = &comp;
1730 
1731 	switch (hbus->protocol_version) {
1732 	case PCI_PROTOCOL_VERSION_1_1:
1733 		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
1734 					dest,
1735 					hpdev->desc.win_slot.slot,
1736 					hv_msi_get_int_vector(data));
1737 		break;
1738 
1739 	case PCI_PROTOCOL_VERSION_1_2:
1740 	case PCI_PROTOCOL_VERSION_1_3:
1741 		size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
1742 					dest,
1743 					hpdev->desc.win_slot.slot,
1744 					hv_msi_get_int_vector(data));
1745 		break;
1746 
1747 	case PCI_PROTOCOL_VERSION_1_4:
1748 		size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
1749 					dest,
1750 					hpdev->desc.win_slot.slot,
1751 					hv_msi_get_int_vector(data));
1752 		break;
1753 
1754 	default:
		/*
		 * As we only negotiate protocol versions known to this
		 * driver, this path should never be hit. However, this
		 * is not a hot path, so we print a message to aid future
		 * updates.
		 */
1759 		dev_err(&hbus->hdev->device,
1760 			"Unexpected vPCI protocol, update driver.");
1761 		goto free_int_desc;
1762 	}
1763 
1764 	ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
1765 			       size, (unsigned long)&ctxt.pci_pkt,
1766 			       VM_PKT_DATA_INBAND,
1767 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1768 	if (ret) {
1769 		dev_err(&hbus->hdev->device,
1770 			"Sending request for interrupt failed: 0x%x",
1771 			comp.comp_pkt.completion_status);
1772 		goto free_int_desc;
1773 	}
1774 
1775 	/*
1776 	 * Prevents hv_pci_onchannelcallback() from running concurrently
1777 	 * in the tasklet.
1778 	 */
1779 	tasklet_disable_in_atomic(&channel->callback_event);
1780 
1781 	/*
1782 	 * Since this function is called with IRQ locks held, can't
1783 	 * do normal wait for completion; instead poll.
1784 	 */
1785 	while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1786 		unsigned long flags;
1787 
1788 		/* 0xFFFF means an invalid PCI VENDOR ID. */
1789 		if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
1790 			dev_err_once(&hbus->hdev->device,
1791 				     "the device has gone\n");
1792 			goto enable_tasklet;
1793 		}
1794 
1795 		/*
1796 		 * Make sure that the ring buffer data structure doesn't get
1797 		 * freed while we dereference the ring buffer pointer.  Test
1798 		 * for the channel's onchannel_callback being NULL within a
1799 		 * sched_lock critical section.  See also the inline comments
1800 		 * in vmbus_reset_channel_cb().
1801 		 */
1802 		spin_lock_irqsave(&channel->sched_lock, flags);
1803 		if (unlikely(channel->onchannel_callback == NULL)) {
1804 			spin_unlock_irqrestore(&channel->sched_lock, flags);
1805 			goto enable_tasklet;
1806 		}
1807 		hv_pci_onchannelcallback(hbus);
1808 		spin_unlock_irqrestore(&channel->sched_lock, flags);
1809 
1810 		if (hpdev->state == hv_pcichild_ejecting) {
1811 			dev_err_once(&hbus->hdev->device,
1812 				     "the device is being ejected\n");
1813 			goto enable_tasklet;
1814 		}
1815 
1816 		udelay(100);
1817 	}
1818 
1819 	tasklet_enable(&channel->callback_event);
1820 
1821 	if (comp.comp_pkt.completion_status < 0) {
1822 		dev_err(&hbus->hdev->device,
1823 			"Request for interrupt failed: 0x%x",
1824 			comp.comp_pkt.completion_status);
1825 		goto free_int_desc;
1826 	}
1827 
1828 	/*
1829 	 * Record the assignment so that this can be unwound later. Using
1830 	 * irq_set_chip_data() here would be appropriate, but the lock it takes
1831 	 * is already held.
1832 	 */
1833 	*int_desc = comp.int_desc;
1834 	data->chip_data = int_desc;
1835 
1836 	/* Pass up the result. */
1837 	msg->address_hi = comp.int_desc.address >> 32;
1838 	msg->address_lo = comp.int_desc.address & 0xffffffff;
1839 	msg->data = comp.int_desc.data;
1840 
1841 	put_pcichild(hpdev);
1842 	return;
1843 
1844 enable_tasklet:
1845 	tasklet_enable(&channel->callback_event);
1846 free_int_desc:
1847 	kfree(int_desc);
1848 drop_reference:
1849 	put_pcichild(hpdev);
1850 return_null_message:
1851 	msg->address_hi = 0;
1852 	msg->address_lo = 0;
1853 	msg->data = 0;
1854 }
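
/*
 * Illustrative example (hypothetical values): if the host's response
 * carried int_desc.address = 0xfee01004 and int_desc.data = 0x4071,
 * hv_compose_msi_msg() would pass up address_hi = 0x0,
 * address_lo = 0xfee01004 and data = 0x4071; the device then triggers
 * the interrupt by writing that data value to that address, just as
 * with a conventional MSI.
 */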
1855 
1856 /* HW Interrupt Chip Descriptor */
1857 static struct irq_chip hv_msi_irq_chip = {
1858 	.name			= "Hyper-V PCIe MSI",
1859 	.irq_compose_msi_msg	= hv_compose_msi_msg,
1860 	.irq_set_affinity	= irq_chip_set_affinity_parent,
1861 #ifdef CONFIG_X86
1862 	.irq_ack		= irq_chip_ack_parent,
1863 #elif defined(CONFIG_ARM64)
1864 	.irq_eoi		= irq_chip_eoi_parent,
1865 #endif
1866 	.irq_mask		= hv_irq_mask,
1867 	.irq_unmask		= hv_irq_unmask,
1868 };
1869 
1870 static struct msi_domain_ops hv_msi_ops = {
1871 	.msi_prepare	= hv_msi_prepare,
1872 	.msi_free	= hv_msi_free,
1873 };
1874 
1875 /**
1876  * hv_pcie_init_irq_domain() - Initialize IRQ domain
1877  * @hbus:	The root PCI bus
1878  *
1879  * This function creates an IRQ domain which will be used for
1880  * interrupts from devices that have been passed through.  These
1881  * devices only support MSI and MSI-X, not line-based interrupts
1882  * or simulations of line-based interrupts through PCIe's
1883  * fabric-layer messages.  Because interrupts are remapped, we
1884  * can support multi-message MSI here.
1885  *
1886  * Return: '0' on success and error value on failure
1887  */
1888 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
1889 {
1890 	hbus->msi_info.chip = &hv_msi_irq_chip;
1891 	hbus->msi_info.ops = &hv_msi_ops;
1892 	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
1893 		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
1894 		MSI_FLAG_PCI_MSIX);
1895 	hbus->msi_info.handler = FLOW_HANDLER;
1896 	hbus->msi_info.handler_name = FLOW_NAME;
1897 	hbus->msi_info.data = hbus;
1898 	hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode,
1899 						     &hbus->msi_info,
1900 						     hv_pci_get_root_domain());
1901 	if (!hbus->irq_domain) {
1902 		dev_err(&hbus->hdev->device,
1903 			"Failed to build an MSI IRQ domain\n");
1904 		return -ENODEV;
1905 	}
1906 
1907 	dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain);
1908 
1909 	return 0;
1910 }
1911 
1912 /**
1913  * get_bar_size() - Get the address space consumed by a BAR
1914  * @bar_val:	Value that a BAR returned after -1 was written
1915  *              to it.
1916  *
1917  * This function returns the size of the BAR, rounded up to 1
1918  * page.  It has to be rounded up because the hypervisor's page
1919  * table entry that maps the BAR into the VM can't specify an
1920  * offset within a page.  The invariant is that the hypervisor
 * must place any BAR smaller than a page at the beginning of a
 * page.
1923  *
1924  * Return:	Size in bytes of the consumed MMIO space.
1925  */
1926 static u64 get_bar_size(u64 bar_val)
1927 {
1928 	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
1929 			PAGE_SIZE);
1930 }
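
/*
 * Worked example (illustrative): a device decoding 8 KiB through a
 * 32-bit BAR reports a probed value of 0xffffe000.  After the caller
 * ORs in 0xffffffff00000000, bar_val & PCI_BASE_ADDRESS_MEM_MASK is
 * 0xffffffffffffe000, so 1 + ~bar_val = 0x2000 (8 KiB), which
 * round_up() leaves unchanged on a kernel with a 4 KiB PAGE_SIZE.
 */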
1931 
1932 /**
1933  * survey_child_resources() - Total all MMIO requirements
1934  * @hbus:	Root PCI bus, as understood by this driver
1935  */
1936 static void survey_child_resources(struct hv_pcibus_device *hbus)
1937 {
1938 	struct hv_pci_dev *hpdev;
1939 	resource_size_t bar_size = 0;
1940 	unsigned long flags;
1941 	struct completion *event;
1942 	u64 bar_val;
1943 	int i;
1944 
1945 	/* If nobody is waiting on the answer, don't compute it. */
1946 	event = xchg(&hbus->survey_event, NULL);
1947 	if (!event)
1948 		return;
1949 
1950 	/* If the answer has already been computed, go with it. */
1951 	if (hbus->low_mmio_space || hbus->high_mmio_space) {
1952 		complete(event);
1953 		return;
1954 	}
1955 
1956 	spin_lock_irqsave(&hbus->device_list_lock, flags);
1957 
1958 	/*
1959 	 * Due to an interesting quirk of the PCI spec, all memory regions
1960 	 * for a child device are a power of 2 in size and aligned in memory,
1961 	 * so it's sufficient to just add them up without tracking alignment.
1962 	 */
1963 	list_for_each_entry(hpdev, &hbus->children, list_entry) {
1964 		for (i = 0; i < PCI_STD_NUM_BARS; i++) {
1965 			if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
1966 				dev_err(&hbus->hdev->device,
1967 					"There's an I/O BAR in this list!\n");
1968 
1969 			if (hpdev->probed_bar[i] != 0) {
1970 				/*
1971 				 * A probed BAR has all the upper bits set that
1972 				 * can be changed.
1973 				 */
1974 
1975 				bar_val = hpdev->probed_bar[i];
1976 				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1977 					bar_val |=
1978 					((u64)hpdev->probed_bar[++i] << 32);
1979 				else
1980 					bar_val |= 0xffffffff00000000ULL;
1981 
1982 				bar_size = get_bar_size(bar_val);
1983 
1984 				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1985 					hbus->high_mmio_space += bar_size;
1986 				else
1987 					hbus->low_mmio_space += bar_size;
1988 			}
1989 		}
1990 	}
1991 
1992 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1993 	complete(event);
1994 }
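
/*
 * Illustrative totals (hypothetical device set): two 32-bit BARs of
 * 4 KiB and 16 KiB plus one 64-bit BAR of 1 MiB would leave
 * low_mmio_space = 0x5000 and high_mmio_space = 0x100000.  Because
 * every BAR is a power of two in size, these plain sums are enough for
 * prepopulate_bars() and the bridge-window allocation to place every
 * BAR without tracking alignment here.
 */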
1995 
1996 /**
1997  * prepopulate_bars() - Fill in BARs with defaults
1998  * @hbus:	Root PCI bus, as understood by this driver
1999  *
2000  * The core PCI driver code seems much, much happier if the BARs
2001  * for a device have values upon first scan. So fill them in.
2002  * The algorithm below works down from large sizes to small,
2003  * attempting to pack the assignments optimally. The assumption,
2004  * enforced in other parts of the code, is that the beginning of
2005  * the memory-mapped I/O space will be aligned on the largest
2006  * BAR size.
2007  */
2008 static void prepopulate_bars(struct hv_pcibus_device *hbus)
2009 {
2010 	resource_size_t high_size = 0;
2011 	resource_size_t low_size = 0;
2012 	resource_size_t high_base = 0;
2013 	resource_size_t low_base = 0;
2014 	resource_size_t bar_size;
2015 	struct hv_pci_dev *hpdev;
2016 	unsigned long flags;
2017 	u64 bar_val;
2018 	u32 command;
2019 	bool high;
2020 	int i;
2021 
2022 	if (hbus->low_mmio_space) {
2023 		low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
2024 		low_base = hbus->low_mmio_res->start;
2025 	}
2026 
2027 	if (hbus->high_mmio_space) {
2028 		high_size = 1ULL <<
2029 			(63 - __builtin_clzll(hbus->high_mmio_space));
2030 		high_base = hbus->high_mmio_res->start;
2031 	}
2032 
2033 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2034 
2035 	/*
2036 	 * Clear the memory enable bit, in case it's already set. This occurs
2037 	 * in the suspend path of hibernation, where the device is suspended,
2038 	 * resumed and suspended again: see hibernation_snapshot() and
2039 	 * hibernation_platform_enter().
2040 	 *
2041 	 * If the memory enable bit is already set, Hyper-V silently ignores
	 * the BAR updates below, and the related PCI device driver cannot
	 * work, because reading from the device register(s) always returns
2044 	 * 0xFFFFFFFF (PCI_ERROR_RESPONSE).
2045 	 */
2046 	list_for_each_entry(hpdev, &hbus->children, list_entry) {
2047 		_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
2048 		command &= ~PCI_COMMAND_MEMORY;
2049 		_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
2050 	}
2051 
2052 	/* Pick addresses for the BARs. */
2053 	do {
2054 		list_for_each_entry(hpdev, &hbus->children, list_entry) {
2055 			for (i = 0; i < PCI_STD_NUM_BARS; i++) {
2056 				bar_val = hpdev->probed_bar[i];
2057 				if (bar_val == 0)
2058 					continue;
2059 				high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
2060 				if (high) {
2061 					bar_val |=
2062 						((u64)hpdev->probed_bar[i + 1]
2063 						 << 32);
2064 				} else {
2065 					bar_val |= 0xffffffffULL << 32;
2066 				}
2067 				bar_size = get_bar_size(bar_val);
2068 				if (high) {
2069 					if (high_size != bar_size) {
2070 						i++;
2071 						continue;
2072 					}
2073 					_hv_pcifront_write_config(hpdev,
2074 						PCI_BASE_ADDRESS_0 + (4 * i),
2075 						4,
2076 						(u32)(high_base & 0xffffff00));
2077 					i++;
2078 					_hv_pcifront_write_config(hpdev,
2079 						PCI_BASE_ADDRESS_0 + (4 * i),
2080 						4, (u32)(high_base >> 32));
2081 					high_base += bar_size;
2082 				} else {
2083 					if (low_size != bar_size)
2084 						continue;
2085 					_hv_pcifront_write_config(hpdev,
2086 						PCI_BASE_ADDRESS_0 + (4 * i),
2087 						4,
2088 						(u32)(low_base & 0xffffff00));
2089 					low_base += bar_size;
2090 				}
2091 			}
2092 			if (high_size <= 1 && low_size <= 1) {
2093 				/* Set the memory enable bit. */
2094 				_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
2095 							 &command);
2096 				command |= PCI_COMMAND_MEMORY;
2097 				_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
2098 							  command);
2099 				break;
2100 			}
2101 		}
2102 
2103 		high_size >>= 1;
2104 		low_size >>= 1;
2105 	}  while (high_size || low_size);
2106 
2107 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2108 }
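
/*
 * Illustrative walk-through (hypothetical sizes): with
 * low_mmio_space = 0x5000 (one 16 KiB BAR and one 4 KiB BAR) and a low
 * window starting at 0xfe000000, low_size starts at 0x4000, the largest
 * power of two not exceeding 0x5000.  The first pass places the 16 KiB
 * BAR at 0xfe000000 and advances low_base to 0xfe004000; the 0x2000
 * pass matches nothing; the 0x1000 pass places the 4 KiB BAR at
 * 0xfe004000.  Assigning from large to small keeps every BAR naturally
 * aligned without explicit alignment bookkeeping.
 */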
2109 
2110 /*
2111  * Assign entries in sysfs pci slot directory.
2112  *
 * Note that this function does not need to lock the children list
 * because it is called from pci_devices_present_work(), which is
 * serialized with hv_eject_device_work(): both run on the same
 * ordered workqueue. Therefore the hbus->children list will not change
 * even when pci_create_slot() sleeps.
2118  */
2119 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus)
2120 {
2121 	struct hv_pci_dev *hpdev;
2122 	char name[SLOT_NAME_SIZE];
2123 	int slot_nr;
2124 
2125 	list_for_each_entry(hpdev, &hbus->children, list_entry) {
2126 		if (hpdev->pci_slot)
2127 			continue;
2128 
2129 		slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot));
2130 		snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser);
2131 		hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr,
2132 					  name, NULL);
2133 		if (IS_ERR(hpdev->pci_slot)) {
			pr_warn("pci_create_slot %s failed\n", name);
2135 			hpdev->pci_slot = NULL;
2136 		}
2137 	}
2138 }
2139 
2140 /*
2141  * Remove entries in sysfs pci slot directory.
2142  */
2143 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus)
2144 {
2145 	struct hv_pci_dev *hpdev;
2146 
2147 	list_for_each_entry(hpdev, &hbus->children, list_entry) {
2148 		if (!hpdev->pci_slot)
2149 			continue;
2150 		pci_destroy_slot(hpdev->pci_slot);
2151 		hpdev->pci_slot = NULL;
2152 	}
2153 }
2154 
2155 /*
2156  * Set NUMA node for the devices on the bus
2157  */
2158 static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
2159 {
2160 	struct pci_dev *dev;
2161 	struct pci_bus *bus = hbus->bridge->bus;
2162 	struct hv_pci_dev *hv_dev;
2163 
2164 	list_for_each_entry(dev, &bus->devices, bus_list) {
2165 		hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn));
2166 		if (!hv_dev)
2167 			continue;
2168 
2169 		if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
2170 		    hv_dev->desc.virtual_numa_node < num_possible_nodes())
2171 			/*
2172 			 * The kernel may boot with some NUMA nodes offline
2173 			 * (e.g. in a KDUMP kernel) or with NUMA disabled via
2174 			 * "numa=off". In those cases, adjust the host provided
2175 			 * NUMA node to a valid NUMA node used by the kernel.
2176 			 */
2177 			set_dev_node(&dev->dev,
2178 				     numa_map_to_online_node(
2179 					     hv_dev->desc.virtual_numa_node));
2180 
2181 		put_pcichild(hv_dev);
2182 	}
2183 }
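
/*
 * Example of the adjustment above (hypothetical topology): if the host
 * reports virtual_numa_node = 2 but only node 0 is online (e.g. in a
 * kdump kernel or with "numa=off"), numa_map_to_online_node() maps the
 * value to node 0, so the device is never attached to an offline node.
 */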
2184 
2185 /**
2186  * create_root_hv_pci_bus() - Expose a new root PCI bus
2187  * @hbus:	Root PCI bus, as understood by this driver
2188  *
2189  * Return: 0 on success, -errno on failure
2190  */
2191 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
2192 {
2193 	int error;
2194 	struct pci_host_bridge *bridge = hbus->bridge;
2195 
2196 	bridge->dev.parent = &hbus->hdev->device;
2197 	bridge->sysdata = &hbus->sysdata;
2198 	bridge->ops = &hv_pcifront_ops;
2199 
2200 	error = pci_scan_root_bus_bridge(bridge);
2201 	if (error)
2202 		return error;
2203 
2204 	pci_lock_rescan_remove();
2205 	hv_pci_assign_numa_node(hbus);
2206 	pci_bus_assign_resources(bridge->bus);
2207 	hv_pci_assign_slots(hbus);
2208 	pci_bus_add_devices(bridge->bus);
2209 	pci_unlock_rescan_remove();
2210 	hbus->state = hv_pcibus_installed;
2211 	return 0;
2212 }
2213 
2214 struct q_res_req_compl {
2215 	struct completion host_event;
2216 	struct hv_pci_dev *hpdev;
2217 };
2218 
2219 /**
2220  * q_resource_requirements() - Query Resource Requirements
2221  * @context:		The completion context.
2222  * @resp:		The response that came from the host.
2223  * @resp_packet_size:	The size in bytes of resp.
2224  *
2225  * This function is invoked on completion of a Query Resource
2226  * Requirements packet.
2227  */
2228 static void q_resource_requirements(void *context, struct pci_response *resp,
2229 				    int resp_packet_size)
2230 {
2231 	struct q_res_req_compl *completion = context;
2232 	struct pci_q_res_req_response *q_res_req =
2233 		(struct pci_q_res_req_response *)resp;
2234 	int i;
2235 
2236 	if (resp->status < 0) {
2237 		dev_err(&completion->hpdev->hbus->hdev->device,
2238 			"query resource requirements failed: %x\n",
2239 			resp->status);
2240 	} else {
2241 		for (i = 0; i < PCI_STD_NUM_BARS; i++) {
2242 			completion->hpdev->probed_bar[i] =
2243 				q_res_req->probed_bar[i];
2244 		}
2245 	}
2246 
2247 	complete(&completion->host_event);
2248 }
2249 
2250 /**
2251  * new_pcichild_device() - Create a new child device
2252  * @hbus:	The internal struct tracking this root PCI bus.
2253  * @desc:	The information supplied so far from the host
2254  *              about the device.
2255  *
2256  * This function creates the tracking structure for a new child
2257  * device and kicks off the process of figuring out what it is.
2258  *
2259  * Return: Pointer to the new tracking struct
2260  */
2261 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
2262 		struct hv_pcidev_description *desc)
2263 {
2264 	struct hv_pci_dev *hpdev;
2265 	struct pci_child_message *res_req;
2266 	struct q_res_req_compl comp_pkt;
2267 	struct {
2268 		struct pci_packet init_packet;
2269 		u8 buffer[sizeof(struct pci_child_message)];
2270 	} pkt;
2271 	unsigned long flags;
2272 	int ret;
2273 
2274 	hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL);
2275 	if (!hpdev)
2276 		return NULL;
2277 
2278 	hpdev->hbus = hbus;
2279 
2280 	memset(&pkt, 0, sizeof(pkt));
2281 	init_completion(&comp_pkt.host_event);
2282 	comp_pkt.hpdev = hpdev;
2283 	pkt.init_packet.compl_ctxt = &comp_pkt;
2284 	pkt.init_packet.completion_func = q_resource_requirements;
2285 	res_req = (struct pci_child_message *)&pkt.init_packet.message;
2286 	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
2287 	res_req->wslot.slot = desc->win_slot.slot;
2288 
2289 	ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
2290 			       sizeof(struct pci_child_message),
2291 			       (unsigned long)&pkt.init_packet,
2292 			       VM_PKT_DATA_INBAND,
2293 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2294 	if (ret)
2295 		goto error;
2296 
2297 	if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
2298 		goto error;
2299 
2300 	hpdev->desc = *desc;
2301 	refcount_set(&hpdev->refs, 1);
2302 	get_pcichild(hpdev);
2303 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2304 
2305 	list_add_tail(&hpdev->list_entry, &hbus->children);
2306 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2307 	return hpdev;
2308 
2309 error:
2310 	kfree(hpdev);
2311 	return NULL;
2312 }
2313 
2314 /**
2315  * get_pcichild_wslot() - Find device from slot
2316  * @hbus:	Root PCI bus, as understood by this driver
2317  * @wslot:	Location on the bus
2318  *
2319  * This function looks up a PCI device and returns the internal
2320  * representation of it.  It acquires a reference on it, so that
2321  * the device won't be deleted while somebody is using it.  The
2322  * caller is responsible for calling put_pcichild() to release
2323  * this reference.
2324  *
2325  * Return:	Internal representation of a PCI device
2326  */
2327 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
2328 					     u32 wslot)
2329 {
2330 	unsigned long flags;
2331 	struct hv_pci_dev *iter, *hpdev = NULL;
2332 
2333 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2334 	list_for_each_entry(iter, &hbus->children, list_entry) {
2335 		if (iter->desc.win_slot.slot == wslot) {
2336 			hpdev = iter;
2337 			get_pcichild(hpdev);
2338 			break;
2339 		}
2340 	}
2341 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2342 
2343 	return hpdev;
2344 }
2345 
2346 /**
2347  * pci_devices_present_work() - Handle new list of child devices
2348  * @work:	Work struct embedded in struct hv_dr_work
2349  *
2350  * "Bus Relations" is the Windows term for "children of this
2351  * bus."  The terminology is preserved here for people trying to
2352  * debug the interaction between Hyper-V and Linux.  This
2353  * function is called when the parent partition reports a list
2354  * of functions that should be observed under this PCI Express
2355  * port (bus).
2356  *
2357  * This function updates the list, and must tolerate being
2358  * called multiple times with the same information.  The typical
2359  * number of child devices is one, with very atypical cases
2360  * involving three or four, so the algorithms used here can be
2361  * simple and inefficient.
2362  *
2363  * It must also treat the omission of a previously observed device as
2364  * notification that the device no longer exists.
2365  *
2366  * Note that this function is serialized with hv_eject_device_work(),
2367  * because both are pushed to the ordered workqueue hbus->wq.
2368  */
2369 static void pci_devices_present_work(struct work_struct *work)
2370 {
2371 	u32 child_no;
2372 	bool found;
2373 	struct hv_pcidev_description *new_desc;
2374 	struct hv_pci_dev *hpdev;
2375 	struct hv_pcibus_device *hbus;
2376 	struct list_head removed;
2377 	struct hv_dr_work *dr_wrk;
2378 	struct hv_dr_state *dr = NULL;
2379 	unsigned long flags;
2380 
2381 	dr_wrk = container_of(work, struct hv_dr_work, wrk);
2382 	hbus = dr_wrk->bus;
2383 	kfree(dr_wrk);
2384 
2385 	INIT_LIST_HEAD(&removed);
2386 
2387 	/* Pull this off the queue and process it if it was the last one. */
2388 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2389 	while (!list_empty(&hbus->dr_list)) {
2390 		dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
2391 				      list_entry);
2392 		list_del(&dr->list_entry);
2393 
2394 		/* Throw this away if the list still has stuff in it. */
2395 		if (!list_empty(&hbus->dr_list)) {
2396 			kfree(dr);
2397 			continue;
2398 		}
2399 	}
2400 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2401 
2402 	if (!dr)
2403 		return;
2404 
2405 	/* First, mark all existing children as reported missing. */
2406 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2407 	list_for_each_entry(hpdev, &hbus->children, list_entry) {
2408 		hpdev->reported_missing = true;
2409 	}
2410 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2411 
2412 	/* Next, add back any reported devices. */
2413 	for (child_no = 0; child_no < dr->device_count; child_no++) {
2414 		found = false;
2415 		new_desc = &dr->func[child_no];
2416 
2417 		spin_lock_irqsave(&hbus->device_list_lock, flags);
2418 		list_for_each_entry(hpdev, &hbus->children, list_entry) {
2419 			if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) &&
2420 			    (hpdev->desc.v_id == new_desc->v_id) &&
2421 			    (hpdev->desc.d_id == new_desc->d_id) &&
2422 			    (hpdev->desc.ser == new_desc->ser)) {
2423 				hpdev->reported_missing = false;
2424 				found = true;
2425 			}
2426 		}
2427 		spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2428 
2429 		if (!found) {
2430 			hpdev = new_pcichild_device(hbus, new_desc);
2431 			if (!hpdev)
2432 				dev_err(&hbus->hdev->device,
2433 					"couldn't record a child device.\n");
2434 		}
2435 	}
2436 
2437 	/* Move missing children to a list on the stack. */
2438 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2439 	do {
2440 		found = false;
2441 		list_for_each_entry(hpdev, &hbus->children, list_entry) {
2442 			if (hpdev->reported_missing) {
2443 				found = true;
2444 				put_pcichild(hpdev);
2445 				list_move_tail(&hpdev->list_entry, &removed);
2446 				break;
2447 			}
2448 		}
2449 	} while (found);
2450 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2451 
2452 	/* Delete everything that should no longer exist. */
2453 	while (!list_empty(&removed)) {
2454 		hpdev = list_first_entry(&removed, struct hv_pci_dev,
2455 					 list_entry);
2456 		list_del(&hpdev->list_entry);
2457 
2458 		if (hpdev->pci_slot)
2459 			pci_destroy_slot(hpdev->pci_slot);
2460 
2461 		put_pcichild(hpdev);
2462 	}
2463 
2464 	switch (hbus->state) {
2465 	case hv_pcibus_installed:
2466 		/*
		 * Tell the core to rescan the bus
		 * because there may have been changes.
2469 		 */
2470 		pci_lock_rescan_remove();
2471 		pci_scan_child_bus(hbus->bridge->bus);
2472 		hv_pci_assign_numa_node(hbus);
2473 		hv_pci_assign_slots(hbus);
2474 		pci_unlock_rescan_remove();
2475 		break;
2476 
2477 	case hv_pcibus_init:
2478 	case hv_pcibus_probed:
2479 		survey_child_resources(hbus);
2480 		break;
2481 
2482 	default:
2483 		break;
2484 	}
2485 
2486 	kfree(dr);
2487 }
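
/*
 * Illustrative example (hypothetical bus contents): if the bus
 * currently has children A and B and the host reports relations
 * {B, C}, the passes above first mark both A and B missing, clear the
 * flag on B when it matches the new list, create C via
 * new_pcichild_device(), and finally move A to the local "removed"
 * list so its slot and reference are torn down.
 */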
2488 
2489 /**
2490  * hv_pci_start_relations_work() - Queue work to start device discovery
2491  * @hbus:	Root PCI bus, as understood by this driver
2492  * @dr:		The list of children returned from host
2493  *
2494  * Return:  0 on success, -errno on failure
2495  */
2496 static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
2497 				       struct hv_dr_state *dr)
2498 {
2499 	struct hv_dr_work *dr_wrk;
2500 	unsigned long flags;
2501 	bool pending_dr;
2502 
2503 	if (hbus->state == hv_pcibus_removing) {
2504 		dev_info(&hbus->hdev->device,
2505 			 "PCI VMBus BUS_RELATIONS: ignored\n");
2506 		return -ENOENT;
2507 	}
2508 
2509 	dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
2510 	if (!dr_wrk)
2511 		return -ENOMEM;
2512 
2513 	INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
2514 	dr_wrk->bus = hbus;
2515 
2516 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2517 	/*
2518 	 * If pending_dr is true, we have already queued a work,
2519 	 * which will see the new dr. Otherwise, we need to
2520 	 * queue a new work.
2521 	 */
2522 	pending_dr = !list_empty(&hbus->dr_list);
2523 	list_add_tail(&dr->list_entry, &hbus->dr_list);
2524 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2525 
2526 	if (pending_dr)
2527 		kfree(dr_wrk);
2528 	else
2529 		queue_work(hbus->wq, &dr_wrk->wrk);
2530 
2531 	return 0;
2532 }
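
/*
 * Example of the coalescing above (hypothetical sequence): if two
 * BUS_RELATIONS messages arrive back to back, the first call queues a
 * work item while the second merely appends its dr to hbus->dr_list
 * and frees its dr_wrk.  When pci_devices_present_work() runs, it
 * drains the list and processes only the newest dr, which is the
 * "last one wins" behavior that function expects.
 */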
2533 
2534 /**
2535  * hv_pci_devices_present() - Handle list of new children
2536  * @hbus:      Root PCI bus, as understood by this driver
2537  * @relations: Packet from host listing children
2538  *
 * Process a new list of devices on the bus. The list of devices is
 * discovered by the VSP and sent to us via the VSP message
 * PCI_BUS_RELATIONS whenever a new list of devices for this bus appears.
2542  */
2543 static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
2544 				   struct pci_bus_relations *relations)
2545 {
2546 	struct hv_dr_state *dr;
2547 	int i;
2548 
2549 	dr = kzalloc(struct_size(dr, func, relations->device_count),
2550 		     GFP_NOWAIT);
2551 	if (!dr)
2552 		return;
2553 
2554 	dr->device_count = relations->device_count;
2555 	for (i = 0; i < dr->device_count; i++) {
2556 		dr->func[i].v_id = relations->func[i].v_id;
2557 		dr->func[i].d_id = relations->func[i].d_id;
2558 		dr->func[i].rev = relations->func[i].rev;
2559 		dr->func[i].prog_intf = relations->func[i].prog_intf;
2560 		dr->func[i].subclass = relations->func[i].subclass;
2561 		dr->func[i].base_class = relations->func[i].base_class;
2562 		dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2563 		dr->func[i].win_slot = relations->func[i].win_slot;
2564 		dr->func[i].ser = relations->func[i].ser;
2565 	}
2566 
2567 	if (hv_pci_start_relations_work(hbus, dr))
2568 		kfree(dr);
2569 }
2570 
2571 /**
2572  * hv_pci_devices_present2() - Handle list of new children
2573  * @hbus:	Root PCI bus, as understood by this driver
2574  * @relations:	Packet from host listing children
2575  *
2576  * This function is the v2 version of hv_pci_devices_present()
2577  */
2578 static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
2579 				    struct pci_bus_relations2 *relations)
2580 {
2581 	struct hv_dr_state *dr;
2582 	int i;
2583 
2584 	dr = kzalloc(struct_size(dr, func, relations->device_count),
2585 		     GFP_NOWAIT);
2586 	if (!dr)
2587 		return;
2588 
2589 	dr->device_count = relations->device_count;
2590 	for (i = 0; i < dr->device_count; i++) {
2591 		dr->func[i].v_id = relations->func[i].v_id;
2592 		dr->func[i].d_id = relations->func[i].d_id;
2593 		dr->func[i].rev = relations->func[i].rev;
2594 		dr->func[i].prog_intf = relations->func[i].prog_intf;
2595 		dr->func[i].subclass = relations->func[i].subclass;
2596 		dr->func[i].base_class = relations->func[i].base_class;
2597 		dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2598 		dr->func[i].win_slot = relations->func[i].win_slot;
2599 		dr->func[i].ser = relations->func[i].ser;
2600 		dr->func[i].flags = relations->func[i].flags;
2601 		dr->func[i].virtual_numa_node =
2602 			relations->func[i].virtual_numa_node;
2603 	}
2604 
2605 	if (hv_pci_start_relations_work(hbus, dr))
2606 		kfree(dr);
2607 }
2608 
2609 /**
2610  * hv_eject_device_work() - Asynchronously handles ejection
2611  * @work:	Work struct embedded in internal device struct
2612  *
2613  * This function handles ejecting a device.  Windows will
2614  * attempt to gracefully eject a device, waiting 60 seconds to
2615  * hear back from the guest OS that this completed successfully.
2616  * If this timer expires, the device will be forcibly removed.
2617  */
2618 static void hv_eject_device_work(struct work_struct *work)
2619 {
2620 	struct pci_eject_response *ejct_pkt;
2621 	struct hv_pcibus_device *hbus;
2622 	struct hv_pci_dev *hpdev;
2623 	struct pci_dev *pdev;
2624 	unsigned long flags;
2625 	int wslot;
2626 	struct {
2627 		struct pci_packet pkt;
2628 		u8 buffer[sizeof(struct pci_eject_response)];
2629 	} ctxt;
2630 
2631 	hpdev = container_of(work, struct hv_pci_dev, wrk);
2632 	hbus = hpdev->hbus;
2633 
2634 	WARN_ON(hpdev->state != hv_pcichild_ejecting);
2635 
2636 	/*
2637 	 * Ejection can come before or after the PCI bus has been set up, so
2638 	 * attempt to find it and tear down the bus state, if it exists.  This
2639 	 * must be done without constructs like pci_domain_nr(hbus->bridge->bus)
2640 	 * because hbus->bridge->bus may not exist yet.
2641 	 */
2642 	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
2643 	pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot);
2644 	if (pdev) {
2645 		pci_lock_rescan_remove();
2646 		pci_stop_and_remove_bus_device(pdev);
2647 		pci_dev_put(pdev);
2648 		pci_unlock_rescan_remove();
2649 	}
2650 
2651 	spin_lock_irqsave(&hbus->device_list_lock, flags);
2652 	list_del(&hpdev->list_entry);
2653 	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2654 
2655 	if (hpdev->pci_slot)
2656 		pci_destroy_slot(hpdev->pci_slot);
2657 
2658 	memset(&ctxt, 0, sizeof(ctxt));
2659 	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
2660 	ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
2661 	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
2662 	vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
2663 			 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
2664 			 VM_PKT_DATA_INBAND, 0);
2665 
2666 	/* For the get_pcichild() in hv_pci_eject_device() */
2667 	put_pcichild(hpdev);
2668 	/* For the two refs got in new_pcichild_device() */
2669 	put_pcichild(hpdev);
2670 	put_pcichild(hpdev);
2671 	/* hpdev has been freed. Do not use it any more. */
2672 }
2673 
2674 /**
2675  * hv_pci_eject_device() - Handles device ejection
2676  * @hpdev:	Internal device tracking struct
2677  *
2678  * This function is invoked when an ejection packet arrives.  It
2679  * just schedules work so that we don't re-enter the packet
2680  * delivery code handling the ejection.
2681  */
2682 static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
2683 {
2684 	struct hv_pcibus_device *hbus = hpdev->hbus;
2685 	struct hv_device *hdev = hbus->hdev;
2686 
2687 	if (hbus->state == hv_pcibus_removing) {
2688 		dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
2689 		return;
2690 	}
2691 
2692 	hpdev->state = hv_pcichild_ejecting;
2693 	get_pcichild(hpdev);
2694 	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
2695 	queue_work(hbus->wq, &hpdev->wrk);
2696 }
2697 
2698 /**
2699  * hv_pci_onchannelcallback() - Handles incoming packets
2700  * @context:	Internal bus tracking struct
2701  *
2702  * This function is invoked whenever the host sends a packet to
2703  * this channel (which is private to this root PCI bus).
2704  */
2705 static void hv_pci_onchannelcallback(void *context)
2706 {
2707 	const int packet_size = 0x100;
2708 	int ret;
2709 	struct hv_pcibus_device *hbus = context;
2710 	u32 bytes_recvd;
2711 	u64 req_id;
2712 	struct vmpacket_descriptor *desc;
2713 	unsigned char *buffer;
2714 	int bufferlen = packet_size;
2715 	struct pci_packet *comp_packet;
2716 	struct pci_response *response;
2717 	struct pci_incoming_message *new_message;
2718 	struct pci_bus_relations *bus_rel;
2719 	struct pci_bus_relations2 *bus_rel2;
2720 	struct pci_dev_inval_block *inval;
2721 	struct pci_dev_incoming *dev_message;
2722 	struct hv_pci_dev *hpdev;
2723 
2724 	buffer = kmalloc(bufferlen, GFP_ATOMIC);
2725 	if (!buffer)
2726 		return;
2727 
2728 	while (1) {
2729 		ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
2730 					   bufferlen, &bytes_recvd, &req_id);
2731 
2732 		if (ret == -ENOBUFS) {
2733 			kfree(buffer);
2734 			/* Handle large packet */
2735 			bufferlen = bytes_recvd;
2736 			buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
2737 			if (!buffer)
2738 				return;
2739 			continue;
2740 		}
2741 
2742 		/* Zero length indicates there are no more packets. */
2743 		if (ret || !bytes_recvd)
2744 			break;
2745 
2746 		/*
2747 		 * All incoming packets must be at least as large as a
2748 		 * response.
2749 		 */
2750 		if (bytes_recvd <= sizeof(struct pci_response))
2751 			continue;
2752 		desc = (struct vmpacket_descriptor *)buffer;
2753 
2754 		switch (desc->type) {
2755 		case VM_PKT_COMP:
2756 
2757 			/*
2758 			 * The host is trusted, and thus it's safe to interpret
2759 			 * this transaction ID as a pointer.
2760 			 */
2761 			comp_packet = (struct pci_packet *)req_id;
2762 			response = (struct pci_response *)buffer;
2763 			comp_packet->completion_func(comp_packet->compl_ctxt,
2764 						     response,
2765 						     bytes_recvd);
2766 			break;
2767 
2768 		case VM_PKT_DATA_INBAND:
2769 
2770 			new_message = (struct pci_incoming_message *)buffer;
2771 			switch (new_message->message_type.type) {
2772 			case PCI_BUS_RELATIONS:
2773 
2774 				bus_rel = (struct pci_bus_relations *)buffer;
2775 				if (bytes_recvd <
2776 					struct_size(bus_rel, func,
2777 						    bus_rel->device_count)) {
2778 					dev_err(&hbus->hdev->device,
2779 						"bus relations too small\n");
2780 					break;
2781 				}
2782 
2783 				hv_pci_devices_present(hbus, bus_rel);
2784 				break;
2785 
2786 			case PCI_BUS_RELATIONS2:
2787 
2788 				bus_rel2 = (struct pci_bus_relations2 *)buffer;
2789 				if (bytes_recvd <
2790 					struct_size(bus_rel2, func,
2791 						    bus_rel2->device_count)) {
2792 					dev_err(&hbus->hdev->device,
2793 						"bus relations v2 too small\n");
2794 					break;
2795 				}
2796 
2797 				hv_pci_devices_present2(hbus, bus_rel2);
2798 				break;
2799 
2800 			case PCI_EJECT:
2801 
2802 				dev_message = (struct pci_dev_incoming *)buffer;
2803 				hpdev = get_pcichild_wslot(hbus,
2804 						      dev_message->wslot.slot);
2805 				if (hpdev) {
2806 					hv_pci_eject_device(hpdev);
2807 					put_pcichild(hpdev);
2808 				}
2809 				break;
2810 
2811 			case PCI_INVALIDATE_BLOCK:
2812 
2813 				inval = (struct pci_dev_inval_block *)buffer;
2814 				hpdev = get_pcichild_wslot(hbus,
2815 							   inval->wslot.slot);
2816 				if (hpdev) {
2817 					if (hpdev->block_invalidate) {
2818 						hpdev->block_invalidate(
2819 						    hpdev->invalidate_context,
2820 						    inval->block_mask);
2821 					}
2822 					put_pcichild(hpdev);
2823 				}
2824 				break;
2825 
2826 			default:
2827 				dev_warn(&hbus->hdev->device,
2828 					"Unimplemented protocol message %x\n",
2829 					new_message->message_type.type);
2830 				break;
2831 			}
2832 			break;
2833 
2834 		default:
2835 			dev_err(&hbus->hdev->device,
2836 				"unhandled packet type %d, tid %llx len %d\n",
2837 				desc->type, req_id, bytes_recvd);
2838 			break;
2839 		}
2840 	}
2841 
2842 	kfree(buffer);
2843 }
2844 
2845 /**
2846  * hv_pci_protocol_negotiation() - Set up protocol
2847  * @hdev:		VMBus's tracking struct for this root PCI bus.
2848  * @version:		Array of supported channel protocol versions in
2849  *			the order of probing - highest go first.
2850  * @num_version:	Number of elements in the version array.
2851  *
2852  * This driver is intended to support running on Windows 10
2853  * (server) and later versions. It will not run on earlier
 * versions, as they assume that many of the operations which
 * Linux needs accomplished with a spinlock held were done through
 * asynchronous messaging over VMBus.  Windows 10 increases the
2857  * surface area of PCI emulation so that these actions can take
2858  * place by suspending a virtual processor for their duration.
2859  *
2860  * This function negotiates the channel protocol version,
2861  * failing if the host doesn't support the necessary protocol
2862  * level.
2863  */
2864 static int hv_pci_protocol_negotiation(struct hv_device *hdev,
2865 				       enum pci_protocol_version_t version[],
2866 				       int num_version)
2867 {
2868 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2869 	struct pci_version_request *version_req;
2870 	struct hv_pci_compl comp_pkt;
2871 	struct pci_packet *pkt;
2872 	int ret;
2873 	int i;
2874 
2875 	/*
2876 	 * Initiate the handshake with the host and negotiate
2877 	 * a version that the host can support. We start with the
2878 	 * highest version number and go down if the host cannot
2879 	 * support it.
2880 	 */
2881 	pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
2882 	if (!pkt)
2883 		return -ENOMEM;
2884 
2885 	init_completion(&comp_pkt.host_event);
2886 	pkt->completion_func = hv_pci_generic_compl;
2887 	pkt->compl_ctxt = &comp_pkt;
2888 	version_req = (struct pci_version_request *)&pkt->message;
2889 	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
2890 
2891 	for (i = 0; i < num_version; i++) {
2892 		version_req->protocol_version = version[i];
2893 		ret = vmbus_sendpacket(hdev->channel, version_req,
2894 				sizeof(struct pci_version_request),
2895 				(unsigned long)pkt, VM_PKT_DATA_INBAND,
2896 				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2897 		if (!ret)
2898 			ret = wait_for_response(hdev, &comp_pkt.host_event);
2899 
2900 		if (ret) {
2901 			dev_err(&hdev->device,
2902 				"PCI Pass-through VSP failed to request version: %d",
2903 				ret);
2904 			goto exit;
2905 		}
2906 
2907 		if (comp_pkt.completion_status >= 0) {
2908 			hbus->protocol_version = version[i];
2909 			dev_info(&hdev->device,
2910 				"PCI VMBus probing: Using version %#x\n",
2911 				hbus->protocol_version);
2912 			goto exit;
2913 		}
2914 
2915 		if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
2916 			dev_err(&hdev->device,
2917 				"PCI Pass-through VSP failed version request: %#x",
2918 				comp_pkt.completion_status);
2919 			ret = -EPROTO;
2920 			goto exit;
2921 		}
2922 
2923 		reinit_completion(&comp_pkt.host_event);
2924 	}
2925 
2926 	dev_err(&hdev->device,
2927 		"PCI pass-through VSP failed to find supported version");
2928 	ret = -EPROTO;
2929 
2930 exit:
2931 	kfree(pkt);
2932 	return ret;
2933 }
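
/*
 * Illustrative negotiation (hypothetical host): probing starts with the
 * newest entry in the caller's version array.  A host that only
 * implements the 1.2 protocol answers the 1.4 and 1.3 requests with
 * STATUS_REVISION_MISMATCH, so the loop above retries and settles on
 * 1.2.  Any other error status, or exhausting the array, fails the
 * negotiation with -EPROTO.
 */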
2934 
2935 /**
2936  * hv_pci_free_bridge_windows() - Release memory regions for the
2937  * bus
2938  * @hbus:	Root PCI bus, as understood by this driver
2939  */
2940 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
2941 {
2942 	/*
2943 	 * Set the resources back to the way they looked when they
2944 	 * were allocated by setting IORESOURCE_BUSY again.
2945 	 */
2946 
2947 	if (hbus->low_mmio_space && hbus->low_mmio_res) {
2948 		hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
2949 		vmbus_free_mmio(hbus->low_mmio_res->start,
2950 				resource_size(hbus->low_mmio_res));
2951 	}
2952 
2953 	if (hbus->high_mmio_space && hbus->high_mmio_res) {
2954 		hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
2955 		vmbus_free_mmio(hbus->high_mmio_res->start,
2956 				resource_size(hbus->high_mmio_res));
2957 	}
2958 }
2959 
2960 /**
2961  * hv_pci_allocate_bridge_windows() - Allocate memory regions
2962  * for the bus
2963  * @hbus:	Root PCI bus, as understood by this driver
2964  *
2965  * This function calls vmbus_allocate_mmio(), which is itself a
2966  * bit of a compromise.  Ideally, we might change the pnp layer
2967  * in the kernel such that it comprehends either PCI devices
2968  * which are "grandchildren of ACPI," with some intermediate bus
2969  * node (in this case, VMBus) or change it such that it
2970  * understands VMBus.  The pnp layer, however, has been declared
2971  * deprecated, and not subject to change.
2972  *
2973  * The workaround, implemented here, is to ask VMBus to allocate
2974  * MMIO space for this bus.  VMBus itself knows which ranges are
2975  * appropriate by looking at its own ACPI objects.  Then, after
2976  * these ranges are claimed, they're modified to look like they
2977  * would have looked if the ACPI and pnp code had allocated
2978  * bridge windows.  These descriptors have to exist in this form
2979  * in order to satisfy the code which will get invoked when the
2980  * endpoint PCI function driver calls request_mem_region() or
2981  * request_mem_region_exclusive().
2982  *
2983  * Return: 0 on success, -errno on failure
2984  */
2985 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
2986 {
2987 	resource_size_t align;
2988 	int ret;
2989 
2990 	if (hbus->low_mmio_space) {
2991 		align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
2992 		ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
2993 					  (u64)(u32)0xffffffff,
2994 					  hbus->low_mmio_space,
2995 					  align, false);
2996 		if (ret) {
2997 			dev_err(&hbus->hdev->device,
2998 				"Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
2999 				hbus->low_mmio_space);
3000 			return ret;
3001 		}
3002 
3003 		/* Modify this resource to become a bridge window. */
3004 		hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
3005 		hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
3006 		pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res);
3007 	}
3008 
3009 	if (hbus->high_mmio_space) {
3010 		align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
3011 		ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
3012 					  0x100000000, -1,
3013 					  hbus->high_mmio_space, align,
3014 					  false);
3015 		if (ret) {
3016 			dev_err(&hbus->hdev->device,
3017 				"Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
3018 				hbus->high_mmio_space);
3019 			goto release_low_mmio;
3020 		}
3021 
3022 		/* Modify this resource to become a bridge window. */
3023 		hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
3024 		hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
3025 		pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res);
3026 	}
3027 
3028 	return 0;
3029 
3030 release_low_mmio:
3031 	if (hbus->low_mmio_res) {
3032 		vmbus_free_mmio(hbus->low_mmio_res->start,
3033 				resource_size(hbus->low_mmio_res));
3034 	}
3035 
3036 	return ret;
3037 }
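
/*
 * Worked example of the alignment above (hypothetical size): with
 * low_mmio_space = 0x14000 (80 KiB),
 * 1ULL << (63 - __builtin_clzll(0x14000)) evaluates to 0x10000, so the
 * low window is requested with 64 KiB alignment, which is at least as
 * large as any single BAR that survey_child_resources() could have
 * folded into that total.
 */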
3038 
3039 /**
3040  * hv_allocate_config_window() - Find MMIO space for PCI Config
3041  * @hbus:	Root PCI bus, as understood by this driver
3042  *
3043  * This function claims memory-mapped I/O space for accessing
3044  * configuration space for the functions on this bus.
3045  *
3046  * Return: 0 on success, -errno on failure
3047  */
3048 static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
3049 {
3050 	int ret;
3051 
3052 	/*
3053 	 * Set up a region of MMIO space to use for accessing configuration
3054 	 * space.
3055 	 */
3056 	ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
3057 				  PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
3058 	if (ret)
3059 		return ret;
3060 
3061 	/*
3062 	 * vmbus_allocate_mmio() gets used for allocating both device endpoint
3063 	 * resource claims (those which cannot be overlapped) and the ranges
3064 	 * which are valid for the children of this bus, which are intended
3065 	 * to be overlapped by those children.  Set the flag on this claim
3066 	 * meaning that this region can't be overlapped.
3067 	 */
3068 
3069 	hbus->mem_config->flags |= IORESOURCE_BUSY;
3070 
3071 	return 0;
3072 }
3073 
3074 static void hv_free_config_window(struct hv_pcibus_device *hbus)
3075 {
3076 	vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
3077 }
3078 
3079 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);
3080 
3081 /**
3082  * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
3083  * @hdev:	VMBus's tracking struct for this root PCI bus
3084  *
3085  * Return: 0 on success, -errno on failure
3086  */
3087 static int hv_pci_enter_d0(struct hv_device *hdev)
3088 {
3089 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3090 	struct pci_bus_d0_entry *d0_entry;
3091 	struct hv_pci_compl comp_pkt;
3092 	struct pci_packet *pkt;
3093 	int ret;
3094 
3095 	/*
3096 	 * Tell the host that the bus is ready to use, and moved into the
3097 	 * powered-on state.  This includes telling the host which region
3098 	 * of memory-mapped I/O space has been chosen for configuration space
3099 	 * access.
3100 	 */
3101 	pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
3102 	if (!pkt)
3103 		return -ENOMEM;
3104 
3105 	init_completion(&comp_pkt.host_event);
3106 	pkt->completion_func = hv_pci_generic_compl;
3107 	pkt->compl_ctxt = &comp_pkt;
3108 	d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
3109 	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
3110 	d0_entry->mmio_base = hbus->mem_config->start;
3111 
3112 	ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
3113 			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
3114 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
3115 	if (!ret)
3116 		ret = wait_for_response(hdev, &comp_pkt.host_event);
3117 
3118 	if (ret)
3119 		goto exit;
3120 
3121 	if (comp_pkt.completion_status < 0) {
3122 		dev_err(&hdev->device,
3123 			"PCI Pass-through VSP failed D0 Entry with status %x\n",
3124 			comp_pkt.completion_status);
3125 		ret = -EPROTO;
3126 		goto exit;
3127 	}
3128 
3129 	ret = 0;
3130 
3131 exit:
3132 	kfree(pkt);
3133 	return ret;
3134 }
3135 
3136 /**
3137  * hv_pci_query_relations() - Ask host to send list of child
3138  * devices
3139  * @hdev:	VMBus's tracking struct for this root PCI bus
3140  *
3141  * Return: 0 on success, -errno on failure
3142  */
3143 static int hv_pci_query_relations(struct hv_device *hdev)
3144 {
3145 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3146 	struct pci_message message;
3147 	struct completion comp;
3148 	int ret;
3149 
3150 	/* Ask the host to send along the list of child devices */
3151 	init_completion(&comp);
3152 	if (cmpxchg(&hbus->survey_event, NULL, &comp))
3153 		return -ENOTEMPTY;
3154 
3155 	memset(&message, 0, sizeof(message));
3156 	message.type = PCI_QUERY_BUS_RELATIONS;
3157 
3158 	ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
3159 			       0, VM_PKT_DATA_INBAND, 0);
3160 	if (!ret)
3161 		ret = wait_for_response(hdev, &comp);
3162 
3163 	return ret;
3164 }
3165 
3166 /**
3167  * hv_send_resources_allocated() - Report local resource choices
3168  * @hdev:	VMBus's tracking struct for this root PCI bus
3169  *
3170  * The host OS is expecting to be sent a request as a message
3171  * which contains all the resources that the device will use.
 * The response contains those same resources, "translated",
 * which is to say, the values which should be used by the
 * hardware when it delivers an interrupt.  (MMIO resources are
3175  * used in local terms.)  This is nice for Windows, and lines up
3176  * with the FDO/PDO split, which doesn't exist in Linux.  Linux
3177  * is deeply expecting to scan an emulated PCI configuration
3178  * space.  So this message is sent here only to drive the state
3179  * machine on the host forward.
3180  *
3181  * Return: 0 on success, -errno on failure
3182  */
3183 static int hv_send_resources_allocated(struct hv_device *hdev)
3184 {
3185 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3186 	struct pci_resources_assigned *res_assigned;
3187 	struct pci_resources_assigned2 *res_assigned2;
3188 	struct hv_pci_compl comp_pkt;
3189 	struct hv_pci_dev *hpdev;
3190 	struct pci_packet *pkt;
3191 	size_t size_res;
3192 	int wslot;
3193 	int ret;
3194 
3195 	size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
3196 			? sizeof(*res_assigned) : sizeof(*res_assigned2);
3197 
3198 	pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
3199 	if (!pkt)
3200 		return -ENOMEM;
3201 
3202 	ret = 0;
3203 
3204 	for (wslot = 0; wslot < 256; wslot++) {
3205 		hpdev = get_pcichild_wslot(hbus, wslot);
3206 		if (!hpdev)
3207 			continue;
3208 
3209 		memset(pkt, 0, sizeof(*pkt) + size_res);
3210 		init_completion(&comp_pkt.host_event);
3211 		pkt->completion_func = hv_pci_generic_compl;
3212 		pkt->compl_ctxt = &comp_pkt;
3213 
3214 		if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
3215 			res_assigned =
3216 				(struct pci_resources_assigned *)&pkt->message;
3217 			res_assigned->message_type.type =
3218 				PCI_RESOURCES_ASSIGNED;
3219 			res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
3220 		} else {
3221 			res_assigned2 =
3222 				(struct pci_resources_assigned2 *)&pkt->message;
3223 			res_assigned2->message_type.type =
3224 				PCI_RESOURCES_ASSIGNED2;
3225 			res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
3226 		}
3227 		put_pcichild(hpdev);
3228 
3229 		ret = vmbus_sendpacket(hdev->channel, &pkt->message,
3230 				size_res, (unsigned long)pkt,
3231 				VM_PKT_DATA_INBAND,
3232 				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
3233 		if (!ret)
3234 			ret = wait_for_response(hdev, &comp_pkt.host_event);
3235 		if (ret)
3236 			break;
3237 
3238 		if (comp_pkt.completion_status < 0) {
3239 			ret = -EPROTO;
3240 			dev_err(&hdev->device,
3241 				"resource allocated returned 0x%x",
3242 				comp_pkt.completion_status);
3243 			break;
3244 		}
3245 
3246 		hbus->wslot_res_allocated = wslot;
3247 	}
3248 
3249 	kfree(pkt);
3250 	return ret;
3251 }
3252 
3253 /**
3254  * hv_send_resources_released() - Report local resources
3255  * released
3256  * @hdev:	VMBus's tracking struct for this root PCI bus
3257  *
3258  * Return: 0 on success, -errno on failure
3259  */
3260 static int hv_send_resources_released(struct hv_device *hdev)
3261 {
3262 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3263 	struct pci_child_message pkt;
3264 	struct hv_pci_dev *hpdev;
3265 	int wslot;
3266 	int ret;
3267 
3268 	for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) {
3269 		hpdev = get_pcichild_wslot(hbus, wslot);
3270 		if (!hpdev)
3271 			continue;
3272 
3273 		memset(&pkt, 0, sizeof(pkt));
3274 		pkt.message_type.type = PCI_RESOURCES_RELEASED;
3275 		pkt.wslot.slot = hpdev->desc.win_slot.slot;
3276 
3277 		put_pcichild(hpdev);
3278 
3279 		ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
3280 				       VM_PKT_DATA_INBAND, 0);
3281 		if (ret)
3282 			return ret;
3283 
3284 		hbus->wslot_res_allocated = wslot - 1;
3285 	}
3286 
3287 	hbus->wslot_res_allocated = -1;
3288 
3289 	return 0;
3290 }
3291 
3292 #define HVPCI_DOM_MAP_SIZE (64 * 1024)
3293 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
3294 
3295 /*
3296  * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
3297  * as invalid for passthrough PCI devices of this driver.
3298  */
3299 #define HVPCI_DOM_INVALID 0
3300 
3301 /**
 * hv_get_dom_num() - Get a valid PCI domain number
 * @dom: Requested domain number
 *
 * Check if the requested PCI domain number is already in use, and if so
 * return another, unused number.
 *
 * Return: domain number on success, HVPCI_DOM_INVALID on failure
3309  */
3310 static u16 hv_get_dom_num(u16 dom)
3311 {
3312 	unsigned int i;
3313 
3314 	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
3315 		return dom;
3316 
3317 	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
3318 		if (test_and_set_bit(i, hvpci_dom_map) == 0)
3319 			return i;
3320 	}
3321 
3322 	return HVPCI_DOM_INVALID;
3323 }
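
/*
 * Example (hypothetical instance ID): hv_pci_probe() below derives
 * dom_req = b[5] << 8 | b[4] from the VMBus instance GUID, say 0x42ab.
 * If bit 0x42ab is still clear in hvpci_dom_map the same value is
 * returned; if another bus already claimed it, the first free bit in
 * the map is handed out instead.  hv_put_dom_num() releases the bit
 * when the bus goes away.
 */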
3324 
3325 /**
3326  * hv_put_dom_num() - Mark the PCI domain number as free
3327  * @dom: Domain number to be freed
3328  */
3329 static void hv_put_dom_num(u16 dom)
3330 {
3331 	clear_bit(dom, hvpci_dom_map);
3332 }
3333 
3334 /**
3335  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
3336  * @hdev:	VMBus's tracking struct for this root PCI bus
3337  * @dev_id:	Identifies the device itself
3338  *
3339  * Return: 0 on success, -errno on failure
3340  */
3341 static int hv_pci_probe(struct hv_device *hdev,
3342 			const struct hv_vmbus_device_id *dev_id)
3343 {
3344 	struct pci_host_bridge *bridge;
3345 	struct hv_pcibus_device *hbus;
3346 	u16 dom_req, dom;
3347 	char *name;
3348 	bool enter_d0_retry = true;
3349 	int ret;
3350 
3351 	/*
3352 	 * hv_pcibus_device contains the hypercall arguments for retargeting in
3353 	 * hv_irq_unmask(). Those must not cross a page boundary.
3354 	 */
3355 	BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
3356 
3357 	bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
3358 	if (!bridge)
3359 		return -ENOMEM;
3360 
3361 	/*
3362 	 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
3363 	 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
3364 	 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
3365 	 * alignment of hbus is important because hbus's field
3366 	 * retarget_msi_interrupt_params must not cross a 4KB page boundary.
3367 	 *
3368 	 * Here we prefer kzalloc to get_zeroed_page(), because a buffer
3369 	 * allocated by the latter is not tracked and scanned by kmemleak, and
3370 	 * hence kmemleak reports the pointer contained in the hbus buffer
3371 	 * (i.e. the hpdev struct, which is created in new_pcichild_device() and
3372 	 * is tracked by hbus->children) as memory leak (false positive).
3373 	 *
3374 	 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
3375 	 * used to allocate the hbus buffer and we can avoid the kmemleak false
3376 	 * positive by using kmemleak_alloc() and kmemleak_free() to ask
3377 	 * kmemleak to track and scan the hbus buffer.
3378 	 */
3379 	hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
3380 	if (!hbus)
3381 		return -ENOMEM;
3382 
3383 	hbus->bridge = bridge;
3384 	hbus->state = hv_pcibus_init;
3385 	hbus->wslot_res_allocated = -1;
3386 
3387 	/*
3388 	 * The PCI bus "domain" is what is called "segment" in ACPI and other
	 * specs. Pull it from the instance ID, to get something usually
	 * unique. In rare cases of collision, we will find another number
	 * that is not in use.
3392 	 *
3393 	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
3394 	 * together with this guest driver can guarantee that (1) The only
3395 	 * domain used by Gen1 VMs for something that looks like a physical
3396 	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
3397 	 * (2) There will be no overlap between domains (after fixing possible
3398 	 * collisions) in the same VM.
3399 	 */
3400 	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
3401 	dom = hv_get_dom_num(dom_req);
3402 
3403 	if (dom == HVPCI_DOM_INVALID) {
3404 		dev_err(&hdev->device,
3405 			"Unable to use dom# 0x%x or other numbers", dom_req);
3406 		ret = -EINVAL;
3407 		goto free_bus;
3408 	}
3409 
3410 	if (dom != dom_req)
3411 		dev_info(&hdev->device,
3412 			 "PCI dom# 0x%x has collision, using 0x%x",
3413 			 dom_req, dom);
3414 
3415 	hbus->bridge->domain_nr = dom;
3416 #ifdef CONFIG_X86
3417 	hbus->sysdata.domain = dom;
3418 #endif
3419 
3420 	hbus->hdev = hdev;
3421 	INIT_LIST_HEAD(&hbus->children);
3422 	INIT_LIST_HEAD(&hbus->dr_list);
3423 	spin_lock_init(&hbus->config_lock);
3424 	spin_lock_init(&hbus->device_list_lock);
3425 	spin_lock_init(&hbus->retarget_msi_interrupt_lock);
3426 	hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
3427 					   hbus->bridge->domain_nr);
3428 	if (!hbus->wq) {
3429 		ret = -ENOMEM;
3430 		goto free_dom;
3431 	}
3432 
3433 	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
3434 			 hv_pci_onchannelcallback, hbus);
3435 	if (ret)
3436 		goto destroy_wq;
3437 
3438 	hv_set_drvdata(hdev, hbus);
3439 
3440 	ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
3441 					  ARRAY_SIZE(pci_protocol_versions));
3442 	if (ret)
3443 		goto close;
3444 
3445 	ret = hv_allocate_config_window(hbus);
3446 	if (ret)
3447 		goto close;
3448 
3449 	hbus->cfg_addr = ioremap(hbus->mem_config->start,
3450 				 PCI_CONFIG_MMIO_LENGTH);
3451 	if (!hbus->cfg_addr) {
3452 		dev_err(&hdev->device,
3453 			"Unable to map a virtual address for config space\n");
3454 		ret = -ENOMEM;
3455 		goto free_config;
3456 	}
3457 
3458 	name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance);
3459 	if (!name) {
3460 		ret = -ENOMEM;
3461 		goto unmap;
3462 	}
3463 
3464 	hbus->fwnode = irq_domain_alloc_named_fwnode(name);
3465 	kfree(name);
3466 	if (!hbus->fwnode) {
3467 		ret = -ENOMEM;
3468 		goto unmap;
3469 	}
3470 
3471 	ret = hv_pcie_init_irq_domain(hbus);
3472 	if (ret)
3473 		goto free_fwnode;
3474 
3475 retry:
3476 	ret = hv_pci_query_relations(hdev);
3477 	if (ret)
3478 		goto free_irq_domain;
3479 
3480 	ret = hv_pci_enter_d0(hdev);
3481 	/*
3482 	 * In certain cases (e.g. kdump) the PCI device of interest was
3483 	 * not cleanly shut down and its resources are still held on the
3484 	 * host side, so the host could return an invalid device status.
3485 	 * We need to explicitly request that the host release the
3486 	 * resources and try to enter D0 again.
3487 	 * Since the hv_pci_bus_exit() call releases the structures of
3488 	 * all its child devices, we need to start the retry from the
3489 	 * hv_pci_query_relations() call, asking the host to send the
3490 	 * synchronous child device relations message again before that
3491 	 * information is needed by the later
3492 	 * hv_send_resources_allocated() call.
3493 	 */
3494 	if (ret == -EPROTO && enter_d0_retry) {
3495 		enter_d0_retry = false;
3496 
3497 		dev_err(&hdev->device, "Retrying D0 Entry\n");
3498 
3499 		/*
3500 		 * hv_pci_bus_exit() calls hv_send_resources_released()
3501 		 * to free up the resources of its child devices.
3502 		 * In the kdump kernel we need to set wslot_res_allocated
3503 		 * to 255 so that it scans all child devices and releases
3504 		 * the resources that were allocated in the normal kernel
3505 		 * before the panic happened.
3506 		 */
3507 		hbus->wslot_res_allocated = 255;
3508 		ret = hv_pci_bus_exit(hdev, true);
3509 
3510 		if (ret == 0)
3511 			goto retry;
3512 
3513 		dev_err(&hdev->device,
3514 			"Retrying D0 failed with ret %d\n", ret);
3515 	}
3516 	if (ret)
3517 		goto free_irq_domain;
3518 
3519 	ret = hv_pci_allocate_bridge_windows(hbus);
3520 	if (ret)
3521 		goto exit_d0;
3522 
3523 	ret = hv_send_resources_allocated(hdev);
3524 	if (ret)
3525 		goto free_windows;
3526 
3527 	prepopulate_bars(hbus);
3528 
3529 	hbus->state = hv_pcibus_probed;
3530 
3531 	ret = create_root_hv_pci_bus(hbus);
3532 	if (ret)
3533 		goto free_windows;
3534 
3535 	return 0;
3536 
3537 free_windows:
3538 	hv_pci_free_bridge_windows(hbus);
3539 exit_d0:
3540 	(void) hv_pci_bus_exit(hdev, true);
3541 free_irq_domain:
3542 	irq_domain_remove(hbus->irq_domain);
3543 free_fwnode:
3544 	irq_domain_free_fwnode(hbus->fwnode);
3545 unmap:
3546 	iounmap(hbus->cfg_addr);
3547 free_config:
3548 	hv_free_config_window(hbus);
3549 close:
3550 	vmbus_close(hdev->channel);
3551 destroy_wq:
3552 	destroy_workqueue(hbus->wq);
3553 free_dom:
3554 	hv_put_dom_num(hbus->bridge->domain_nr);
3555 free_bus:
3556 	kfree(hbus);
3557 	return ret;
3558 }
3559 
3560 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
3561 {
3562 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3563 	struct {
3564 		struct pci_packet teardown_packet;
3565 		u8 buffer[sizeof(struct pci_message)];
3566 	} pkt;
3567 	struct hv_pci_compl comp_pkt;
3568 	struct hv_pci_dev *hpdev, *tmp;
3569 	unsigned long flags;
3570 	int ret;
3571 
3572 	/*
3573 	 * After the host sends the RESCIND_CHANNEL message, it doesn't
3574 	 * access the per-channel ringbuffer any longer.
3575 	 */
3576 	if (hdev->channel->rescind)
3577 		return 0;
3578 
3579 	if (!keep_devs) {
3580 		struct list_head removed;
3581 
3582 		/* Move all present children to a list on the stack */
3583 		INIT_LIST_HEAD(&removed);
3584 		spin_lock_irqsave(&hbus->device_list_lock, flags);
3585 		list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry)
3586 			list_move_tail(&hpdev->list_entry, &removed);
3587 		spin_unlock_irqrestore(&hbus->device_list_lock, flags);
3588 
3589 		/* Remove all children in the list */
3590 		list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) {
3591 			list_del(&hpdev->list_entry);
3592 			if (hpdev->pci_slot)
3593 				pci_destroy_slot(hpdev->pci_slot);
3594 			/* For the two refs obtained in new_pcichild_device() */
3595 			put_pcichild(hpdev);
3596 			put_pcichild(hpdev);
3597 		}
3598 	}
3599 
3600 	ret = hv_send_resources_released(hdev);
3601 	if (ret) {
3602 		dev_err(&hdev->device,
3603 			"Couldn't send resources released packet(s)\n");
3604 		return ret;
3605 	}
3606 
3607 	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
3608 	init_completion(&comp_pkt.host_event);
3609 	pkt.teardown_packet.completion_func = hv_pci_generic_compl;
3610 	pkt.teardown_packet.compl_ctxt = &comp_pkt;
3611 	pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
3612 
3613 	ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
3614 			       sizeof(struct pci_message),
3615 			       (unsigned long)&pkt.teardown_packet,
3616 			       VM_PKT_DATA_INBAND,
3617 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
3618 	if (ret)
3619 		return ret;
3620 
3621 	if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0)
3622 		return -ETIMEDOUT;
3623 
3624 	return 0;
3625 }
3626 
3627 /**
3628  * hv_pci_remove() - Remove routine for this VMBus channel
3629  * @hdev:	VMBus's tracking struct for this root PCI bus
3630  *
3631  * Return: 0 on success, -errno on failure
3632  */
3633 static int hv_pci_remove(struct hv_device *hdev)
3634 {
3635 	struct hv_pcibus_device *hbus;
3636 	int ret;
3637 
3638 	hbus = hv_get_drvdata(hdev);
3639 	if (hbus->state == hv_pcibus_installed) {
3640 		tasklet_disable(&hdev->channel->callback_event);
3641 		hbus->state = hv_pcibus_removing;
3642 		tasklet_enable(&hdev->channel->callback_event);
3643 		destroy_workqueue(hbus->wq);
3644 		hbus->wq = NULL;
3645 		/*
3646 		 * At this point, no work is running or can be scheduled
3647 		 * on hbus->wq. We can't race with hv_pci_devices_present()
3648 		 * or hv_pci_eject_device(), so it's safe to proceed.
3649 		 */
3650 
3651 		/* Remove the bus from PCI's point of view. */
3652 		pci_lock_rescan_remove();
3653 		pci_stop_root_bus(hbus->bridge->bus);
3654 		hv_pci_remove_slots(hbus);
3655 		pci_remove_root_bus(hbus->bridge->bus);
3656 		pci_unlock_rescan_remove();
3657 	}
3658 
3659 	ret = hv_pci_bus_exit(hdev, false);
3660 
3661 	vmbus_close(hdev->channel);
3662 
3663 	iounmap(hbus->cfg_addr);
3664 	hv_free_config_window(hbus);
3665 	hv_pci_free_bridge_windows(hbus);
3666 	irq_domain_remove(hbus->irq_domain);
3667 	irq_domain_free_fwnode(hbus->fwnode);
3668 
3669 	hv_put_dom_num(hbus->bridge->domain_nr);
3670 
3671 	kfree(hbus);
3672 	return ret;
3673 }
3674 
3675 static int hv_pci_suspend(struct hv_device *hdev)
3676 {
3677 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3678 	enum hv_pcibus_state old_state;
3679 	int ret;
3680 
3681 	/*
3682 	 * hv_pci_suspend() must make sure there are no pending work items
3683 	 * before calling vmbus_close(), since it runs in a process context
3684 	 * as a callback in dpm_suspend().  When it starts to run, the channel
3685 	 * callback hv_pci_onchannelcallback(), which runs in a tasklet
3686 	 * context, can still be running concurrently and scheduling new work
3687 	 * items onto hbus->wq in hv_pci_devices_present() and
3688 	 * hv_pci_eject_device().  The work item handlers can access the
3689 	 * vmbus channel, which hv_pci_suspend() may be in the middle of
3690 	 * closing; e.g. the work item handler pci_devices_present_work() ->
3691 	 * new_pcichild_device() writes to the vmbus channel.
3692 	 *
3693 	 * To eliminate the race, hv_pci_suspend() disables the channel
3694 	 * callback tasklet, sets hbus->state to hv_pcibus_removing, and
3695 	 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
3696 	 * it knows that no new work item can be scheduled, and then it flushes
3697 	 * hbus->wq and safely closes the vmbus channel.
3698 	 */
3699 	tasklet_disable(&hdev->channel->callback_event);
3700 
3701 	/* Change the hbus state to prevent new work items. */
3702 	old_state = hbus->state;
3703 	if (hbus->state == hv_pcibus_installed)
3704 		hbus->state = hv_pcibus_removing;
3705 
3706 	tasklet_enable(&hdev->channel->callback_event);
3707 
3708 	if (old_state != hv_pcibus_installed)
3709 		return -EINVAL;
3710 
3711 	flush_workqueue(hbus->wq);
3712 
3713 	ret = hv_pci_bus_exit(hdev, true);
3714 	if (ret)
3715 		return ret;
3716 
3717 	vmbus_close(hdev->channel);
3718 
3719 	return 0;
3720 }
3721 
3722 static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
3723 {
3724 	struct irq_data *irq_data;
3725 	struct msi_desc *entry;
3726 	int ret = 0;
3727 
3728 	msi_lock_descs(&pdev->dev);
3729 	msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) {
3730 		irq_data = irq_get_irq_data(entry->irq);
3731 		if (WARN_ON_ONCE(!irq_data)) {
3732 			ret = -EINVAL;
3733 			break;
3734 		}
3735 
3736 		hv_compose_msi_msg(irq_data, &entry->msg);
3737 	}
3738 	msi_unlock_descs(&pdev->dev);
3739 
3740 	return ret;
3741 }
3742 
3743 /*
3744  * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
3745  * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
3746  * doesn't trap and emulate those MMIO accesses, hv_compose_msi_msg()
3747  * must be used here to ask Hyper-V to re-create the IOMMU Interrupt
3748  * Remapping Table entries.
3749  */
3750 static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
3751 {
3752 	pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL);
3753 }
3754 
3755 static int hv_pci_resume(struct hv_device *hdev)
3756 {
3757 	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3758 	enum pci_protocol_version_t version[1];
3759 	int ret;
3760 
3761 	hbus->state = hv_pcibus_init;
3762 
3763 	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
3764 			 hv_pci_onchannelcallback, hbus);
3765 	if (ret)
3766 		return ret;
3767 
3768 	/* Only use the version that was in use before hibernation. */
3769 	version[0] = hbus->protocol_version;
3770 	ret = hv_pci_protocol_negotiation(hdev, version, 1);
3771 	if (ret)
3772 		goto out;
3773 
3774 	ret = hv_pci_query_relations(hdev);
3775 	if (ret)
3776 		goto out;
3777 
3778 	ret = hv_pci_enter_d0(hdev);
3779 	if (ret)
3780 		goto out;
3781 
3782 	ret = hv_send_resources_allocated(hdev);
3783 	if (ret)
3784 		goto out;
3785 
3786 	prepopulate_bars(hbus);
3787 
3788 	hv_pci_restore_msi_state(hbus);
3789 
3790 	hbus->state = hv_pcibus_installed;
3791 	return 0;
3792 out:
3793 	vmbus_close(hdev->channel);
3794 	return ret;
3795 }
3796 
3797 static const struct hv_vmbus_device_id hv_pci_id_table[] = {
3798 	/* PCI Pass-through Class ID */
3799 	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
3800 	{ HV_PCIE_GUID, },
3801 	{ },
3802 };
3803 
3804 MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
3805 
3806 static struct hv_driver hv_pci_drv = {
3807 	.name		= "hv_pci",
3808 	.id_table	= hv_pci_id_table,
3809 	.probe		= hv_pci_probe,
3810 	.remove		= hv_pci_remove,
3811 	.suspend	= hv_pci_suspend,
3812 	.resume		= hv_pci_resume,
3813 };
3814 
3815 static void __exit exit_hv_pci_drv(void)
3816 {
3817 	vmbus_driver_unregister(&hv_pci_drv);
3818 
3819 	hvpci_block_ops.read_block = NULL;
3820 	hvpci_block_ops.write_block = NULL;
3821 	hvpci_block_ops.reg_blk_invalidate = NULL;
3822 }
3823 
3824 static int __init init_hv_pci_drv(void)
3825 {
3826 	int ret;
3827 
3828 	if (!hv_is_hyperv_initialized())
3829 		return -ENODEV;
3830 
3831 	ret = hv_pci_irqchip_init();
3832 	if (ret)
3833 		return ret;
3834 
3835 	/* Set the invalid domain number's bit, so it will not be used */
3836 	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
3837 
3838 	/* Initialize PCI block r/w interface; see the usage sketch below */
3839 	hvpci_block_ops.read_block = hv_read_config_block;
3840 	hvpci_block_ops.write_block = hv_write_config_block;
3841 	hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
3842 
3843 	return vmbus_driver_register(&hv_pci_drv);
3844 }
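
/*
 * Illustrative sketch (not part of this driver): a guest VF driver can use
 * the PCI block read/write interface registered in init_hv_pci_drv() via the
 * hvpci_block_ops pointers from the Hyper-V headers.  The "pdev", "buf",
 * "bytes" and block_id values below are hypothetical, and the callback
 * signature is assumed to match hv_read_config_block() in this file:
 *
 *	unsigned int bytes = 0;
 *	u8 buf[128];
 *
 *	if (hvpci_block_ops.read_block &&
 *	    hvpci_block_ops.read_block(pdev, buf, sizeof(buf), 1, &bytes) == 0)
 *		dev_info(&pdev->dev, "read %u bytes from block 1\n", bytes);
 */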
3845 
3846 module_init(init_hv_pci_drv);
3847 module_exit(exit_hv_pci_drv);
3848 
3849 MODULE_DESCRIPTION("Hyper-V PCI");
3850 MODULE_LICENSE("GPL v2");
3851