// SPDX-License-Identifier: GPL-2.0
/*
 * Volume Management Device driver
 * Copyright (c) 2015, Intel Corporation.
 */

#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/msi.h>
#include <linux/pci.h>
#include <linux/pci-acpi.h>
#include <linux/pci-ecam.h>
#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

#include <xen/xen.h>

#include <asm/irqdomain.h>

#define VMD_CFGBAR	0
#define VMD_MEMBAR1	2
#define VMD_MEMBAR2	4

#define PCI_REG_VMCAP		0x40
#define BUS_RESTRICT_CAP(vmcap)	(vmcap & 0x1)
#define PCI_REG_VMCONFIG	0x44
#define BUS_RESTRICT_CFG(vmcfg)	((vmcfg >> 8) & 0x3)
#define VMCONFIG_MSI_REMAP	0x2
#define PCI_REG_VMLOCK		0x70
#define MB2_SHADOW_EN(vmlock)	(vmlock & 0x2)

#define MB2_SHADOW_OFFSET	0x2000
#define MB2_SHADOW_SIZE		16

enum vmd_features {
	/*
	 * Device may contain registers which hint at the physical location of
	 * the MEMBARs, in order to allow proper address translation during
	 * resource assignment and enable guest virtualization.
	 */
	VMD_FEAT_HAS_MEMBAR_SHADOW		= (1 << 0),

	/*
	 * Device may provide root port configuration information which limits
	 * bus numbering.
	 */
	VMD_FEAT_HAS_BUS_RESTRICTIONS		= (1 << 1),

	/*
	 * Device contains physical location shadow registers in
	 * vendor-specific capability space.
	 */
	VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP	= (1 << 2),

	/*
	 * Device may use MSI-X vector 0 for software triggering; that vector
	 * will not be used for MSI remapping.
	 */
	VMD_FEAT_OFFSET_FIRST_VECTOR		= (1 << 3),

	/*
	 * Device can bypass remapping MSI-X transactions into its MSI-X table,
	 * avoiding the requirement of a VMD MSI domain for child device
	 * interrupt handling.
	 */
	VMD_FEAT_CAN_BYPASS_MSI_REMAP		= (1 << 4),

	/*
	 * Enable ASPM on the PCIe root ports and set the default LTR of the
	 * storage devices on platforms where these values are not configured
	 * by BIOS. This is needed for laptops, which require these settings
	 * for proper power management of the SoC.
	 */
	VMD_FEAT_BIOS_PM_QUIRK			= (1 << 5),
};

#define VMD_BIOS_PM_QUIRK_LTR	0x1003	/* 3145728 ns */

#define VMD_FEATS_CLIENT	(VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP |	\
				 VMD_FEAT_HAS_BUS_RESTRICTIONS |	\
				 VMD_FEAT_OFFSET_FIRST_VECTOR |		\
				 VMD_FEAT_BIOS_PM_QUIRK)

static DEFINE_IDA(vmd_instance_ida);

/*
 * Lock for manipulating VMD IRQ lists.
 */
static DEFINE_RAW_SPINLOCK(list_lock);

/**
 * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
 * @node:	list item for parent traversal.
 * @irq:	back pointer to parent.
 * @enabled:	true if driver enabled IRQ
 * @virq:	the virtual IRQ value provided to the requesting driver.
 *
 * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
 * a VMD IRQ using this structure.
 */
struct vmd_irq {
	struct list_head	node;
	struct vmd_irq_list	*irq;
	bool			enabled;
	unsigned int		virq;
};

/**
 * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
 * @irq_list:	the list of IRQs the VMD vector demuxes to.
 * @srcu:	SRCU struct for local synchronization.
 * @count:	number of child IRQs assigned to this vector; used to track
 *		sharing.
 * @virq:	the underlying VMD Linux interrupt number
 */
struct vmd_irq_list {
	struct list_head	irq_list;
	struct srcu_struct	srcu;
	unsigned int		count;
	unsigned int		virq;
};

struct vmd_dev {
	struct pci_dev		*dev;

	raw_spinlock_t		cfg_lock;
	void __iomem		*cfgbar;

	int			msix_count;
	struct vmd_irq_list	*irqs;

	struct pci_sysdata	sysdata;
	struct resource		resources[3];
	struct irq_domain	*irq_domain;
	struct pci_bus		*bus;
	u8			busn_start;
	u8			first_vec;
	char			*name;
	int			instance;
};

static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
{
	return container_of(bus->sysdata, struct vmd_dev, sysdata);
}

static inline unsigned int index_from_irqs(struct vmd_dev *vmd,
					   struct vmd_irq_list *irqs)
{
	return irqs - vmd->irqs;
}

/*
 * Drivers managing a device in a VMD domain allocate their own IRQs as before,
 * but the MSI entry for the hardware it's driving will be programmed with a
 * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its
 * domain into one of its own, and the VMD driver de-muxes these for the
 * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations
 * and irq_chip to set this up.
 */
static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	struct vmd_irq *vmdirq = data->chip_data;
	struct vmd_irq_list *irq = vmdirq->irq;
	struct vmd_dev *vmd = irq_data_get_irq_handler_data(data);

	memset(msg, 0, sizeof(*msg));
	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
	msg->arch_addr_lo.destid_0_7 = index_from_irqs(vmd, irq);
}

/*
 * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
 */
static void vmd_irq_enable(struct irq_data *data)
{
	struct vmd_irq *vmdirq = data->chip_data;
	unsigned long flags;

	raw_spin_lock_irqsave(&list_lock, flags);
	WARN_ON(vmdirq->enabled);
	list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
	vmdirq->enabled = true;
	raw_spin_unlock_irqrestore(&list_lock, flags);

	data->chip->irq_unmask(data);
}

static void vmd_irq_disable(struct irq_data *data)
{
	struct vmd_irq *vmdirq = data->chip_data;
	unsigned long flags;

	data->chip->irq_mask(data);

	raw_spin_lock_irqsave(&list_lock, flags);
	if (vmdirq->enabled) {
		list_del_rcu(&vmdirq->node);
		vmdirq->enabled = false;
	}
	raw_spin_unlock_irqrestore(&list_lock, flags);
}

/*
 * XXX: Stubbed until we develop acceptable way to not create conflicts with
 * other devices sharing the same vector.
 */
static int vmd_irq_set_affinity(struct irq_data *data,
				const struct cpumask *dest, bool force)
{
	return -EINVAL;
}

static struct irq_chip vmd_msi_controller = {
	.name			= "VMD-MSI",
	.irq_enable		= vmd_irq_enable,
	.irq_disable		= vmd_irq_disable,
	.irq_compose_msi_msg	= vmd_compose_msi_msg,
	.irq_set_affinity	= vmd_irq_set_affinity,
};

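/*
 * All child MSIs report hwirq 0; the VMD vector actually used is chosen
 * later in vmd_msi_init() via vmd_next_irq().
 */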
static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
				     msi_alloc_info_t *arg)
{
	return 0;
}

/*
 * XXX: We can be even smarter selecting the best IRQ once we solve the
 * affinity problem.
 */
static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
{
	unsigned long flags;
	int i, best;

	if (vmd->msix_count == 1 + vmd->first_vec)
		return &vmd->irqs[vmd->first_vec];

	/*
	 * White list for fast-interrupt handlers. All others will share the
	 * "slow" interrupt vector.
	 */
	switch (msi_desc_to_pci_dev(desc)->class) {
	case PCI_CLASS_STORAGE_EXPRESS:
		break;
	default:
		return &vmd->irqs[vmd->first_vec];
	}

	raw_spin_lock_irqsave(&list_lock, flags);
	best = vmd->first_vec + 1;
	for (i = best; i < vmd->msix_count; i++)
		if (vmd->irqs[i].count < vmd->irqs[best].count)
			best = i;
	vmd->irqs[best].count++;
	raw_spin_unlock_irqrestore(&list_lock, flags);

	return &vmd->irqs[best];
}

static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
			unsigned int virq, irq_hw_number_t hwirq,
			msi_alloc_info_t *arg)
{
	struct msi_desc *desc = arg->desc;
	struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
	struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);

	if (!vmdirq)
		return -ENOMEM;

	INIT_LIST_HEAD(&vmdirq->node);
	vmdirq->irq = vmd_next_irq(vmd, desc);
	vmdirq->virq = virq;

	irq_domain_set_info(domain, virq, vmdirq->irq->virq, info->chip, vmdirq,
			    handle_untracked_irq, vmd, NULL);
	return 0;
}

static void vmd_msi_free(struct irq_domain *domain,
			 struct msi_domain_info *info, unsigned int virq)
{
	struct vmd_irq *vmdirq = irq_get_chip_data(virq);
	unsigned long flags;

	synchronize_srcu(&vmdirq->irq->srcu);

	/* XXX: Potential optimization to rebalance */
	raw_spin_lock_irqsave(&list_lock, flags);
	vmdirq->irq->count--;
	raw_spin_unlock_irqrestore(&list_lock, flags);

	kfree(vmdirq);
}

static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
			   int nvec, msi_alloc_info_t *arg)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);

	if (nvec > vmd->msix_count)
		return vmd->msix_count;

	memset(arg, 0, sizeof(*arg));
	return 0;
}

static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
{
	arg->desc = desc;
}

static struct msi_domain_ops vmd_msi_domain_ops = {
	.get_hwirq	= vmd_get_hwirq,
	.msi_init	= vmd_msi_init,
	.msi_free	= vmd_msi_free,
	.msi_prepare	= vmd_msi_prepare,
	.set_desc	= vmd_set_desc,
};

static struct msi_domain_info vmd_msi_domain_info = {
	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
			  MSI_FLAG_PCI_MSIX,
	.ops		= &vmd_msi_domain_ops,
	.chip		= &vmd_msi_controller,
};

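/*
 * MSI remapping is controlled through the VMCONFIG_MSI_REMAP bit: setting the
 * bit disables remapping (bypass), clearing it remaps child MSI-X
 * transactions into the VMD MSI-X table.
 */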
static void vmd_set_msi_remapping(struct vmd_dev *vmd, bool enable)
{
	u16 reg;

	pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, &reg);
	reg = enable ? (reg & ~VMCONFIG_MSI_REMAP) :
		       (reg | VMCONFIG_MSI_REMAP);
	pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg);
}

static int vmd_create_irq_domain(struct vmd_dev *vmd)
{
	struct fwnode_handle *fn;

	fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain);
	if (!fn)
		return -ENODEV;

	vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, NULL);
	if (!vmd->irq_domain) {
		irq_domain_free_fwnode(fn);
		return -ENODEV;
	}

	return 0;
}

static void vmd_remove_irq_domain(struct vmd_dev *vmd)
{
	/*
	 * Some production BIOS won't enable remapping between soft reboots.
	 * Ensure remapping is restored before unloading the driver.
	 */
	if (!vmd->msix_count)
		vmd_set_msi_remapping(vmd, true);

	if (vmd->irq_domain) {
		struct fwnode_handle *fn = vmd->irq_domain->fwnode;

		irq_domain_remove(vmd->irq_domain);
		irq_domain_free_fwnode(fn);
	}
}

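/*
 * Compute the ECAM address of a child device's config space. Bus numbers are
 * relative to the domain's starting bus, and accesses beyond the end of the
 * CFGBAR are rejected.
 */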
static void __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
				  unsigned int devfn, int reg, int len)
{
	unsigned int busnr_ecam = bus->number - vmd->busn_start;
	u32 offset = PCIE_ECAM_OFFSET(busnr_ecam, devfn, reg);

	if (offset + len >= resource_size(&vmd->dev->resource[VMD_CFGBAR]))
		return NULL;

	return vmd->cfgbar + offset;
}

/*
 * CPU may deadlock if config space is not serialized on some versions of this
 * hardware, so all config space access is done under a spinlock.
 */
static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
			int len, u32 *value)
{
	struct vmd_dev *vmd = vmd_from_bus(bus);
	void __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
	unsigned long flags;
	int ret = 0;

	if (!addr)
		return -EFAULT;

	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
	switch (len) {
	case 1:
		*value = readb(addr);
		break;
	case 2:
		*value = readw(addr);
		break;
	case 4:
		*value = readl(addr);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
	return ret;
}

/*
 * VMD h/w converts non-posted config writes to posted memory writes. The
 * read-back in this function forces the completion so it returns only after
 * the config space was written, as expected.
 */
static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
			 int len, u32 value)
{
	struct vmd_dev *vmd = vmd_from_bus(bus);
	void __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
	unsigned long flags;
	int ret = 0;

	if (!addr)
		return -EFAULT;

	raw_spin_lock_irqsave(&vmd->cfg_lock, flags);
	switch (len) {
	case 1:
		writeb(value, addr);
		readb(addr);
		break;
	case 2:
		writew(value, addr);
		readw(addr);
		break;
	case 4:
		writel(value, addr);
		readl(addr);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	raw_spin_unlock_irqrestore(&vmd->cfg_lock, flags);
	return ret;
}

static struct pci_ops vmd_ops = {
	.read		= vmd_pci_read,
	.write		= vmd_pci_write,
};

#ifdef CONFIG_ACPI
static struct acpi_device *vmd_acpi_find_companion(struct pci_dev *pci_dev)
{
	struct pci_host_bridge *bridge;
	u32 busnr, addr;

	if (pci_dev->bus->ops != &vmd_ops)
		return NULL;

	bridge = pci_find_host_bridge(pci_dev->bus);
	busnr = pci_dev->bus->number - bridge->bus->number;
	/*
	 * The address computation below is only applicable to relative bus
	 * numbers below 32.
	 */
	if (busnr > 31)
		return NULL;

	addr = (busnr << 24) | ((u32)pci_dev->devfn << 16) | 0x8000FFFFU;

	dev_dbg(&pci_dev->dev, "Looking for ACPI companion (address 0x%x)\n",
		addr);

	return acpi_find_child_device(ACPI_COMPANION(bridge->dev.parent), addr,
				      false);
}

static bool hook_installed;

static void vmd_acpi_begin(void)
{
	if (pci_acpi_set_companion_lookup_hook(vmd_acpi_find_companion))
		return;

	hook_installed = true;
}

static void vmd_acpi_end(void)
{
	if (!hook_installed)
		return;

	pci_acpi_clear_companion_lookup_hook();
	hook_installed = false;
}
#else
static inline void vmd_acpi_begin(void) { }
static inline void vmd_acpi_end(void) { }
#endif /* CONFIG_ACPI */

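/*
 * Walk every possible bridge function behind the VMD and clear the I/O,
 * memory and prefetchable windows that firmware may have left programmed,
 * writing base > limit values so each window is disabled.
 */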
static void vmd_domain_reset(struct vmd_dev *vmd)
{
	u16 bus, max_buses = resource_size(&vmd->resources[0]);
	u8 dev, functions, fn, hdr_type;
	char __iomem *base;

	for (bus = 0; bus < max_buses; bus++) {
		for (dev = 0; dev < 32; dev++) {
			base = vmd->cfgbar + PCIE_ECAM_OFFSET(bus,
						PCI_DEVFN(dev, 0), 0);

			hdr_type = readb(base + PCI_HEADER_TYPE);

			functions = (hdr_type & 0x80) ? 8 : 1;
			for (fn = 0; fn < functions; fn++) {
				base = vmd->cfgbar + PCIE_ECAM_OFFSET(bus,
						PCI_DEVFN(dev, fn), 0);

				hdr_type = readb(base + PCI_HEADER_TYPE) &
						PCI_HEADER_TYPE_MASK;

				if (hdr_type != PCI_HEADER_TYPE_BRIDGE ||
				    (readw(base + PCI_CLASS_DEVICE) !=
				     PCI_CLASS_BRIDGE_PCI))
					continue;

				/*
				 * Temporarily disable the I/O range before
				 * updating PCI_IO_BASE.
				 */
				writel(0x0000ffff, base + PCI_IO_BASE_UPPER16);
				/* Update lower 16 bits of I/O base/limit */
				writew(0x00f0, base + PCI_IO_BASE);
				/* Update upper 16 bits of I/O base/limit */
				writel(0, base + PCI_IO_BASE_UPPER16);

				/* MMIO Base/Limit */
				writel(0x0000fff0, base + PCI_MEMORY_BASE);

				/* Prefetchable MMIO Base/Limit */
				writel(0, base + PCI_PREF_LIMIT_UPPER32);
				writel(0x0000fff0, base + PCI_PREF_MEMORY_BASE);
				writel(0xffffffff, base + PCI_PREF_BASE_UPPER32);
			}
		}
	}
}

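/*
 * Link the domain's MEMBAR windows as children of the VMD endpoint's BARs so
 * the resource tree reflects where the child resources actually live.
 */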
static void vmd_attach_resources(struct vmd_dev *vmd)
{
	vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
	vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
}

static void vmd_detach_resources(struct vmd_dev *vmd)
{
	vmd->dev->resource[VMD_MEMBAR1].child = NULL;
	vmd->dev->resource[VMD_MEMBAR2].child = NULL;
}

/*
 * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
 * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower
 * 16 bits are the PCI Segment Group (domain) number. Other bits are
 * currently reserved.
 */
static int vmd_find_free_domain(void)
{
	int domain = 0xffff;
	struct pci_bus *bus = NULL;

	while ((bus = pci_find_next_bus(bus)) != NULL)
		domain = max_t(int, domain, pci_domain_nr(bus));
	return domain + 1;
}

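/*
 * Read the host physical addresses of MEMBAR1/MEMBAR2 from either the shadow
 * registers in MEMBAR2 (native hint) or the hypervisor-emulated
 * vendor-specific capability, and return the offsets between the assigned
 * BAR addresses and those physical addresses.
 */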
static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint,
				resource_size_t *offset1,
				resource_size_t *offset2)
{
	struct pci_dev *dev = vmd->dev;
	u64 phys1, phys2;

	if (native_hint) {
		u32 vmlock;
		int ret;

		ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, &vmlock);
		if (ret || PCI_POSSIBLE_ERROR(vmlock))
			return -ENODEV;

		if (MB2_SHADOW_EN(vmlock)) {
			void __iomem *membar2;

			membar2 = pci_iomap(dev, VMD_MEMBAR2, 0);
			if (!membar2)
				return -ENOMEM;
			phys1 = readq(membar2 + MB2_SHADOW_OFFSET);
			phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8);
			pci_iounmap(dev, membar2);
		} else
			return 0;
	} else {
		/* Hypervisor-Emulated Vendor-Specific Capability */
		int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
		u32 reg, regu;

		pci_read_config_dword(dev, pos + 4, &reg);

		/* "SHDW" */
		if (pos && reg == 0x53484457) {
			pci_read_config_dword(dev, pos + 8, &reg);
			pci_read_config_dword(dev, pos + 12, &regu);
			phys1 = (u64) regu << 32 | reg;

			pci_read_config_dword(dev, pos + 16, &reg);
			pci_read_config_dword(dev, pos + 20, &regu);
			phys2 = (u64) regu << 32 | reg;
		} else
			return 0;
	}

	*offset1 = dev->resource[VMD_MEMBAR1].start -
			(phys1 & PCI_BASE_ADDRESS_MEM_MASK);
	*offset2 = dev->resource[VMD_MEMBAR2].start -
			(phys2 & PCI_BASE_ADDRESS_MEM_MASK);

	return 0;
}

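/*
 * If bus restrictions are advertised in VMCAP, translate the BUS_RESTRICT_CFG
 * encoding into the domain's starting bus number: 0, 128 or 224.
 */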
static int vmd_get_bus_number_start(struct vmd_dev *vmd)
{
	struct pci_dev *dev = vmd->dev;
	u16 reg;

	pci_read_config_word(dev, PCI_REG_VMCAP, &reg);
	if (BUS_RESTRICT_CAP(reg)) {
		pci_read_config_word(dev, PCI_REG_VMCONFIG, &reg);

		switch (BUS_RESTRICT_CFG(reg)) {
		case 0:
			vmd->busn_start = 0;
			break;
		case 1:
			vmd->busn_start = 128;
			break;
		case 2:
			vmd->busn_start = 224;
			break;
		default:
			pci_err(dev, "Unknown Bus Offset Setting (%d)\n",
				BUS_RESTRICT_CFG(reg));
			return -ENODEV;
		}
	}

	return 0;
}

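/*
 * Interrupt handler for one VMD MSI-X vector: demultiplex to every child IRQ
 * currently attached to this vector, under SRCU so teardown can wait for
 * in-flight handlers.
 */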
static irqreturn_t vmd_irq(int irq, void *data)
{
	struct vmd_irq_list *irqs = data;
	struct vmd_irq *vmdirq;
	int idx;

	idx = srcu_read_lock(&irqs->srcu);
	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
		generic_handle_irq(vmdirq->virq);
	srcu_read_unlock(&irqs->srcu, idx);

	return IRQ_HANDLED;
}

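/*
 * Allocate the VMD's own MSI-X vectors (at least first_vec + 1) and request
 * the vmd_irq() demux handler for each of them.
 */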
static int vmd_alloc_irqs(struct vmd_dev *vmd)
{
	struct pci_dev *dev = vmd->dev;
	int i, err;

	vmd->msix_count = pci_msix_vec_count(dev);
	if (vmd->msix_count < 0)
		return -ENODEV;

	vmd->msix_count = pci_alloc_irq_vectors(dev, vmd->first_vec + 1,
						vmd->msix_count, PCI_IRQ_MSIX);
	if (vmd->msix_count < 0)
		return vmd->msix_count;

	vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs),
				 GFP_KERNEL);
	if (!vmd->irqs)
		return -ENOMEM;

	for (i = 0; i < vmd->msix_count; i++) {
		err = init_srcu_struct(&vmd->irqs[i].srcu);
		if (err)
			return err;

		INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
		vmd->irqs[i].virq = pci_irq_vector(dev, i);
		err = devm_request_irq(&dev->dev, vmd->irqs[i].virq,
				       vmd_irq, IRQF_NO_THREAD,
				       vmd->name, &vmd->irqs[i]);
		if (err)
			return err;
	}

	return 0;
}

/*
 * Since VMD is an aperture to regular PCIe root ports, only allow it to
 * control features that the OS is allowed to control on the physical PCI bus.
 */
static void vmd_copy_host_bridge_flags(struct pci_host_bridge *root_bridge,
				       struct pci_host_bridge *vmd_bridge)
{
	vmd_bridge->native_pcie_hotplug = root_bridge->native_pcie_hotplug;
	vmd_bridge->native_shpc_hotplug = root_bridge->native_shpc_hotplug;
	vmd_bridge->native_aer = root_bridge->native_aer;
	vmd_bridge->native_pme = root_bridge->native_pme;
	vmd_bridge->native_ltr = root_bridge->native_ltr;
	vmd_bridge->native_dpc = root_bridge->native_dpc;
}

/*
 * Enable ASPM and LTR settings on devices that aren't configured by BIOS.
 */
static int vmd_pm_enable_quirk(struct pci_dev *pdev, void *userdata)
{
	unsigned long features = *(unsigned long *)userdata;
	u16 ltr = VMD_BIOS_PM_QUIRK_LTR;
	u32 ltr_reg;
	int pos;

	if (!(features & VMD_FEAT_BIOS_PM_QUIRK))
		return 0;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_LTR);
	if (!pos)
		goto out_state_change;

	/*
	 * Skip if the max snoop LTR is non-zero, indicating BIOS has set it
	 * so the LTR quirk is not needed.
	 */
	pci_read_config_dword(pdev, pos + PCI_LTR_MAX_SNOOP_LAT, &ltr_reg);
	if (!!(ltr_reg & (PCI_LTR_VALUE_MASK | PCI_LTR_SCALE_MASK)))
		goto out_state_change;

	/*
	 * Set the default values to the maximum required by the platform to
	 * allow the deepest power management savings. Write as a DWORD where
	 * the lower word is the max snoop latency and the upper word is the
	 * max non-snoop latency.
	 */
	ltr_reg = (ltr << 16) | ltr;
	pci_write_config_dword(pdev, pos + PCI_LTR_MAX_SNOOP_LAT, ltr_reg);
	pci_info(pdev, "VMD: Default LTR value set by driver\n");

out_state_change:
	/*
	 * Ensure devices are in D0 before enabling PCI-PM L1 PM Substates, per
	 * PCIe r6.0, sec 5.5.4.
	 */
	pci_set_power_state_locked(pdev, PCI_D0);
	pci_enable_link_state_locked(pdev, PCIE_LINK_STATE_ALL);
	return 0;
}

static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
{
	struct pci_sysdata *sd = &vmd->sysdata;
	struct resource *res;
	u32 upper_bits;
	unsigned long flags;
	LIST_HEAD(resources);
	resource_size_t offset[2] = {0};
	resource_size_t membar2_offset = 0x2000;
	struct pci_bus *child;
	struct pci_dev *dev;
	int ret;

	/*
	 * Shadow registers may exist in certain VMD device ids which allow
	 * guests to correctly assign host physical addresses to the root ports
	 * and child devices. These registers will either return the host value
	 * or 0, depending on an enable bit in the VMD device.
	 */
	if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) {
		membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE;
		ret = vmd_get_phys_offsets(vmd, true, &offset[0], &offset[1]);
		if (ret)
			return ret;
	} else if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) {
		ret = vmd_get_phys_offsets(vmd, false, &offset[0], &offset[1]);
		if (ret)
			return ret;
	}

	/*
	 * Certain VMD devices may have a root port configuration option which
	 * limits the bus range to between 0-127, 128-255, or 224-255
	 */
	if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) {
		ret = vmd_get_bus_number_start(vmd);
		if (ret)
			return ret;
	}

	res = &vmd->dev->resource[VMD_CFGBAR];
	vmd->resources[0] = (struct resource) {
		.name  = "VMD CFGBAR",
		.start = vmd->busn_start,
		.end   = vmd->busn_start + (resource_size(res) >> 20) - 1,
		.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
	};

	/*
	 * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
	 * put 32-bit resources in the window.
	 *
	 * There's no hardware reason why a 64-bit window *couldn't*
	 * contain a 32-bit resource, but pbus_size_mem() computes the
	 * bridge window size assuming a 64-bit window will contain no
	 * 32-bit resources. __pci_assign_resource() enforces that
	 * artificial restriction to make sure everything will fit.
	 *
	 * The only way we could use a 64-bit non-prefetchable MEMBAR is
	 * if its address is <4GB so that we can convert it to a 32-bit
	 * resource. To be visible to the host OS, all VMD endpoints must
	 * be initially configured by platform BIOS, which includes setting
	 * up these resources. We can assume the device is configured
	 * according to the platform needs.
	 */
	res = &vmd->dev->resource[VMD_MEMBAR1];
	upper_bits = upper_32_bits(res->end);
	flags = res->flags & ~IORESOURCE_SIZEALIGN;
	if (!upper_bits)
		flags &= ~IORESOURCE_MEM_64;
	vmd->resources[1] = (struct resource) {
		.name  = "VMD MEMBAR1",
		.start = res->start,
		.end   = res->end,
		.flags = flags,
		.parent = res,
	};

	res = &vmd->dev->resource[VMD_MEMBAR2];
	upper_bits = upper_32_bits(res->end);
	flags = res->flags & ~IORESOURCE_SIZEALIGN;
	if (!upper_bits)
		flags &= ~IORESOURCE_MEM_64;
	vmd->resources[2] = (struct resource) {
		.name  = "VMD MEMBAR2",
		.start = res->start + membar2_offset,
		.end   = res->end,
		.flags = flags,
		.parent = res,
	};

	sd->vmd_dev = vmd->dev;
	sd->domain = vmd_find_free_domain();
	if (sd->domain < 0)
		return sd->domain;

	sd->node = pcibus_to_node(vmd->dev->bus);

	/*
	 * Currently MSI remapping must be enabled in guest passthrough mode
	 * due to some missing interrupt remapping plumbing. This is probably
	 * acceptable because the guest is usually CPU-limited and MSI
	 * remapping doesn't become a performance bottleneck.
	 */
	if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) ||
	    offset[0] || offset[1]) {
		ret = vmd_alloc_irqs(vmd);
		if (ret)
			return ret;

		vmd_set_msi_remapping(vmd, true);

		ret = vmd_create_irq_domain(vmd);
		if (ret)
			return ret;

		/*
		 * Override the IRQ domain bus token so the domain can be
		 * distinguished from a regular PCI/MSI domain.
		 */
		irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI);
	} else {
		vmd_set_msi_remapping(vmd, false);
	}

	pci_add_resource(&resources, &vmd->resources[0]);
	pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]);
	pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]);

	vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start,
				       &vmd_ops, sd, &resources);
	if (!vmd->bus) {
		pci_free_resource_list(&resources);
		vmd_remove_irq_domain(vmd);
		return -ENODEV;
	}

	vmd_copy_host_bridge_flags(pci_find_host_bridge(vmd->dev->bus),
				   to_pci_host_bridge(vmd->bus->bridge));

	vmd_attach_resources(vmd);
	if (vmd->irq_domain)
		dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
	else
		dev_set_msi_domain(&vmd->bus->dev,
				   dev_get_msi_domain(&vmd->dev->dev));

	WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj,
			       "domain"), "Can't create symlink to domain\n");

	vmd_acpi_begin();

	pci_scan_child_bus(vmd->bus);
	vmd_domain_reset(vmd);

	/*
	 * When Intel VMD is enabled, the OS does not discover the Root Ports
	 * owned by Intel VMD within the MMCFG space. pci_reset_bus() applies
	 * a reset to the parent of the PCI device supplied as argument. This
	 * is why we pass a child device, so the reset can be triggered at
	 * the Intel bridge level and propagated to all the children in the
	 * hierarchy.
	 */
	list_for_each_entry(child, &vmd->bus->children, node) {
		if (!list_empty(&child->devices)) {
			dev = list_first_entry(&child->devices,
					       struct pci_dev, bus_list);
			ret = pci_reset_bus(dev);
			if (ret)
				pci_warn(dev, "can't reset device: %d\n", ret);

			break;
		}
	}

	pci_assign_unassigned_bus_resources(vmd->bus);

	pci_walk_bus(vmd->bus, vmd_pm_enable_quirk, &features);

	/*
	 * VMD root buses are virtual and don't return true on pci_is_pcie()
	 * and will fail pcie_bus_configure_settings() early. It can instead be
	 * run on each of the real root ports.
	 */
	list_for_each_entry(child, &vmd->bus->children, node)
		pcie_bus_configure_settings(child);

	pci_bus_add_devices(vmd->bus);

	vmd_acpi_end();
	return 0;
}

static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
	unsigned long features = (unsigned long) id->driver_data;
	struct vmd_dev *vmd;
	int err;

	if (xen_domain()) {
		/*
		 * Xen doesn't have knowledge about devices in the VMD bus
		 * because the config space of devices behind the VMD bridge is
		 * not known to Xen, and hence Xen cannot discover or configure
		 * them in any way.
		 *
		 * Bypass of MSI remapping won't work in that case as direct
		 * write by Linux to the MSI entries won't result in functional
		 * interrupts, as Xen is the entity that manages the host
		 * interrupt controller and must configure interrupts. However
		 * multiplexing of interrupts by the VMD bridge will work under
		 * Xen, so force the usage of that mode which must always be
		 * supported by VMD bridges.
		 */
		features &= ~VMD_FEAT_CAN_BYPASS_MSI_REMAP;
	}

	if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
		return -ENOMEM;

	vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL);
	if (!vmd)
		return -ENOMEM;

	vmd->dev = dev;
	vmd->instance = ida_simple_get(&vmd_instance_ida, 0, 0, GFP_KERNEL);
	if (vmd->instance < 0)
		return vmd->instance;

	vmd->name = devm_kasprintf(&dev->dev, GFP_KERNEL, "vmd%d",
				   vmd->instance);
	if (!vmd->name) {
		err = -ENOMEM;
		goto out_release_instance;
	}

	err = pcim_enable_device(dev);
	if (err < 0)
		goto out_release_instance;

	vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
	if (!vmd->cfgbar) {
		err = -ENOMEM;
		goto out_release_instance;
	}

	pci_set_master(dev);
	if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) {
		err = -ENODEV;
		goto out_release_instance;
	}

	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
		vmd->first_vec = 1;

	raw_spin_lock_init(&vmd->cfg_lock);
	pci_set_drvdata(dev, vmd);
	err = vmd_enable_domain(vmd, features);
	if (err)
		goto out_release_instance;

	dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
		 vmd->sysdata.domain);
	return 0;

 out_release_instance:
	ida_simple_remove(&vmd_instance_ida, vmd->instance);
	return err;
}

static void vmd_cleanup_srcu(struct vmd_dev *vmd)
{
	int i;

	for (i = 0; i < vmd->msix_count; i++)
		cleanup_srcu_struct(&vmd->irqs[i].srcu);
}

static void vmd_remove(struct pci_dev *dev)
{
	struct vmd_dev *vmd = pci_get_drvdata(dev);

	pci_stop_root_bus(vmd->bus);
	sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
	pci_remove_root_bus(vmd->bus);
	vmd_cleanup_srcu(vmd);
	vmd_detach_resources(vmd);
	vmd_remove_irq_domain(vmd);
	ida_simple_remove(&vmd_instance_ida, vmd->instance);
}

static void vmd_shutdown(struct pci_dev *dev)
{
	struct vmd_dev *vmd = pci_get_drvdata(dev);

	vmd_remove_irq_domain(vmd);
}

#ifdef CONFIG_PM_SLEEP
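/*
 * The demux handlers are freed before suspend and re-requested on resume;
 * resume also re-programs MSI remapping to match whether an IRQ domain is in
 * use.
 */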
static int vmd_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct vmd_dev *vmd = pci_get_drvdata(pdev);
	int i;

	for (i = 0; i < vmd->msix_count; i++)
		devm_free_irq(dev, vmd->irqs[i].virq, &vmd->irqs[i]);

	return 0;
}

static int vmd_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct vmd_dev *vmd = pci_get_drvdata(pdev);
	int err, i;

	if (vmd->irq_domain)
		vmd_set_msi_remapping(vmd, true);
	else
		vmd_set_msi_remapping(vmd, false);

	for (i = 0; i < vmd->msix_count; i++) {
		err = devm_request_irq(dev, vmd->irqs[i].virq,
				       vmd_irq, IRQF_NO_THREAD,
				       vmd->name, &vmd->irqs[i]);
		if (err)
			return err;
	}

	return 0;
}
#endif
static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume);

static const struct pci_device_id vmd_ids[] = {
	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_201D),
		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,},
	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0),
		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW |
				VMD_FEAT_HAS_BUS_RESTRICTIONS |
				VMD_FEAT_CAN_BYPASS_MSI_REMAP,},
	{PCI_VDEVICE(INTEL, 0x467f),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0x4c3d),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xa77f),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0x7d0b),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xad0b),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xb60b),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xb06f),
		.driver_data = VMD_FEATS_CLIENT,},
	{0,}
};
MODULE_DEVICE_TABLE(pci, vmd_ids);

static struct pci_driver vmd_drv = {
	.name		= "vmd",
	.id_table	= vmd_ids,
	.probe		= vmd_probe,
	.remove		= vmd_remove,
	.shutdown	= vmd_shutdown,
	.driver		= {
		.pm	= &vmd_dev_pm_ops,
	},
};
module_pci_driver(vmd_drv);

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
MODULE_VERSION("0.6");