xref: /openbmc/linux/drivers/vfio/pci/vfio_pci.c (revision afc98d90)
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO PCI - User Level meta-driver"

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
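
/*
 * A minimal usage sketch (an illustration, not part of the driver):
 * loading with "modprobe vfio-pci nointxmask=1" disables INTx masking
 * for all devices, which can work around hardware that advertises
 * DisINTx support but does not honor it.  Because the parameter is
 * S_IWUSR, it can also be toggled at runtime through
 * /sys/module/vfio_pci/parameters/nointxmask.
 */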

static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	vdev->reset_works = (pci_reset_function(pdev) == 0);
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));

	ret = vfio_config_init(vdev);
	if (ret) {
		pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state);
		pci_disable_device(pdev);
		return ret;
	}

	if (likely(!nointxmask))
		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
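		/*
		 * Each MSI-X table entry is 16 bytes per the PCI spec, so
		 * the table spans (QSIZE + 1) entries starting at
		 * msix_offset within msix_bar.  vfio_pci_mmap() uses this
		 * geometry to reject mappings that overlap the table.
		 */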
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

#ifdef CONFIG_VFIO_PCI_VGA
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vdev->has_vga = true;
#endif

	return 0;
}

static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int bar;

	pci_disable_device(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	vdev->virq_disabled = false;

	vfio_config_free(vdev);

	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pr_info("%s: Couldn't reload %s saved state\n",
			__func__, dev_name(&pdev->dev));

		if (!vdev->reset_works)
			return;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to reset the device.  The success of this is dependent on
	 * being able to lock the device, which is not always possible.
	 */
	if (vdev->reset_works) {
		int ret = pci_try_reset_function(pdev);
		if (ret)
			pr_warn("%s: Failed to reset device %s (%d)\n",
				__func__, dev_name(&pdev->dev), ret);
	}

	pci_restore_state(pdev);
}

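/* The first open of a device enables it; the last release disables it. */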
static void vfio_pci_release(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	if (atomic_dec_and_test(&vdev->refcnt))
		vfio_pci_disable(vdev);

	module_put(THIS_MODULE);
}

static int vfio_pci_open(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	if (atomic_inc_return(&vdev->refcnt) == 1) {
		int ret = vfio_pci_enable(vdev);
		if (ret) {
			module_put(THIS_MODULE);
			return ret;
		}
	}

	return 0;
}

static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;
		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
		if (pin)
			return 1;

	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);

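			/*
			 * Multiple Message Capable is a 3-bit field at
			 * bits 3:1 of Message Control, encoding log2 of
			 * the number of vectors the device supports.
			 */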
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
		if (pci_is_pcie(vdev->pdev))
			return 1;

	return 0;
}

static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}

struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};

static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;
	int id, i;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EPERM;

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)
			break;

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
}

static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}

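/*
 * All of the "get info" ioctls follow the standard vfio argsz protocol:
 * userspace sets argsz to the size of its buffer, the kernel validates it
 * against the minimum it needs and copies back only that much.  A minimal
 * caller sketch (assuming a device fd obtained via
 * VFIO_GROUP_GET_DEVICE_FD):
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *
 *	if (!ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info))
 *		printf("regions %u, irqs %u\n",
 *		       info.num_regions, info.num_irqs);
 */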
static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (pci_resource_flags(pdev, info.index) &
			    IORESOURCE_MEM && info.size >= PAGE_SIZE)
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size)
				break;

			/* Is it really there? */
			io = pci_map_rom(pdev, &size);
			if (!io || !size) {
				info.size = 0;
				break;
			}
			pci_unmap_rom(pdev, io);

			info.flags = VFIO_REGION_INFO_FLAG_READ;
			break;
		}
		case VFIO_PCI_VGA_REGION_INDEX:
			if (!vdev->has_vga)
				return -EINVAL;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0xc0000;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;

			break;
		default:
			return -EINVAL;
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
			break;
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))
				break;
		/* fall through to return error */
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
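		/*
		 * A hedged caller sketch for wiring one MSI vector to an
		 * eventfd; the variable-length payload follows the header
		 * exactly as validated below (efd is assumed to come from
		 * eventfd(0, 0)):
		 *
		 *	struct { struct vfio_irq_set set; int32_t fd; } buf = {
		 *		.set = {
		 *			.argsz = sizeof(buf),
		 *			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
		 *				 VFIO_IRQ_SET_ACTION_TRIGGER,
		 *			.index = VFIO_PCI_MSI_IRQ_INDEX,
		 *			.start = 0,
		 *			.count = 1,
		 *		},
		 *		.fd = efd,
		 *	};
		 *
		 *	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &buf);
		 */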
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
			return -EINVAL;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			size_t size;
			int max = vfio_pci_get_irq_count(vdev, hdr.index);

			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
				size = sizeof(uint8_t);
			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
				size = sizeof(int32_t);
			else
				return -EINVAL;

			if (hdr.argsz - minsz < hdr.count * size ||
			    hdr.start >= max || hdr.start + hdr.count > max)
				return -EINVAL;

			data = memdup_user((void __user *)(arg + minsz),
					   hdr.count * size);
			if (IS_ERR(data))
				return PTR_ERR(data);
		}

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);
		kfree(data);

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET) {
		return vdev->reset_works ?
			pci_try_reset_function(vdev->pdev) : -EINVAL;

	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
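		/*
		 * Userspace typically calls this twice: once with a bare
		 * header to learn hdr.count from the -ENOSPC reply, then
		 * again with argsz grown to cover that many dependent
		 * device entries.
		 */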
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;
		bool slot = false;
		int ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz)
			return -EINVAL;

		hdr.flags = 0;

		/* Can we do a slot reset, a bus reset, or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &fill.max, slot);
		if (ret)
			return ret;

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max.  If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		int32_t *group_fds;
		struct vfio_pci_group_entry *groups;
		struct vfio_pci_group_info info;
		bool slot = false;
		int i, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.flags)
			return -EINVAL;

		/* Can we do a slot reset, a bus reset, or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be.  Note groups can have multiple devices so
		 * one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &count, slot);
		if (ret)
			return ret;

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)
			return -EINVAL;

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
		if (!group_fds || !groups) {
			kfree(group_fds);
			kfree(groups);
			return -ENOMEM;
		}

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {
			kfree(group_fds);
			kfree(groups);
			return -EFAULT;
		}

		/*
		 * For each group_fd, get the group through the vfio external
		 * user interface and store the group and iommu ID.  This
		 * ensures the group is held across the reset.
		 */
		for (i = 0; i < hdr.count; i++) {
			struct vfio_group *group;
			struct fd f = fdget(group_fds[i]);
			if (!f.file) {
				ret = -EBADF;
				break;
			}

			group = vfio_group_get_external_user(f.file);
			fdput(f);
			if (IS_ERR(group)) {
				ret = PTR_ERR(group);
				break;
			}

			groups[i].group = group;
			groups[i].id = vfio_external_user_iommu_id(group);
		}

		kfree(group_fds);

		/* release reference to groups on error */
		if (ret)
			goto hot_reset_release;

		info.count = hdr.count;
		info.groups = groups;

		/*
		 * Test whether all the affected devices are contained
		 * by the set of groups provided by the user.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_validate_devs,
						    &info, slot);
		if (!ret)
			/* User has access, do the reset */
			ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
				     pci_try_reset_bus(vdev->pdev->bus);

hot_reset_release:
		for (i--; i >= 0; i--)
			vfio_group_put_external_user(groups[i].group);

		kfree(groups);
		return ret;
	}

	return -ENOTTY;
}

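/*
 * read/write and mmap share a single file-offset space: the region index
 * lives in the high bits (VFIO_PCI_OFFSET_TO_INDEX) and the offset within
 * the region in the low bits, per the VFIO_PCI_OFFSET_SHIFT encoding in
 * vfio_pci_private.h.
 */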
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;

	if (index >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			return -EINVAL;
		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
	}

	return -EINVAL;
}

static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, buf, count, ppos, false);
}

static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}

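/*
 * A hedged userspace sketch for mapping BAR0, assuming the region reported
 * VFIO_REGION_INFO_FLAG_MMAP in its flags:
 *
 *	struct vfio_region_info reg = {
 *		.argsz = sizeof(reg),
 *		.index = VFIO_PCI_BAR0_REGION_INDEX,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
 *	void *bar0 = mmap(NULL, reg.size, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, device_fd, reg.offset);
 */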
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
		return -EINVAL;

	phys_len = pci_resource_len(pdev, index);
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
		return -EINVAL;

	if (index == vdev->msix_bar) {
		/*
		 * Disallow mmaps overlapping the MSI-X table; users don't
		 * get to touch this directly.  We could find somewhere
		 * else to map the overlap, but page granularity is only
		 * a recommendation, not a requirement, so the user needs
		 * to know which bits are real.  Requiring them to mmap
		 * around the table makes that clear.
		 */

		/* If neither entirely above nor below, then it overlaps */
		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
		      req_start + req_len <= vdev->msix_offset))
			return -EINVAL;
	}

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}

static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
};

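/*
 * vfio-pci carries no static ID table; devices are bound dynamically.  A
 * typical flow from userspace (the address and 1102:0002 vendor:device
 * pair below are only examples):
 *
 *	echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
 *	echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
 */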
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	u8 type;
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
	if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		iommu_group_put(group);
		return -ENOMEM;
	}

	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	atomic_set(&vdev->refcnt, 0);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret) {
		iommu_group_put(group);
		kfree(vdev);
	}

	return ret;
}

static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev;

	vdev = vfio_del_group_dev(&pdev->dev);
	if (!vdev)
		return;

	iommu_group_put(pdev->dev.iommu_group);
	kfree(vdev);
}

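/*
 * AER errors are forwarded to userspace rather than handled here: if the
 * user registered an eventfd via VFIO_DEVICE_SET_IRQS on
 * VFIO_PCI_ERR_IRQ_INDEX, signal it and let the user drive recovery.
 */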
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = vfio_device_data(device);
	if (vdev == NULL) {
		vfio_device_put(device);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}

static struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name		= "vfio-pci",
	.id_table	= NULL, /* only dynamic ids */
	.probe		= vfio_pci_probe,
	.remove		= vfio_pci_remove,
	.err_handler	= &vfio_err_handlers,
};

static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_virqfd_exit();
	vfio_pci_uninit_perm_bits();
}

static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Start the virqfd cleanup handler */
	ret = vfio_pci_virqfd_init();
	if (ret)
		goto out_virqfd;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	return 0;

out_driver:
	vfio_pci_virqfd_exit();
out_virqfd:
	vfio_pci_uninit_perm_bits();
	return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);