xref: /openbmc/linux/drivers/vfio/pci/vfio_pci_core.c (revision ecfb9f40)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
4  *     Author: Alex Williamson <alex.williamson@redhat.com>
5  *
6  * Derived from original vfio:
7  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
8  * Author: Tom Lyon, pugs@cisco.com
9  */
10 
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12 
13 #include <linux/aperture.h>
14 #include <linux/device.h>
15 #include <linux/eventfd.h>
16 #include <linux/file.h>
17 #include <linux/interrupt.h>
18 #include <linux/iommu.h>
19 #include <linux/module.h>
20 #include <linux/mutex.h>
21 #include <linux/notifier.h>
22 #include <linux/pci.h>
23 #include <linux/pm_runtime.h>
24 #include <linux/slab.h>
25 #include <linux/types.h>
26 #include <linux/uaccess.h>
27 #include <linux/vgaarb.h>
28 #include <linux/nospec.h>
29 #include <linux/sched/mm.h>
30 #if IS_ENABLED(CONFIG_EEH)
31 #include <asm/eeh.h>
32 #endif
33 
34 #include "vfio_pci_priv.h"
35 
36 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
37 #define DRIVER_DESC "core driver for VFIO based PCI devices"
38 
39 static bool nointxmask;
40 static bool disable_vga;
41 static bool disable_idle_d3;
42 
43 /* List of PF's that vfio_pci_core_sriov_configure() has been called on */
44 static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
45 static LIST_HEAD(vfio_pci_sriov_pfs);
46 
47 struct vfio_pci_dummy_resource {
48 	struct resource		resource;
49 	int			index;
50 	struct list_head	res_next;
51 };
52 
53 struct vfio_pci_vf_token {
54 	struct mutex		lock;
55 	uuid_t			uuid;
56 	int			users;
57 };
58 
59 struct vfio_pci_mmap_vma {
60 	struct vm_area_struct	*vma;
61 	struct list_head	vma_next;
62 };
63 
64 static inline bool vfio_vga_disabled(void)
65 {
66 #ifdef CONFIG_VFIO_PCI_VGA
67 	return disable_vga;
68 #else
69 	return true;
70 #endif
71 }
72 
73 /*
74  * Our VGA arbiter participation is limited since we don't know anything
75  * about the device itself.  However, if the device is the only VGA device
76  * downstream of a bridge and VFIO VGA support is disabled, then we can
77  * safely return legacy VGA IO and memory as not decoded since the user
78  * has no way to get to it and routing can be disabled externally at the
79  * bridge.
80  */
81 static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
82 {
83 	struct pci_dev *tmp = NULL;
84 	unsigned char max_busnr;
85 	unsigned int decodes;
86 
87 	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
88 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
89 		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
90 
91 	max_busnr = pci_bus_max_busnr(pdev->bus);
92 	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
93 
94 	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
95 		if (tmp == pdev ||
96 		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
97 		    pci_is_root_bus(tmp->bus))
98 			continue;
99 
100 		if (tmp->bus->number >= pdev->bus->number &&
101 		    tmp->bus->number <= max_busnr) {
102 			pci_dev_put(tmp);
103 			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
104 			break;
105 		}
106 	}
107 
108 	return decodes;
109 }
110 
111 static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
112 {
113 	struct resource *res;
114 	int i;
115 	struct vfio_pci_dummy_resource *dummy_res;
116 
117 	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
118 		int bar = i + PCI_STD_RESOURCES;
119 
120 		res = &vdev->pdev->resource[bar];
121 
122 		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
123 			goto no_mmap;
124 
125 		if (!(res->flags & IORESOURCE_MEM))
126 			goto no_mmap;
127 
128 		/*
129 		 * The PCI core shouldn't set up a resource with a
130 		 * type but zero size. But there may be bugs that
131 		 * cause us to do that.
132 		 */
133 		if (!resource_size(res))
134 			goto no_mmap;
135 
136 		if (resource_size(res) >= PAGE_SIZE) {
137 			vdev->bar_mmap_supported[bar] = true;
138 			continue;
139 		}
140 
141 		if (!(res->start & ~PAGE_MASK)) {
142 			/*
143 			 * Add a dummy resource to reserve the remainder
144 			 * of the exclusive page in case that hot-add
145 			 * device's bar is assigned into it.
146 			 */
147 			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
148 			if (dummy_res == NULL)
149 				goto no_mmap;
150 
151 			dummy_res->resource.name = "vfio sub-page reserved";
152 			dummy_res->resource.start = res->end + 1;
153 			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
154 			dummy_res->resource.flags = res->flags;
155 			if (request_resource(res->parent,
156 						&dummy_res->resource)) {
157 				kfree(dummy_res);
158 				goto no_mmap;
159 			}
160 			dummy_res->index = bar;
161 			list_add(&dummy_res->res_next,
162 					&vdev->dummy_resources_list);
163 			vdev->bar_mmap_supported[bar] = true;
164 			continue;
165 		}
166 		/*
167 		 * Here we don't handle the case when the BAR is not page
168 		 * aligned because we can't expect the BAR will be
169 		 * assigned into the same location in a page in guest
170 		 * when we passthrough the BAR. And it's hard to access
171 		 * this BAR in userspace because we have no way to get
172 		 * the BAR's location in a page.
173 		 */
174 no_mmap:
175 		vdev->bar_mmap_supported[bar] = false;
176 	}
177 }
178 
179 struct vfio_pci_group_info;
180 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
181 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
182 				      struct vfio_pci_group_info *groups);
183 
184 /*
185  * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
186  * _and_ the ability detect when the device is asserting INTx via PCI_STATUS.
187  * If a device implements the former but not the latter we would typically
188  * expect broken_intx_masking be set and require an exclusive interrupt.
189  * However since we do have control of the device's ability to assert INTx,
190  * we can instead pretend that the device does not implement INTx, virtualizing
191  * the pin register to report zero and maintaining DisINTx set on the host.
192  */
193 static bool vfio_pci_nointx(struct pci_dev *pdev)
194 {
195 	switch (pdev->vendor) {
196 	case PCI_VENDOR_ID_INTEL:
197 		switch (pdev->device) {
198 		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
199 		case 0x1572:
200 		case 0x1574:
201 		case 0x1580 ... 0x1581:
202 		case 0x1583 ... 0x158b:
203 		case 0x37d0 ... 0x37d2:
204 		/* X550 */
205 		case 0x1563:
206 			return true;
207 		default:
208 			return false;
209 		}
210 	}
211 
212 	return false;
213 }
214 
215 static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
216 {
217 	struct pci_dev *pdev = vdev->pdev;
218 	u16 pmcsr;
219 
220 	if (!pdev->pm_cap)
221 		return;
222 
223 	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
224 
225 	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
226 }
227 
228 /*
229  * pci_set_power_state() wrapper handling devices which perform a soft reset on
230  * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
231  * restore when returned to D0.  Saved separately from pci_saved_state for use
232  * by PM capability emulation and separately from pci_dev internal saved state
233  * to avoid it being overwritten and consumed around other resets.
234  */
235 int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
236 {
237 	struct pci_dev *pdev = vdev->pdev;
238 	bool needs_restore = false, needs_save = false;
239 	int ret;
240 
241 	/* Prevent changing power state for PFs with VFs enabled */
242 	if (pci_num_vf(pdev) && state > PCI_D0)
243 		return -EBUSY;
244 
245 	if (vdev->needs_pm_restore) {
246 		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
247 			pci_save_state(pdev);
248 			needs_save = true;
249 		}
250 
251 		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
252 			needs_restore = true;
253 	}
254 
255 	ret = pci_set_power_state(pdev, state);
256 
257 	if (!ret) {
258 		/* D3 might be unsupported via quirk, skip unless in D3 */
259 		if (needs_save && pdev->current_state >= PCI_D3hot) {
260 			/*
261 			 * The current PCI state will be saved locally in
262 			 * 'pm_save' during the D3hot transition. When the
263 			 * device state is changed to D0 again with the current
264 			 * function, then pci_store_saved_state() will restore
265 			 * the state and will free the memory pointed by
266 			 * 'pm_save'. There are few cases where the PCI power
267 			 * state can be changed to D0 without the involvement
268 			 * of the driver. For these cases, free the earlier
269 			 * allocated memory first before overwriting 'pm_save'
270 			 * to prevent the memory leak.
271 			 */
272 			kfree(vdev->pm_save);
273 			vdev->pm_save = pci_store_saved_state(pdev);
274 		} else if (needs_restore) {
275 			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
276 			pci_restore_state(pdev);
277 		}
278 	}
279 
280 	return ret;
281 }
282 
283 static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
284 				     struct eventfd_ctx *efdctx)
285 {
286 	/*
287 	 * The vdev power related flags are protected with 'memory_lock'
288 	 * semaphore.
289 	 */
290 	vfio_pci_zap_and_down_write_memory_lock(vdev);
291 	if (vdev->pm_runtime_engaged) {
292 		up_write(&vdev->memory_lock);
293 		return -EINVAL;
294 	}
295 
296 	vdev->pm_runtime_engaged = true;
297 	vdev->pm_wake_eventfd_ctx = efdctx;
298 	pm_runtime_put_noidle(&vdev->pdev->dev);
299 	up_write(&vdev->memory_lock);
300 
301 	return 0;
302 }
303 
304 static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
305 				  void __user *arg, size_t argsz)
306 {
307 	struct vfio_pci_core_device *vdev =
308 		container_of(device, struct vfio_pci_core_device, vdev);
309 	int ret;
310 
311 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
312 	if (ret != 1)
313 		return ret;
314 
315 	/*
316 	 * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count
317 	 * will be decremented. The pm_runtime_put() will be invoked again
318 	 * while returning from the ioctl and then the device can go into
319 	 * runtime suspended state.
320 	 */
321 	return vfio_pci_runtime_pm_entry(vdev, NULL);
322 }
323 
324 static int vfio_pci_core_pm_entry_with_wakeup(
325 	struct vfio_device *device, u32 flags,
326 	struct vfio_device_low_power_entry_with_wakeup __user *arg,
327 	size_t argsz)
328 {
329 	struct vfio_pci_core_device *vdev =
330 		container_of(device, struct vfio_pci_core_device, vdev);
331 	struct vfio_device_low_power_entry_with_wakeup entry;
332 	struct eventfd_ctx *efdctx;
333 	int ret;
334 
335 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
336 				 sizeof(entry));
337 	if (ret != 1)
338 		return ret;
339 
340 	if (copy_from_user(&entry, arg, sizeof(entry)))
341 		return -EFAULT;
342 
343 	if (entry.wakeup_eventfd < 0)
344 		return -EINVAL;
345 
346 	efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd);
347 	if (IS_ERR(efdctx))
348 		return PTR_ERR(efdctx);
349 
350 	ret = vfio_pci_runtime_pm_entry(vdev, efdctx);
351 	if (ret)
352 		eventfd_ctx_put(efdctx);
353 
354 	return ret;
355 }
356 
357 static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
358 {
359 	if (vdev->pm_runtime_engaged) {
360 		vdev->pm_runtime_engaged = false;
361 		pm_runtime_get_noresume(&vdev->pdev->dev);
362 
363 		if (vdev->pm_wake_eventfd_ctx) {
364 			eventfd_ctx_put(vdev->pm_wake_eventfd_ctx);
365 			vdev->pm_wake_eventfd_ctx = NULL;
366 		}
367 	}
368 }
369 
370 static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
371 {
372 	/*
373 	 * The vdev power related flags are protected with 'memory_lock'
374 	 * semaphore.
375 	 */
376 	down_write(&vdev->memory_lock);
377 	__vfio_pci_runtime_pm_exit(vdev);
378 	up_write(&vdev->memory_lock);
379 }
380 
381 static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
382 				 void __user *arg, size_t argsz)
383 {
384 	struct vfio_pci_core_device *vdev =
385 		container_of(device, struct vfio_pci_core_device, vdev);
386 	int ret;
387 
388 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
389 	if (ret != 1)
390 		return ret;
391 
392 	/*
393 	 * The device is always in the active state here due to pm wrappers
394 	 * around ioctls. If the device had entered a low power state and
395 	 * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has
396 	 * already signaled the eventfd and exited low power mode itself.
397 	 * pm_runtime_engaged protects the redundant call here.
398 	 */
399 	vfio_pci_runtime_pm_exit(vdev);
400 	return 0;
401 }
402 
403 #ifdef CONFIG_PM
404 static int vfio_pci_core_runtime_suspend(struct device *dev)
405 {
406 	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
407 
408 	down_write(&vdev->memory_lock);
409 	/*
410 	 * The user can move the device into D3hot state before invoking
411 	 * power management IOCTL. Move the device into D0 state here and then
412 	 * the pci-driver core runtime PM suspend function will move the device
413 	 * into the low power state. Also, for the devices which have
414 	 * NoSoftRst-, it will help in restoring the original state
415 	 * (saved locally in 'vdev->pm_save').
416 	 */
417 	vfio_pci_set_power_state(vdev, PCI_D0);
418 	up_write(&vdev->memory_lock);
419 
420 	/*
421 	 * If INTx is enabled, then mask INTx before going into the runtime
422 	 * suspended state and unmask the same in the runtime resume.
423 	 * If INTx has already been masked by the user, then
424 	 * vfio_pci_intx_mask() will return false and in that case, INTx
425 	 * should not be unmasked in the runtime resume.
426 	 */
427 	vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) &&
428 				vfio_pci_intx_mask(vdev));
429 
430 	return 0;
431 }
432 
433 static int vfio_pci_core_runtime_resume(struct device *dev)
434 {
435 	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);
436 
437 	/*
438 	 * Resume with a pm_wake_eventfd_ctx signals the eventfd and exit
439 	 * low power mode.
440 	 */
441 	down_write(&vdev->memory_lock);
442 	if (vdev->pm_wake_eventfd_ctx) {
443 		eventfd_signal(vdev->pm_wake_eventfd_ctx, 1);
444 		__vfio_pci_runtime_pm_exit(vdev);
445 	}
446 	up_write(&vdev->memory_lock);
447 
448 	if (vdev->pm_intx_masked)
449 		vfio_pci_intx_unmask(vdev);
450 
451 	return 0;
452 }
453 #endif /* CONFIG_PM */
454 
455 /*
456  * The pci-driver core runtime PM routines always save the device state
457  * before going into suspended state. If the device is going into low power
458  * state with only with runtime PM ops, then no explicit handling is needed
459  * for the devices which have NoSoftRst-.
460  */
461 static const struct dev_pm_ops vfio_pci_core_pm_ops = {
462 	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
463 			   vfio_pci_core_runtime_resume,
464 			   NULL)
465 };
466 
467 int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
468 {
469 	struct pci_dev *pdev = vdev->pdev;
470 	int ret;
471 	u16 cmd;
472 	u8 msix_pos;
473 
474 	if (!disable_idle_d3) {
475 		ret = pm_runtime_resume_and_get(&pdev->dev);
476 		if (ret < 0)
477 			return ret;
478 	}
479 
480 	/* Don't allow our initial saved state to include busmaster */
481 	pci_clear_master(pdev);
482 
483 	ret = pci_enable_device(pdev);
484 	if (ret)
485 		goto out_power;
486 
487 	/* If reset fails because of the device lock, fail this path entirely */
488 	ret = pci_try_reset_function(pdev);
489 	if (ret == -EAGAIN)
490 		goto out_disable_device;
491 
492 	vdev->reset_works = !ret;
493 	pci_save_state(pdev);
494 	vdev->pci_saved_state = pci_store_saved_state(pdev);
495 	if (!vdev->pci_saved_state)
496 		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
497 
498 	if (likely(!nointxmask)) {
499 		if (vfio_pci_nointx(pdev)) {
500 			pci_info(pdev, "Masking broken INTx support\n");
501 			vdev->nointx = true;
502 			pci_intx(pdev, 0);
503 		} else
504 			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
505 	}
506 
507 	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
508 	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
509 		cmd &= ~PCI_COMMAND_INTX_DISABLE;
510 		pci_write_config_word(pdev, PCI_COMMAND, cmd);
511 	}
512 
513 	ret = vfio_pci_zdev_open_device(vdev);
514 	if (ret)
515 		goto out_free_state;
516 
517 	ret = vfio_config_init(vdev);
518 	if (ret)
519 		goto out_free_zdev;
520 
521 	msix_pos = pdev->msix_cap;
522 	if (msix_pos) {
523 		u16 flags;
524 		u32 table;
525 
526 		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
527 		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
528 
529 		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
530 		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
531 		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
532 	} else
533 		vdev->msix_bar = 0xFF;
534 
535 	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
536 		vdev->has_vga = true;
537 
538 
539 	return 0;
540 
541 out_free_zdev:
542 	vfio_pci_zdev_close_device(vdev);
543 out_free_state:
544 	kfree(vdev->pci_saved_state);
545 	vdev->pci_saved_state = NULL;
546 out_disable_device:
547 	pci_disable_device(pdev);
548 out_power:
549 	if (!disable_idle_d3)
550 		pm_runtime_put(&pdev->dev);
551 	return ret;
552 }
553 EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
554 
555 void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
556 {
557 	struct pci_dev *pdev = vdev->pdev;
558 	struct vfio_pci_dummy_resource *dummy_res, *tmp;
559 	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
560 	int i, bar;
561 
562 	/* For needs_reset */
563 	lockdep_assert_held(&vdev->vdev.dev_set->lock);
564 
565 	/*
566 	 * This function can be invoked while the power state is non-D0.
567 	 * This non-D0 power state can be with or without runtime PM.
568 	 * vfio_pci_runtime_pm_exit() will internally increment the usage
569 	 * count corresponding to pm_runtime_put() called during low power
570 	 * feature entry and then pm_runtime_resume() will wake up the device,
571 	 * if the device has already gone into the suspended state. Otherwise,
572 	 * the vfio_pci_set_power_state() will change the device power state
573 	 * to D0.
574 	 */
575 	vfio_pci_runtime_pm_exit(vdev);
576 	pm_runtime_resume(&pdev->dev);
577 
578 	/*
579 	 * This function calls __pci_reset_function_locked() which internally
580 	 * can use pci_pm_reset() for the function reset. pci_pm_reset() will
581 	 * fail if the power state is non-D0. Also, for the devices which
582 	 * have NoSoftRst-, the reset function can cause the PCI config space
583 	 * reset without restoring the original state (saved locally in
584 	 * 'vdev->pm_save').
585 	 */
586 	vfio_pci_set_power_state(vdev, PCI_D0);
587 
588 	/* Stop the device from further DMA */
589 	pci_clear_master(pdev);
590 
591 	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
592 				VFIO_IRQ_SET_ACTION_TRIGGER,
593 				vdev->irq_type, 0, 0, NULL);
594 
595 	/* Device closed, don't need mutex here */
596 	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
597 				 &vdev->ioeventfds_list, next) {
598 		vfio_virqfd_disable(&ioeventfd->virqfd);
599 		list_del(&ioeventfd->next);
600 		kfree(ioeventfd);
601 	}
602 	vdev->ioeventfds_nr = 0;
603 
604 	vdev->virq_disabled = false;
605 
606 	for (i = 0; i < vdev->num_regions; i++)
607 		vdev->region[i].ops->release(vdev, &vdev->region[i]);
608 
609 	vdev->num_regions = 0;
610 	kfree(vdev->region);
611 	vdev->region = NULL; /* don't krealloc a freed pointer */
612 
613 	vfio_config_free(vdev);
614 
615 	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
616 		bar = i + PCI_STD_RESOURCES;
617 		if (!vdev->barmap[bar])
618 			continue;
619 		pci_iounmap(pdev, vdev->barmap[bar]);
620 		pci_release_selected_regions(pdev, 1 << bar);
621 		vdev->barmap[bar] = NULL;
622 	}
623 
624 	list_for_each_entry_safe(dummy_res, tmp,
625 				 &vdev->dummy_resources_list, res_next) {
626 		list_del(&dummy_res->res_next);
627 		release_resource(&dummy_res->resource);
628 		kfree(dummy_res);
629 	}
630 
631 	vdev->needs_reset = true;
632 
633 	vfio_pci_zdev_close_device(vdev);
634 
635 	/*
636 	 * If we have saved state, restore it.  If we can reset the device,
637 	 * even better.  Resetting with current state seems better than
638 	 * nothing, but saving and restoring current state without reset
639 	 * is just busy work.
640 	 */
641 	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
642 		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
643 
644 		if (!vdev->reset_works)
645 			goto out;
646 
647 		pci_save_state(pdev);
648 	}
649 
650 	/*
651 	 * Disable INTx and MSI, presumably to avoid spurious interrupts
652 	 * during reset.  Stolen from pci_reset_function()
653 	 */
654 	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
655 
656 	/*
657 	 * Try to get the locks ourselves to prevent a deadlock. The
658 	 * success of this is dependent on being able to lock the device,
659 	 * which is not always possible.
660 	 * We can not use the "try" reset interface here, which will
661 	 * overwrite the previously restored configuration information.
662 	 */
663 	if (vdev->reset_works && pci_dev_trylock(pdev)) {
664 		if (!__pci_reset_function_locked(pdev))
665 			vdev->needs_reset = false;
666 		pci_dev_unlock(pdev);
667 	}
668 
669 	pci_restore_state(pdev);
670 out:
671 	pci_disable_device(pdev);
672 
673 	vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);
674 
675 	/* Put the pm-runtime usage counter acquired during enable */
676 	if (!disable_idle_d3)
677 		pm_runtime_put(&pdev->dev);
678 }
679 EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
680 
681 void vfio_pci_core_close_device(struct vfio_device *core_vdev)
682 {
683 	struct vfio_pci_core_device *vdev =
684 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
685 
686 	if (vdev->sriov_pf_core_dev) {
687 		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
688 		WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users);
689 		vdev->sriov_pf_core_dev->vf_token->users--;
690 		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
691 	}
692 #if IS_ENABLED(CONFIG_EEH)
693 	eeh_dev_release(vdev->pdev);
694 #endif
695 	vfio_pci_core_disable(vdev);
696 
697 	mutex_lock(&vdev->igate);
698 	if (vdev->err_trigger) {
699 		eventfd_ctx_put(vdev->err_trigger);
700 		vdev->err_trigger = NULL;
701 	}
702 	if (vdev->req_trigger) {
703 		eventfd_ctx_put(vdev->req_trigger);
704 		vdev->req_trigger = NULL;
705 	}
706 	mutex_unlock(&vdev->igate);
707 }
708 EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
709 
710 void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
711 {
712 	vfio_pci_probe_mmaps(vdev);
713 #if IS_ENABLED(CONFIG_EEH)
714 	eeh_dev_open(vdev->pdev);
715 #endif
716 
717 	if (vdev->sriov_pf_core_dev) {
718 		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
719 		vdev->sriov_pf_core_dev->vf_token->users++;
720 		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
721 	}
722 }
723 EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
724 
725 static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
726 {
727 	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
728 		u8 pin;
729 
730 		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
731 		    vdev->nointx || vdev->pdev->is_virtfn)
732 			return 0;
733 
734 		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
735 
736 		return pin ? 1 : 0;
737 	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
738 		u8 pos;
739 		u16 flags;
740 
741 		pos = vdev->pdev->msi_cap;
742 		if (pos) {
743 			pci_read_config_word(vdev->pdev,
744 					     pos + PCI_MSI_FLAGS, &flags);
745 			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
746 		}
747 	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
748 		u8 pos;
749 		u16 flags;
750 
751 		pos = vdev->pdev->msix_cap;
752 		if (pos) {
753 			pci_read_config_word(vdev->pdev,
754 					     pos + PCI_MSIX_FLAGS, &flags);
755 
756 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
757 		}
758 	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
759 		if (pci_is_pcie(vdev->pdev))
760 			return 1;
761 	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
762 		return 1;
763 	}
764 
765 	return 0;
766 }
767 
768 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
769 {
770 	(*(int *)data)++;
771 	return 0;
772 }
773 
774 struct vfio_pci_fill_info {
775 	int max;
776 	int cur;
777 	struct vfio_pci_dependent_device *devices;
778 };
779 
780 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
781 {
782 	struct vfio_pci_fill_info *fill = data;
783 	struct iommu_group *iommu_group;
784 
785 	if (fill->cur == fill->max)
786 		return -EAGAIN; /* Something changed, try again */
787 
788 	iommu_group = iommu_group_get(&pdev->dev);
789 	if (!iommu_group)
790 		return -EPERM; /* Cannot reset non-isolated devices */
791 
792 	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
793 	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
794 	fill->devices[fill->cur].bus = pdev->bus->number;
795 	fill->devices[fill->cur].devfn = pdev->devfn;
796 	fill->cur++;
797 	iommu_group_put(iommu_group);
798 	return 0;
799 }
800 
801 struct vfio_pci_group_info {
802 	int count;
803 	struct file **files;
804 };
805 
806 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
807 {
808 	for (; pdev; pdev = pdev->bus->self)
809 		if (pdev->bus == slot->bus)
810 			return (pdev->slot == slot);
811 	return false;
812 }
813 
814 struct vfio_pci_walk_info {
815 	int (*fn)(struct pci_dev *pdev, void *data);
816 	void *data;
817 	struct pci_dev *pdev;
818 	bool slot;
819 	int ret;
820 };
821 
822 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
823 {
824 	struct vfio_pci_walk_info *walk = data;
825 
826 	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
827 		walk->ret = walk->fn(pdev, walk->data);
828 
829 	return walk->ret;
830 }
831 
832 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
833 					 int (*fn)(struct pci_dev *,
834 						   void *data), void *data,
835 					 bool slot)
836 {
837 	struct vfio_pci_walk_info walk = {
838 		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
839 	};
840 
841 	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
842 
843 	return walk.ret;
844 }
845 
846 static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
847 			      struct vfio_info_cap *caps)
848 {
849 	struct vfio_info_cap_header header = {
850 		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
851 		.version = 1
852 	};
853 
854 	return vfio_info_add_capability(caps, &header, sizeof(header));
855 }
856 
857 int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
858 				      unsigned int type, unsigned int subtype,
859 				      const struct vfio_pci_regops *ops,
860 				      size_t size, u32 flags, void *data)
861 {
862 	struct vfio_pci_region *region;
863 
864 	region = krealloc(vdev->region,
865 			  (vdev->num_regions + 1) * sizeof(*region),
866 			  GFP_KERNEL);
867 	if (!region)
868 		return -ENOMEM;
869 
870 	vdev->region = region;
871 	vdev->region[vdev->num_regions].type = type;
872 	vdev->region[vdev->num_regions].subtype = subtype;
873 	vdev->region[vdev->num_regions].ops = ops;
874 	vdev->region[vdev->num_regions].size = size;
875 	vdev->region[vdev->num_regions].flags = flags;
876 	vdev->region[vdev->num_regions].data = data;
877 
878 	vdev->num_regions++;
879 
880 	return 0;
881 }
882 EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region);
883 
884 static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
885 				   struct vfio_device_info __user *arg)
886 {
887 	unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
888 	struct vfio_device_info info;
889 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
890 	unsigned long capsz;
891 	int ret;
892 
893 	/* For backward compatibility, cannot require this */
894 	capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
895 
896 	if (copy_from_user(&info, arg, minsz))
897 		return -EFAULT;
898 
899 	if (info.argsz < minsz)
900 		return -EINVAL;
901 
902 	if (info.argsz >= capsz) {
903 		minsz = capsz;
904 		info.cap_offset = 0;
905 	}
906 
907 	info.flags = VFIO_DEVICE_FLAGS_PCI;
908 
909 	if (vdev->reset_works)
910 		info.flags |= VFIO_DEVICE_FLAGS_RESET;
911 
912 	info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
913 	info.num_irqs = VFIO_PCI_NUM_IRQS;
914 
915 	ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
916 	if (ret && ret != -ENODEV) {
917 		pci_warn(vdev->pdev,
918 			 "Failed to setup zPCI info capabilities\n");
919 		return ret;
920 	}
921 
922 	if (caps.size) {
923 		info.flags |= VFIO_DEVICE_FLAGS_CAPS;
924 		if (info.argsz < sizeof(info) + caps.size) {
925 			info.argsz = sizeof(info) + caps.size;
926 		} else {
927 			vfio_info_cap_shift(&caps, sizeof(info));
928 			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
929 				kfree(caps.buf);
930 				return -EFAULT;
931 			}
932 			info.cap_offset = sizeof(*arg);
933 		}
934 
935 		kfree(caps.buf);
936 	}
937 
938 	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
939 }
940 
941 static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
942 					  struct vfio_region_info __user *arg)
943 {
944 	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
945 	struct pci_dev *pdev = vdev->pdev;
946 	struct vfio_region_info info;
947 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
948 	int i, ret;
949 
950 	if (copy_from_user(&info, arg, minsz))
951 		return -EFAULT;
952 
953 	if (info.argsz < minsz)
954 		return -EINVAL;
955 
956 	switch (info.index) {
957 	case VFIO_PCI_CONFIG_REGION_INDEX:
958 		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
959 		info.size = pdev->cfg_size;
960 		info.flags = VFIO_REGION_INFO_FLAG_READ |
961 			     VFIO_REGION_INFO_FLAG_WRITE;
962 		break;
963 	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
964 		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
965 		info.size = pci_resource_len(pdev, info.index);
966 		if (!info.size) {
967 			info.flags = 0;
968 			break;
969 		}
970 
971 		info.flags = VFIO_REGION_INFO_FLAG_READ |
972 			     VFIO_REGION_INFO_FLAG_WRITE;
973 		if (vdev->bar_mmap_supported[info.index]) {
974 			info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
975 			if (info.index == vdev->msix_bar) {
976 				ret = msix_mmappable_cap(vdev, &caps);
977 				if (ret)
978 					return ret;
979 			}
980 		}
981 
982 		break;
983 	case VFIO_PCI_ROM_REGION_INDEX: {
984 		void __iomem *io;
985 		size_t size;
986 		u16 cmd;
987 
988 		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
989 		info.flags = 0;
990 
991 		/* Report the BAR size, not the ROM size */
992 		info.size = pci_resource_len(pdev, info.index);
993 		if (!info.size) {
994 			/* Shadow ROMs appear as PCI option ROMs */
995 			if (pdev->resource[PCI_ROM_RESOURCE].flags &
996 			    IORESOURCE_ROM_SHADOW)
997 				info.size = 0x20000;
998 			else
999 				break;
1000 		}
1001 
1002 		/*
1003 		 * Is it really there?  Enable memory decode for implicit access
1004 		 * in pci_map_rom().
1005 		 */
1006 		cmd = vfio_pci_memory_lock_and_enable(vdev);
1007 		io = pci_map_rom(pdev, &size);
1008 		if (io) {
1009 			info.flags = VFIO_REGION_INFO_FLAG_READ;
1010 			pci_unmap_rom(pdev, io);
1011 		} else {
1012 			info.size = 0;
1013 		}
1014 		vfio_pci_memory_unlock_and_restore(vdev, cmd);
1015 
1016 		break;
1017 	}
1018 	case VFIO_PCI_VGA_REGION_INDEX:
1019 		if (!vdev->has_vga)
1020 			return -EINVAL;
1021 
1022 		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1023 		info.size = 0xc0000;
1024 		info.flags = VFIO_REGION_INFO_FLAG_READ |
1025 			     VFIO_REGION_INFO_FLAG_WRITE;
1026 
1027 		break;
1028 	default: {
1029 		struct vfio_region_info_cap_type cap_type = {
1030 			.header.id = VFIO_REGION_INFO_CAP_TYPE,
1031 			.header.version = 1
1032 		};
1033 
1034 		if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1035 			return -EINVAL;
1036 		info.index = array_index_nospec(
1037 			info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
1038 
1039 		i = info.index - VFIO_PCI_NUM_REGIONS;
1040 
1041 		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
1042 		info.size = vdev->region[i].size;
1043 		info.flags = vdev->region[i].flags;
1044 
1045 		cap_type.type = vdev->region[i].type;
1046 		cap_type.subtype = vdev->region[i].subtype;
1047 
1048 		ret = vfio_info_add_capability(&caps, &cap_type.header,
1049 					       sizeof(cap_type));
1050 		if (ret)
1051 			return ret;
1052 
1053 		if (vdev->region[i].ops->add_capability) {
1054 			ret = vdev->region[i].ops->add_capability(
1055 				vdev, &vdev->region[i], &caps);
1056 			if (ret)
1057 				return ret;
1058 		}
1059 	}
1060 	}
1061 
1062 	if (caps.size) {
1063 		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
1064 		if (info.argsz < sizeof(info) + caps.size) {
1065 			info.argsz = sizeof(info) + caps.size;
1066 			info.cap_offset = 0;
1067 		} else {
1068 			vfio_info_cap_shift(&caps, sizeof(info));
1069 			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
1070 				kfree(caps.buf);
1071 				return -EFAULT;
1072 			}
1073 			info.cap_offset = sizeof(*arg);
1074 		}
1075 
1076 		kfree(caps.buf);
1077 	}
1078 
1079 	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
1080 }
1081 
1082 static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
1083 				       struct vfio_irq_info __user *arg)
1084 {
1085 	unsigned long minsz = offsetofend(struct vfio_irq_info, count);
1086 	struct vfio_irq_info info;
1087 
1088 	if (copy_from_user(&info, arg, minsz))
1089 		return -EFAULT;
1090 
1091 	if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
1092 		return -EINVAL;
1093 
1094 	switch (info.index) {
1095 	case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
1096 	case VFIO_PCI_REQ_IRQ_INDEX:
1097 		break;
1098 	case VFIO_PCI_ERR_IRQ_INDEX:
1099 		if (pci_is_pcie(vdev->pdev))
1100 			break;
1101 		fallthrough;
1102 	default:
1103 		return -EINVAL;
1104 	}
1105 
1106 	info.flags = VFIO_IRQ_INFO_EVENTFD;
1107 
1108 	info.count = vfio_pci_get_irq_count(vdev, info.index);
1109 
1110 	if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
1111 		info.flags |=
1112 			(VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED);
1113 	else
1114 		info.flags |= VFIO_IRQ_INFO_NORESIZE;
1115 
1116 	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
1117 }
1118 
1119 static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
1120 				   struct vfio_irq_set __user *arg)
1121 {
1122 	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
1123 	struct vfio_irq_set hdr;
1124 	u8 *data = NULL;
1125 	int max, ret = 0;
1126 	size_t data_size = 0;
1127 
1128 	if (copy_from_user(&hdr, arg, minsz))
1129 		return -EFAULT;
1130 
1131 	max = vfio_pci_get_irq_count(vdev, hdr.index);
1132 
1133 	ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS,
1134 						 &data_size);
1135 	if (ret)
1136 		return ret;
1137 
1138 	if (data_size) {
1139 		data = memdup_user(&arg->data, data_size);
1140 		if (IS_ERR(data))
1141 			return PTR_ERR(data);
1142 	}
1143 
1144 	mutex_lock(&vdev->igate);
1145 
1146 	ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start,
1147 				      hdr.count, data);
1148 
1149 	mutex_unlock(&vdev->igate);
1150 	kfree(data);
1151 
1152 	return ret;
1153 }
1154 
1155 static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
1156 				void __user *arg)
1157 {
1158 	int ret;
1159 
1160 	if (!vdev->reset_works)
1161 		return -EINVAL;
1162 
1163 	vfio_pci_zap_and_down_write_memory_lock(vdev);
1164 
1165 	/*
1166 	 * This function can be invoked while the power state is non-D0. If
1167 	 * pci_try_reset_function() has been called while the power state is
1168 	 * non-D0, then pci_try_reset_function() will internally set the power
1169 	 * state to D0 without vfio driver involvement. For the devices which
1170 	 * have NoSoftRst-, the reset function can cause the PCI config space
1171 	 * reset without restoring the original state (saved locally in
1172 	 * 'vdev->pm_save').
1173 	 */
1174 	vfio_pci_set_power_state(vdev, PCI_D0);
1175 
1176 	ret = pci_try_reset_function(vdev->pdev);
1177 	up_write(&vdev->memory_lock);
1178 
1179 	return ret;
1180 }
1181 
1182 static int vfio_pci_ioctl_get_pci_hot_reset_info(
1183 	struct vfio_pci_core_device *vdev,
1184 	struct vfio_pci_hot_reset_info __user *arg)
1185 {
1186 	unsigned long minsz =
1187 		offsetofend(struct vfio_pci_hot_reset_info, count);
1188 	struct vfio_pci_hot_reset_info hdr;
1189 	struct vfio_pci_fill_info fill = { 0 };
1190 	struct vfio_pci_dependent_device *devices = NULL;
1191 	bool slot = false;
1192 	int ret = 0;
1193 
1194 	if (copy_from_user(&hdr, arg, minsz))
1195 		return -EFAULT;
1196 
1197 	if (hdr.argsz < minsz)
1198 		return -EINVAL;
1199 
1200 	hdr.flags = 0;
1201 
1202 	/* Can we do a slot or bus reset or neither? */
1203 	if (!pci_probe_reset_slot(vdev->pdev->slot))
1204 		slot = true;
1205 	else if (pci_probe_reset_bus(vdev->pdev->bus))
1206 		return -ENODEV;
1207 
1208 	/* How many devices are affected? */
1209 	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
1210 					    &fill.max, slot);
1211 	if (ret)
1212 		return ret;
1213 
1214 	WARN_ON(!fill.max); /* Should always be at least one */
1215 
1216 	/*
1217 	 * If there's enough space, fill it now, otherwise return -ENOSPC and
1218 	 * the number of devices affected.
1219 	 */
1220 	if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
1221 		ret = -ENOSPC;
1222 		hdr.count = fill.max;
1223 		goto reset_info_exit;
1224 	}
1225 
1226 	devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
1227 	if (!devices)
1228 		return -ENOMEM;
1229 
1230 	fill.devices = devices;
1231 
1232 	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
1233 					    &fill, slot);
1234 
1235 	/*
1236 	 * If a device was removed between counting and filling, we may come up
1237 	 * short of fill.max.  If a device was added, we'll have a return of
1238 	 * -EAGAIN above.
1239 	 */
1240 	if (!ret)
1241 		hdr.count = fill.cur;
1242 
1243 reset_info_exit:
1244 	if (copy_to_user(arg, &hdr, minsz))
1245 		ret = -EFAULT;
1246 
1247 	if (!ret) {
1248 		if (copy_to_user(&arg->devices, devices,
1249 				 hdr.count * sizeof(*devices)))
1250 			ret = -EFAULT;
1251 	}
1252 
1253 	kfree(devices);
1254 	return ret;
1255 }
1256 
1257 static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
1258 					struct vfio_pci_hot_reset __user *arg)
1259 {
1260 	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
1261 	struct vfio_pci_hot_reset hdr;
1262 	int32_t *group_fds;
1263 	struct file **files;
1264 	struct vfio_pci_group_info info;
1265 	bool slot = false;
1266 	int file_idx, count = 0, ret = 0;
1267 
1268 	if (copy_from_user(&hdr, arg, minsz))
1269 		return -EFAULT;
1270 
1271 	if (hdr.argsz < minsz || hdr.flags)
1272 		return -EINVAL;
1273 
1274 	/* Can we do a slot or bus reset or neither? */
1275 	if (!pci_probe_reset_slot(vdev->pdev->slot))
1276 		slot = true;
1277 	else if (pci_probe_reset_bus(vdev->pdev->bus))
1278 		return -ENODEV;
1279 
1280 	/*
1281 	 * We can't let userspace give us an arbitrarily large buffer to copy,
1282 	 * so verify how many we think there could be.  Note groups can have
1283 	 * multiple devices so one group per device is the max.
1284 	 */
1285 	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
1286 					    &count, slot);
1287 	if (ret)
1288 		return ret;
1289 
1290 	/* Somewhere between 1 and count is OK */
1291 	if (!hdr.count || hdr.count > count)
1292 		return -EINVAL;
1293 
1294 	group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
1295 	files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL);
1296 	if (!group_fds || !files) {
1297 		kfree(group_fds);
1298 		kfree(files);
1299 		return -ENOMEM;
1300 	}
1301 
1302 	if (copy_from_user(group_fds, arg->group_fds,
1303 			   hdr.count * sizeof(*group_fds))) {
1304 		kfree(group_fds);
1305 		kfree(files);
1306 		return -EFAULT;
1307 	}
1308 
1309 	/*
1310 	 * For each group_fd, get the group through the vfio external user
1311 	 * interface and store the group and iommu ID.  This ensures the group
1312 	 * is held across the reset.
1313 	 */
1314 	for (file_idx = 0; file_idx < hdr.count; file_idx++) {
1315 		struct file *file = fget(group_fds[file_idx]);
1316 
1317 		if (!file) {
1318 			ret = -EBADF;
1319 			break;
1320 		}
1321 
1322 		/* Ensure the FD is a vfio group FD.*/
1323 		if (!vfio_file_is_group(file)) {
1324 			fput(file);
1325 			ret = -EINVAL;
1326 			break;
1327 		}
1328 
1329 		files[file_idx] = file;
1330 	}
1331 
1332 	kfree(group_fds);
1333 
1334 	/* release reference to groups on error */
1335 	if (ret)
1336 		goto hot_reset_release;
1337 
1338 	info.count = hdr.count;
1339 	info.files = files;
1340 
1341 	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
1342 
1343 hot_reset_release:
1344 	for (file_idx--; file_idx >= 0; file_idx--)
1345 		fput(files[file_idx]);
1346 
1347 	kfree(files);
1348 	return ret;
1349 }
1350 
1351 static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
1352 				    struct vfio_device_ioeventfd __user *arg)
1353 {
1354 	unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd);
1355 	struct vfio_device_ioeventfd ioeventfd;
1356 	int count;
1357 
1358 	if (copy_from_user(&ioeventfd, arg, minsz))
1359 		return -EFAULT;
1360 
1361 	if (ioeventfd.argsz < minsz)
1362 		return -EINVAL;
1363 
1364 	if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
1365 		return -EINVAL;
1366 
1367 	count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
1368 
1369 	if (hweight8(count) != 1 || ioeventfd.fd < -1)
1370 		return -EINVAL;
1371 
1372 	return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count,
1373 				  ioeventfd.fd);
1374 }
1375 
1376 long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
1377 			 unsigned long arg)
1378 {
1379 	struct vfio_pci_core_device *vdev =
1380 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1381 	void __user *uarg = (void __user *)arg;
1382 
1383 	switch (cmd) {
1384 	case VFIO_DEVICE_GET_INFO:
1385 		return vfio_pci_ioctl_get_info(vdev, uarg);
1386 	case VFIO_DEVICE_GET_IRQ_INFO:
1387 		return vfio_pci_ioctl_get_irq_info(vdev, uarg);
1388 	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
1389 		return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
1390 	case VFIO_DEVICE_GET_REGION_INFO:
1391 		return vfio_pci_ioctl_get_region_info(vdev, uarg);
1392 	case VFIO_DEVICE_IOEVENTFD:
1393 		return vfio_pci_ioctl_ioeventfd(vdev, uarg);
1394 	case VFIO_DEVICE_PCI_HOT_RESET:
1395 		return vfio_pci_ioctl_pci_hot_reset(vdev, uarg);
1396 	case VFIO_DEVICE_RESET:
1397 		return vfio_pci_ioctl_reset(vdev, uarg);
1398 	case VFIO_DEVICE_SET_IRQS:
1399 		return vfio_pci_ioctl_set_irqs(vdev, uarg);
1400 	default:
1401 		return -ENOTTY;
1402 	}
1403 }
1404 EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
1405 
1406 static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
1407 				       uuid_t __user *arg, size_t argsz)
1408 {
1409 	struct vfio_pci_core_device *vdev =
1410 		container_of(device, struct vfio_pci_core_device, vdev);
1411 	uuid_t uuid;
1412 	int ret;
1413 
1414 	if (!vdev->vf_token)
1415 		return -ENOTTY;
1416 	/*
1417 	 * We do not support GET of the VF Token UUID as this could
1418 	 * expose the token of the previous device user.
1419 	 */
1420 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
1421 				 sizeof(uuid));
1422 	if (ret != 1)
1423 		return ret;
1424 
1425 	if (copy_from_user(&uuid, arg, sizeof(uuid)))
1426 		return -EFAULT;
1427 
1428 	mutex_lock(&vdev->vf_token->lock);
1429 	uuid_copy(&vdev->vf_token->uuid, &uuid);
1430 	mutex_unlock(&vdev->vf_token->lock);
1431 	return 0;
1432 }
1433 
1434 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
1435 				void __user *arg, size_t argsz)
1436 {
1437 	switch (flags & VFIO_DEVICE_FEATURE_MASK) {
1438 	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
1439 		return vfio_pci_core_pm_entry(device, flags, arg, argsz);
1440 	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP:
1441 		return vfio_pci_core_pm_entry_with_wakeup(device, flags,
1442 							  arg, argsz);
1443 	case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
1444 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
1445 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
1446 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
1447 	default:
1448 		return -ENOTTY;
1449 	}
1450 }
1451 EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature);
1452 
1453 static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
1454 			   size_t count, loff_t *ppos, bool iswrite)
1455 {
1456 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1457 	int ret;
1458 
1459 	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1460 		return -EINVAL;
1461 
1462 	ret = pm_runtime_resume_and_get(&vdev->pdev->dev);
1463 	if (ret) {
1464 		pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n",
1465 				     ret);
1466 		return -EIO;
1467 	}
1468 
1469 	switch (index) {
1470 	case VFIO_PCI_CONFIG_REGION_INDEX:
1471 		ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
1472 		break;
1473 
1474 	case VFIO_PCI_ROM_REGION_INDEX:
1475 		if (iswrite)
1476 			ret = -EINVAL;
1477 		else
1478 			ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false);
1479 		break;
1480 
1481 	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1482 		ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
1483 		break;
1484 
1485 	case VFIO_PCI_VGA_REGION_INDEX:
1486 		ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
1487 		break;
1488 
1489 	default:
1490 		index -= VFIO_PCI_NUM_REGIONS;
1491 		ret = vdev->region[index].ops->rw(vdev, buf,
1492 						   count, ppos, iswrite);
1493 		break;
1494 	}
1495 
1496 	pm_runtime_put(&vdev->pdev->dev);
1497 	return ret;
1498 }
1499 
1500 ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
1501 		size_t count, loff_t *ppos)
1502 {
1503 	struct vfio_pci_core_device *vdev =
1504 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1505 
1506 	if (!count)
1507 		return 0;
1508 
1509 	return vfio_pci_rw(vdev, buf, count, ppos, false);
1510 }
1511 EXPORT_SYMBOL_GPL(vfio_pci_core_read);
1512 
1513 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
1514 		size_t count, loff_t *ppos)
1515 {
1516 	struct vfio_pci_core_device *vdev =
1517 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1518 
1519 	if (!count)
1520 		return 0;
1521 
1522 	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
1523 }
1524 EXPORT_SYMBOL_GPL(vfio_pci_core_write);
1525 
1526 /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
1527 static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
1528 {
1529 	struct vfio_pci_mmap_vma *mmap_vma, *tmp;
1530 
1531 	/*
1532 	 * Lock ordering:
1533 	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
1534 	 * The memory_lock semaphore is used by both code paths calling
1535 	 * into this function to zap vmas and the vm_ops.fault callback
1536 	 * to protect the memory enable state of the device.
1537 	 *
1538 	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
1539 	 * ordering, which requires using vma_lock to walk vma_list to
1540 	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
1541 	 * reacquiring vma_lock.  This logic is derived from similar
1542 	 * requirements in uverbs_user_mmap_disassociate().
1543 	 *
1544 	 * mmap_lock must always be the top-level lock when it is taken.
1545 	 * Therefore we can only hold the memory_lock write lock when
1546 	 * vma_list is empty, as we'd need to take mmap_lock to clear
1547 	 * entries.  vma_list can only be guaranteed empty when holding
1548 	 * vma_lock, thus memory_lock is nested under vma_lock.
1549 	 *
1550 	 * This enables the vm_ops.fault callback to acquire vma_lock,
1551 	 * followed by memory_lock read lock, while already holding
1552 	 * mmap_lock without risk of deadlock.
1553 	 */
1554 	while (1) {
1555 		struct mm_struct *mm = NULL;
1556 
1557 		if (try) {
1558 			if (!mutex_trylock(&vdev->vma_lock))
1559 				return 0;
1560 		} else {
1561 			mutex_lock(&vdev->vma_lock);
1562 		}
1563 		while (!list_empty(&vdev->vma_list)) {
1564 			mmap_vma = list_first_entry(&vdev->vma_list,
1565 						    struct vfio_pci_mmap_vma,
1566 						    vma_next);
1567 			mm = mmap_vma->vma->vm_mm;
1568 			if (mmget_not_zero(mm))
1569 				break;
1570 
1571 			list_del(&mmap_vma->vma_next);
1572 			kfree(mmap_vma);
1573 			mm = NULL;
1574 		}
1575 		if (!mm)
1576 			return 1;
1577 		mutex_unlock(&vdev->vma_lock);
1578 
1579 		if (try) {
1580 			if (!mmap_read_trylock(mm)) {
1581 				mmput(mm);
1582 				return 0;
1583 			}
1584 		} else {
1585 			mmap_read_lock(mm);
1586 		}
1587 		if (try) {
1588 			if (!mutex_trylock(&vdev->vma_lock)) {
1589 				mmap_read_unlock(mm);
1590 				mmput(mm);
1591 				return 0;
1592 			}
1593 		} else {
1594 			mutex_lock(&vdev->vma_lock);
1595 		}
1596 		list_for_each_entry_safe(mmap_vma, tmp,
1597 					 &vdev->vma_list, vma_next) {
1598 			struct vm_area_struct *vma = mmap_vma->vma;
1599 
1600 			if (vma->vm_mm != mm)
1601 				continue;
1602 
1603 			list_del(&mmap_vma->vma_next);
1604 			kfree(mmap_vma);
1605 
1606 			zap_vma_ptes(vma, vma->vm_start,
1607 				     vma->vm_end - vma->vm_start);
1608 		}
1609 		mutex_unlock(&vdev->vma_lock);
1610 		mmap_read_unlock(mm);
1611 		mmput(mm);
1612 	}
1613 }
1614 
1615 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
1616 {
1617 	vfio_pci_zap_and_vma_lock(vdev, false);
1618 	down_write(&vdev->memory_lock);
1619 	mutex_unlock(&vdev->vma_lock);
1620 }
1621 
1622 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
1623 {
1624 	u16 cmd;
1625 
1626 	down_write(&vdev->memory_lock);
1627 	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
1628 	if (!(cmd & PCI_COMMAND_MEMORY))
1629 		pci_write_config_word(vdev->pdev, PCI_COMMAND,
1630 				      cmd | PCI_COMMAND_MEMORY);
1631 
1632 	return cmd;
1633 }
1634 
1635 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
1636 {
1637 	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
1638 	up_write(&vdev->memory_lock);
1639 }
1640 
1641 /* Caller holds vma_lock */
1642 static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
1643 			      struct vm_area_struct *vma)
1644 {
1645 	struct vfio_pci_mmap_vma *mmap_vma;
1646 
1647 	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
1648 	if (!mmap_vma)
1649 		return -ENOMEM;
1650 
1651 	mmap_vma->vma = vma;
1652 	list_add(&mmap_vma->vma_next, &vdev->vma_list);
1653 
1654 	return 0;
1655 }
1656 
1657 /*
1658  * Zap mmaps on open so that we can fault them in on access and therefore
1659  * our vma_list only tracks mappings accessed since last zap.
1660  */
1661 static void vfio_pci_mmap_open(struct vm_area_struct *vma)
1662 {
1663 	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1664 }
1665 
1666 static void vfio_pci_mmap_close(struct vm_area_struct *vma)
1667 {
1668 	struct vfio_pci_core_device *vdev = vma->vm_private_data;
1669 	struct vfio_pci_mmap_vma *mmap_vma;
1670 
1671 	mutex_lock(&vdev->vma_lock);
1672 	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1673 		if (mmap_vma->vma == vma) {
1674 			list_del(&mmap_vma->vma_next);
1675 			kfree(mmap_vma);
1676 			break;
1677 		}
1678 	}
1679 	mutex_unlock(&vdev->vma_lock);
1680 }
1681 
1682 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
1683 {
1684 	struct vm_area_struct *vma = vmf->vma;
1685 	struct vfio_pci_core_device *vdev = vma->vm_private_data;
1686 	struct vfio_pci_mmap_vma *mmap_vma;
1687 	vm_fault_t ret = VM_FAULT_NOPAGE;
1688 
1689 	mutex_lock(&vdev->vma_lock);
1690 	down_read(&vdev->memory_lock);
1691 
1692 	/*
1693 	 * Memory region cannot be accessed if the low power feature is engaged
1694 	 * or memory access is disabled.
1695 	 */
1696 	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
1697 		ret = VM_FAULT_SIGBUS;
1698 		goto up_out;
1699 	}
1700 
1701 	/*
1702 	 * We populate the whole vma on fault, so we need to test whether
1703 	 * the vma has already been mapped, such as for concurrent faults
1704 	 * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
1705 	 * we ask it to fill the same range again.
1706 	 */
1707 	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1708 		if (mmap_vma->vma == vma)
1709 			goto up_out;
1710 	}
1711 
1712 	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
1713 			       vma->vm_end - vma->vm_start,
1714 			       vma->vm_page_prot)) {
1715 		ret = VM_FAULT_SIGBUS;
1716 		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1717 		goto up_out;
1718 	}
1719 
1720 	if (__vfio_pci_add_vma(vdev, vma)) {
1721 		ret = VM_FAULT_OOM;
1722 		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1723 	}
1724 
1725 up_out:
1726 	up_read(&vdev->memory_lock);
1727 	mutex_unlock(&vdev->vma_lock);
1728 	return ret;
1729 }
1730 
1731 static const struct vm_operations_struct vfio_pci_mmap_ops = {
1732 	.open = vfio_pci_mmap_open,
1733 	.close = vfio_pci_mmap_close,
1734 	.fault = vfio_pci_mmap_fault,
1735 };
1736 
1737 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
1738 {
1739 	struct vfio_pci_core_device *vdev =
1740 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1741 	struct pci_dev *pdev = vdev->pdev;
1742 	unsigned int index;
1743 	u64 phys_len, req_len, pgoff, req_start;
1744 	int ret;
1745 
1746 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1747 
1748 	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1749 		return -EINVAL;
1750 	if (vma->vm_end < vma->vm_start)
1751 		return -EINVAL;
1752 	if ((vma->vm_flags & VM_SHARED) == 0)
1753 		return -EINVAL;
1754 	if (index >= VFIO_PCI_NUM_REGIONS) {
1755 		int regnum = index - VFIO_PCI_NUM_REGIONS;
1756 		struct vfio_pci_region *region = vdev->region + regnum;
1757 
1758 		if (region->ops && region->ops->mmap &&
1759 		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
1760 			return region->ops->mmap(vdev, region, vma);
1761 		return -EINVAL;
1762 	}
1763 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1764 		return -EINVAL;
1765 	if (!vdev->bar_mmap_supported[index])
1766 		return -EINVAL;
1767 
1768 	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
1769 	req_len = vma->vm_end - vma->vm_start;
1770 	pgoff = vma->vm_pgoff &
1771 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1772 	req_start = pgoff << PAGE_SHIFT;
1773 
1774 	if (req_start + req_len > phys_len)
1775 		return -EINVAL;
1776 
1777 	/*
1778 	 * Even though we don't make use of the barmap for the mmap,
1779 	 * we need to request the region and the barmap tracks that.
1780 	 */
1781 	if (!vdev->barmap[index]) {
1782 		ret = pci_request_selected_regions(pdev,
1783 						   1 << index, "vfio-pci");
1784 		if (ret)
1785 			return ret;
1786 
1787 		vdev->barmap[index] = pci_iomap(pdev, index, 0);
1788 		if (!vdev->barmap[index]) {
1789 			pci_release_selected_regions(pdev, 1 << index);
1790 			return -ENOMEM;
1791 		}
1792 	}
1793 
1794 	vma->vm_private_data = vdev;
1795 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1796 	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
1797 
1798 	/*
1799 	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
1800 	 * change vm_flags within the fault handler.  Set them now.
1801 	 */
1802 	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1803 	vma->vm_ops = &vfio_pci_mmap_ops;
1804 
1805 	return 0;
1806 }
1807 EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
1808 
1809 void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
1810 {
1811 	struct vfio_pci_core_device *vdev =
1812 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1813 	struct pci_dev *pdev = vdev->pdev;
1814 
1815 	mutex_lock(&vdev->igate);
1816 
1817 	if (vdev->req_trigger) {
1818 		if (!(count % 10))
1819 			pci_notice_ratelimited(pdev,
1820 				"Relaying device request to user (#%u)\n",
1821 				count);
1822 		eventfd_signal(vdev->req_trigger, 1);
1823 	} else if (count == 0) {
1824 		pci_warn(pdev,
1825 			"No device request channel registered, blocked until released by user\n");
1826 	}
1827 
1828 	mutex_unlock(&vdev->igate);
1829 }
1830 EXPORT_SYMBOL_GPL(vfio_pci_core_request);
1831 
1832 static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
1833 				      bool vf_token, uuid_t *uuid)
1834 {
1835 	/*
1836 	 * There's always some degree of trust or collaboration between SR-IOV
1837 	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
1838 	 * can disrupt VFs with a reset, but often the PF has more explicit
1839 	 * access to deny service to the VF or access data passed through the
1840 	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
1841 	 * represent this trust.  This both prevents that a VF driver might
1842 	 * assume the PF driver is a trusted, in-kernel driver, and also that
1843 	 * a PF driver might be replaced with a rogue driver, unknown to in-use
1844 	 * VF drivers.
1845 	 *
1846 	 * Therefore when presented with a VF, if the PF is a vfio device and
1847 	 * it is bound to the vfio-pci driver, the user needs to provide a VF
1848 	 * token to access the device, in the form of appending a vf_token to
1849 	 * the device name, for example:
1850 	 *
1851 	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
1852 	 *
1853 	 * When presented with a PF which has VFs in use, the user must also
1854 	 * provide the current VF token to prove collaboration with existing
1855 	 * VF users.  If VFs are not in use, the VF token provided for the PF
1856 	 * device will act to set the VF token.
1857 	 *
1858 	 * If the VF token is provided but unused, an error is generated.
1859 	 */
1860 	if (vdev->pdev->is_virtfn) {
1861 		struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev;
1862 		bool match;
1863 
1864 		if (!pf_vdev) {
1865 			if (!vf_token)
1866 				return 0; /* PF is not vfio-pci, no VF token */
1867 
1868 			pci_info_ratelimited(vdev->pdev,
1869 				"VF token incorrectly provided, PF not bound to vfio-pci\n");
1870 			return -EINVAL;
1871 		}
1872 
1873 		if (!vf_token) {
1874 			pci_info_ratelimited(vdev->pdev,
1875 				"VF token required to access device\n");
1876 			return -EACCES;
1877 		}
1878 
1879 		mutex_lock(&pf_vdev->vf_token->lock);
1880 		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
1881 		mutex_unlock(&pf_vdev->vf_token->lock);
1882 
1883 		if (!match) {
1884 			pci_info_ratelimited(vdev->pdev,
1885 				"Incorrect VF token provided for device\n");
1886 			return -EACCES;
1887 		}
1888 	} else if (vdev->vf_token) {
1889 		mutex_lock(&vdev->vf_token->lock);
1890 		if (vdev->vf_token->users) {
1891 			if (!vf_token) {
1892 				mutex_unlock(&vdev->vf_token->lock);
1893 				pci_info_ratelimited(vdev->pdev,
1894 					"VF token required to access device\n");
1895 				return -EACCES;
1896 			}
1897 
1898 			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
1899 				mutex_unlock(&vdev->vf_token->lock);
1900 				pci_info_ratelimited(vdev->pdev,
1901 					"Incorrect VF token provided for device\n");
1902 				return -EACCES;
1903 			}
1904 		} else if (vf_token) {
1905 			uuid_copy(&vdev->vf_token->uuid, uuid);
1906 		}
1907 
1908 		mutex_unlock(&vdev->vf_token->lock);
1909 	} else if (vf_token) {
1910 		pci_info_ratelimited(vdev->pdev,
1911 			"VF token incorrectly provided, not a PF or VF\n");
1912 		return -EINVAL;
1913 	}
1914 
1915 	return 0;
1916 }
1917 
1918 #define VF_TOKEN_ARG "vf_token="
1919 
1920 int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
1921 {
1922 	struct vfio_pci_core_device *vdev =
1923 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1924 	bool vf_token = false;
1925 	uuid_t uuid;
1926 	int ret;
1927 
1928 	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
1929 		return 0; /* No match */
1930 
1931 	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
1932 		buf += strlen(pci_name(vdev->pdev));
1933 
1934 		if (*buf != ' ')
1935 			return 0; /* No match: non-whitespace after name */
1936 
1937 		while (*buf) {
1938 			if (*buf == ' ') {
1939 				buf++;
1940 				continue;
1941 			}
1942 
1943 			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
1944 						  strlen(VF_TOKEN_ARG))) {
1945 				buf += strlen(VF_TOKEN_ARG);
1946 
1947 				if (strlen(buf) < UUID_STRING_LEN)
1948 					return -EINVAL;
1949 
1950 				ret = uuid_parse(buf, &uuid);
1951 				if (ret)
1952 					return ret;
1953 
1954 				vf_token = true;
1955 				buf += UUID_STRING_LEN;
1956 			} else {
1957 				/* Unknown/duplicate option */
1958 				return -EINVAL;
1959 			}
1960 		}
1961 	}
1962 
1963 	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
1964 	if (ret)
1965 		return ret;
1966 
1967 	return 1; /* Match */
1968 }
1969 EXPORT_SYMBOL_GPL(vfio_pci_core_match);
1970 
1971 static int vfio_pci_bus_notifier(struct notifier_block *nb,
1972 				 unsigned long action, void *data)
1973 {
1974 	struct vfio_pci_core_device *vdev = container_of(nb,
1975 						    struct vfio_pci_core_device, nb);
1976 	struct device *dev = data;
1977 	struct pci_dev *pdev = to_pci_dev(dev);
1978 	struct pci_dev *physfn = pci_physfn(pdev);
1979 
1980 	if (action == BUS_NOTIFY_ADD_DEVICE &&
1981 	    pdev->is_virtfn && physfn == vdev->pdev) {
1982 		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
1983 			 pci_name(pdev));
1984 		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
1985 						  vdev->vdev.ops->name);
1986 	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
1987 		   pdev->is_virtfn && physfn == vdev->pdev) {
1988 		struct pci_driver *drv = pci_dev_driver(pdev);
1989 
1990 		if (drv && drv != pci_dev_driver(vdev->pdev))
1991 			pci_warn(vdev->pdev,
1992 				 "VF %s bound to driver %s while PF bound to driver %s\n",
1993 				 pci_name(pdev), drv->name,
1994 				 pci_dev_driver(vdev->pdev)->name);
1995 	}
1996 
1997 	return 0;
1998 }
1999 
2000 static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
2001 {
2002 	struct pci_dev *pdev = vdev->pdev;
2003 	struct vfio_pci_core_device *cur;
2004 	struct pci_dev *physfn;
2005 	int ret;
2006 
2007 	if (pdev->is_virtfn) {
2008 		/*
2009 		 * If this VF was created by our vfio_pci_core_sriov_configure()
2010 		 * then we can find the PF vfio_pci_core_device now, and due to
2011 		 * the locking in pci_disable_sriov() it cannot change until
2012 		 * this VF device driver is removed.
2013 		 */
2014 		physfn = pci_physfn(vdev->pdev);
2015 		mutex_lock(&vfio_pci_sriov_pfs_mutex);
2016 		list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) {
2017 			if (cur->pdev == physfn) {
2018 				vdev->sriov_pf_core_dev = cur;
2019 				break;
2020 			}
2021 		}
2022 		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2023 		return 0;
2024 	}
2025 
2026 	/* Not a SRIOV PF */
2027 	if (!pdev->is_physfn)
2028 		return 0;
2029 
2030 	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
2031 	if (!vdev->vf_token)
2032 		return -ENOMEM;
2033 
2034 	mutex_init(&vdev->vf_token->lock);
2035 	uuid_gen(&vdev->vf_token->uuid);
2036 
2037 	vdev->nb.notifier_call = vfio_pci_bus_notifier;
2038 	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
2039 	if (ret) {
2040 		kfree(vdev->vf_token);
2041 		return ret;
2042 	}
2043 	return 0;
2044 }
2045 
2046 static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
2047 {
2048 	if (!vdev->vf_token)
2049 		return;
2050 
2051 	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
2052 	WARN_ON(vdev->vf_token->users);
2053 	mutex_destroy(&vdev->vf_token->lock);
2054 	kfree(vdev->vf_token);
2055 }
2056 
2057 static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
2058 {
2059 	struct pci_dev *pdev = vdev->pdev;
2060 	int ret;
2061 
2062 	if (!vfio_pci_is_vga(pdev))
2063 		return 0;
2064 
2065 	ret = aperture_remove_conflicting_pci_devices(pdev, vdev->vdev.ops->name);
2066 	if (ret)
2067 		return ret;
2068 
2069 	ret = vga_client_register(pdev, vfio_pci_set_decode);
2070 	if (ret)
2071 		return ret;
2072 	vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
2073 	return 0;
2074 }
2075 
2076 static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
2077 {
2078 	struct pci_dev *pdev = vdev->pdev;
2079 
2080 	if (!vfio_pci_is_vga(pdev))
2081 		return;
2082 	vga_client_unregister(pdev);
2083 	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
2084 					      VGA_RSRC_LEGACY_IO |
2085 					      VGA_RSRC_LEGACY_MEM);
2086 }
2087 
2088 int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
2089 {
2090 	struct vfio_pci_core_device *vdev =
2091 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
2092 
2093 	vdev->pdev = to_pci_dev(core_vdev->dev);
2094 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
2095 	mutex_init(&vdev->igate);
2096 	spin_lock_init(&vdev->irqlock);
2097 	mutex_init(&vdev->ioeventfds_lock);
2098 	INIT_LIST_HEAD(&vdev->dummy_resources_list);
2099 	INIT_LIST_HEAD(&vdev->ioeventfds_list);
2100 	mutex_init(&vdev->vma_lock);
2101 	INIT_LIST_HEAD(&vdev->vma_list);
2102 	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
2103 	init_rwsem(&vdev->memory_lock);
2104 
2105 	return 0;
2106 }
2107 EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev);
2108 
2109 void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
2110 {
2111 	struct vfio_pci_core_device *vdev =
2112 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
2113 
2114 	mutex_destroy(&vdev->igate);
2115 	mutex_destroy(&vdev->ioeventfds_lock);
2116 	mutex_destroy(&vdev->vma_lock);
2117 	kfree(vdev->region);
2118 	kfree(vdev->pm_save);
2119 }
2120 EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev);
2121 
2122 int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
2123 {
2124 	struct pci_dev *pdev = vdev->pdev;
2125 	struct device *dev = &pdev->dev;
2126 	int ret;
2127 
2128 	/* Drivers must set the vfio_pci_core_device to their drvdata */
2129 	if (WARN_ON(vdev != dev_get_drvdata(dev)))
2130 		return -EINVAL;
2131 
2132 	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
2133 		return -EINVAL;
2134 
2135 	if (vdev->vdev.mig_ops) {
2136 		if (!(vdev->vdev.mig_ops->migration_get_state &&
2137 		      vdev->vdev.mig_ops->migration_set_state &&
2138 		      vdev->vdev.mig_ops->migration_get_data_size) ||
2139 		    !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
2140 			return -EINVAL;
2141 	}
2142 
2143 	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
2144 	    vdev->vdev.log_ops->log_stop &&
2145 	    vdev->vdev.log_ops->log_read_and_clear))
2146 		return -EINVAL;
2147 
2148 	/*
2149 	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
2150 	 * by the host or other users.  We cannot capture the VFs if they
2151 	 * already exist, nor can we track VF users.  Disabling SR-IOV here
2152 	 * would initiate removing the VFs, which would unbind the driver,
2153 	 * which is prone to blocking if that VF is also in use by vfio-pci.
2154 	 * Just reject these PFs and let the user sort it out.
2155 	 */
2156 	if (pci_num_vf(pdev)) {
2157 		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
2158 		return -EBUSY;
2159 	}
2160 
2161 	if (pci_is_root_bus(pdev->bus)) {
2162 		ret = vfio_assign_device_set(&vdev->vdev, vdev);
2163 	} else if (!pci_probe_reset_slot(pdev->slot)) {
2164 		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
2165 	} else {
2166 		/*
2167 		 * If there is no slot reset support for this device, the whole
2168 		 * bus needs to be grouped together to support bus-wide resets.
2169 		 */
2170 		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
2171 	}
2172 
2173 	if (ret)
2174 		return ret;
2175 	ret = vfio_pci_vf_init(vdev);
2176 	if (ret)
2177 		return ret;
2178 	ret = vfio_pci_vga_init(vdev);
2179 	if (ret)
2180 		goto out_vf;
2181 
2182 	vfio_pci_probe_power_state(vdev);
2183 
2184 	/*
2185 	 * pci-core sets the device power state to an unknown value at
2186 	 * bootup and after being removed from a driver.  The only
2187 	 * transition it allows from this unknown state is to D0, which
2188 	 * typically happens when a driver calls pci_enable_device().
2189 	 * We're not ready to enable the device yet, but we do want to
2190 	 * be able to get to D3.  Therefore first do a D0 transition
2191 	 * before enabling runtime PM.
2192 	 */
2193 	vfio_pci_set_power_state(vdev, PCI_D0);
2194 
2195 	dev->driver->pm = &vfio_pci_core_pm_ops;
2196 	pm_runtime_allow(dev);
2197 	if (!disable_idle_d3)
2198 		pm_runtime_put(dev);
2199 
2200 	ret = vfio_register_group_dev(&vdev->vdev);
2201 	if (ret)
2202 		goto out_power;
2203 	return 0;
2204 
2205 out_power:
2206 	if (!disable_idle_d3)
2207 		pm_runtime_get_noresume(dev);
2208 
2209 	pm_runtime_forbid(dev);
2210 out_vf:
2211 	vfio_pci_vf_uninit(vdev);
2212 	return ret;
2213 }
2214 EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
2215 
2216 void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
2217 {
2218 	vfio_pci_core_sriov_configure(vdev, 0);
2219 
2220 	vfio_unregister_group_dev(&vdev->vdev);
2221 
2222 	vfio_pci_vf_uninit(vdev);
2223 	vfio_pci_vga_uninit(vdev);
2224 
2225 	if (!disable_idle_d3)
2226 		pm_runtime_get_noresume(&vdev->pdev->dev);
2227 
2228 	pm_runtime_forbid(&vdev->pdev->dev);
2229 }
2230 EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
2231 
2232 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
2233 						pci_channel_state_t state)
2234 {
2235 	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
2236 
2237 	mutex_lock(&vdev->igate);
2238 
2239 	if (vdev->err_trigger)
2240 		eventfd_signal(vdev->err_trigger, 1);
2241 
2242 	mutex_unlock(&vdev->igate);
2243 
2244 	return PCI_ERS_RESULT_CAN_RECOVER;
2245 }
2246 EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
2247 
2248 int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
2249 				  int nr_virtfn)
2250 {
2251 	struct pci_dev *pdev = vdev->pdev;
2252 	int ret = 0;
2253 
2254 	device_lock_assert(&pdev->dev);
2255 
2256 	if (nr_virtfn) {
2257 		mutex_lock(&vfio_pci_sriov_pfs_mutex);
2258 		/*
2259 		 * The thread that adds the vdev to the list is the only thread
2260 		 * that gets to call pci_enable_sriov() and we will only allow
2261 		 * it to be called once without going through
2262 		 * pci_disable_sriov()
2263 		 */
2264 		if (!list_empty(&vdev->sriov_pfs_item)) {
2265 			ret = -EINVAL;
2266 			goto out_unlock;
2267 		}
2268 		list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs);
2269 		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2270 
2271 		/*
2272 		 * The PF power state should always be higher than the VF power
2273 		 * state. The PF can be in low power state either with runtime
2274 		 * power management (when there is no user) or PCI_PM_CTRL
2275 		 * register write by the user. If PF is in the low power state,
2276 		 * then change the power state to D0 first before enabling
2277 		 * SR-IOV. Also, this function can be called at any time, and
2278 		 * userspace PCI_PM_CTRL write can race against this code path,
2279 		 * so protect the same with 'memory_lock'.
2280 		 */
2281 		ret = pm_runtime_resume_and_get(&pdev->dev);
2282 		if (ret)
2283 			goto out_del;
2284 
2285 		down_write(&vdev->memory_lock);
2286 		vfio_pci_set_power_state(vdev, PCI_D0);
2287 		ret = pci_enable_sriov(pdev, nr_virtfn);
2288 		up_write(&vdev->memory_lock);
2289 		if (ret) {
2290 			pm_runtime_put(&pdev->dev);
2291 			goto out_del;
2292 		}
2293 		return nr_virtfn;
2294 	}
2295 
2296 	if (pci_num_vf(pdev)) {
2297 		pci_disable_sriov(pdev);
2298 		pm_runtime_put(&pdev->dev);
2299 	}
2300 
2301 out_del:
2302 	mutex_lock(&vfio_pci_sriov_pfs_mutex);
2303 	list_del_init(&vdev->sriov_pfs_item);
2304 out_unlock:
2305 	mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2306 	return ret;
2307 }
2308 EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
2309 
2310 const struct pci_error_handlers vfio_pci_core_err_handlers = {
2311 	.error_detected = vfio_pci_core_aer_err_detected,
2312 };
2313 EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
2314 
2315 static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
2316 			       struct vfio_pci_group_info *groups)
2317 {
2318 	unsigned int i;
2319 
2320 	for (i = 0; i < groups->count; i++)
2321 		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
2322 			return true;
2323 	return false;
2324 }
2325 
2326 static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
2327 {
2328 	struct vfio_device_set *dev_set = data;
2329 	struct vfio_device *cur;
2330 
2331 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
2332 		if (cur->dev == &pdev->dev)
2333 			return 0;
2334 	return -EBUSY;
2335 }
2336 
2337 /*
2338  * vfio-core considers a group to be viable and will create a vfio_device even
2339  * if some devices are bound to drivers like pci-stub or pcieport. Here we
2340  * require all PCI devices to be inside our dev_set since that ensures they stay
2341  * put and that every driver controlling the device can co-ordinate with the
2342  * device reset.
2343  *
2344  * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
2345  * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
2346  */
2347 static struct pci_dev *
2348 vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
2349 {
2350 	struct pci_dev *pdev;
2351 
2352 	lockdep_assert_held(&dev_set->lock);
2353 
2354 	/*
2355 	 * By definition all PCI devices in the dev_set share the same PCI
2356 	 * reset, so any pci_dev will have the same outcomes for
2357 	 * pci_probe_reset_*() and pci_reset_bus().
2358 	 */
2359 	pdev = list_first_entry(&dev_set->device_list,
2360 				struct vfio_pci_core_device,
2361 				vdev.dev_set_list)->pdev;
2362 
2363 	/* pci_reset_bus() is supported */
2364 	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
2365 		return NULL;
2366 
2367 	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
2368 					  dev_set,
2369 					  !pci_probe_reset_slot(pdev->slot)))
2370 		return NULL;
2371 	return pdev;
2372 }
2373 
2374 static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
2375 {
2376 	struct vfio_pci_core_device *cur;
2377 	int ret;
2378 
2379 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2380 		ret = pm_runtime_resume_and_get(&cur->pdev->dev);
2381 		if (ret)
2382 			goto unwind;
2383 	}
2384 
2385 	return 0;
2386 
2387 unwind:
2388 	list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
2389 					     vdev.dev_set_list)
2390 		pm_runtime_put(&cur->pdev->dev);
2391 
2392 	return ret;
2393 }
2394 
2395 /*
2396  * We need to get memory_lock for each device, but devices can share mmap_lock,
2397  * therefore we need to zap and hold the vma_lock for each device, and only then
2398  * get each memory_lock.
2399  */
2400 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
2401 				      struct vfio_pci_group_info *groups)
2402 {
2403 	struct vfio_pci_core_device *cur_mem;
2404 	struct vfio_pci_core_device *cur_vma;
2405 	struct vfio_pci_core_device *cur;
2406 	struct pci_dev *pdev;
2407 	bool is_mem = true;
2408 	int ret;
2409 
2410 	mutex_lock(&dev_set->lock);
2411 	cur_mem = list_first_entry(&dev_set->device_list,
2412 				   struct vfio_pci_core_device,
2413 				   vdev.dev_set_list);
2414 
2415 	pdev = vfio_pci_dev_set_resettable(dev_set);
2416 	if (!pdev) {
2417 		ret = -EINVAL;
2418 		goto err_unlock;
2419 	}
2420 
2421 	/*
2422 	 * Some of the devices in the dev_set can be in the runtime suspended
2423 	 * state. Increment the usage count for all the devices in the dev_set
2424 	 * before reset and decrement the same after reset.
2425 	 */
2426 	ret = vfio_pci_dev_set_pm_runtime_get(dev_set);
2427 	if (ret)
2428 		goto err_unlock;
2429 
2430 	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
2431 		/*
2432 		 * Test whether all the affected devices are contained by the
2433 		 * set of groups provided by the user.
2434 		 */
2435 		if (!vfio_dev_in_groups(cur_vma, groups)) {
2436 			ret = -EINVAL;
2437 			goto err_undo;
2438 		}
2439 
2440 		/*
2441 		 * Locking multiple devices is prone to deadlock, runaway and
2442 		 * unwind if we hit contention.
2443 		 */
2444 		if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
2445 			ret = -EBUSY;
2446 			goto err_undo;
2447 		}
2448 	}
2449 	cur_vma = NULL;
2450 
2451 	list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
2452 		if (!down_write_trylock(&cur_mem->memory_lock)) {
2453 			ret = -EBUSY;
2454 			goto err_undo;
2455 		}
2456 		mutex_unlock(&cur_mem->vma_lock);
2457 	}
2458 	cur_mem = NULL;
2459 
2460 	/*
2461 	 * The pci_reset_bus() will reset all the devices in the bus.
2462 	 * The power state can be non-D0 for some of the devices in the bus.
2463 	 * For these devices, the pci_reset_bus() will internally set
2464 	 * the power state to D0 without vfio driver involvement.
2465 	 * For the devices which have NoSoftRst-, the reset function can
2466 	 * cause the PCI config space reset without restoring the original
2467 	 * state (saved locally in 'vdev->pm_save').
2468 	 */
2469 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
2470 		vfio_pci_set_power_state(cur, PCI_D0);
2471 
2472 	ret = pci_reset_bus(pdev);
2473 
2474 err_undo:
2475 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2476 		if (cur == cur_mem)
2477 			is_mem = false;
2478 		if (cur == cur_vma)
2479 			break;
2480 		if (is_mem)
2481 			up_write(&cur->memory_lock);
2482 		else
2483 			mutex_unlock(&cur->vma_lock);
2484 	}
2485 
2486 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
2487 		pm_runtime_put(&cur->pdev->dev);
2488 err_unlock:
2489 	mutex_unlock(&dev_set->lock);
2490 	return ret;
2491 }
2492 
2493 static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
2494 {
2495 	struct vfio_pci_core_device *cur;
2496 	bool needs_reset = false;
2497 
2498 	/* No other VFIO device in the set can be open. */
2499 	if (vfio_device_set_open_count(dev_set) > 1)
2500 		return false;
2501 
2502 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
2503 		needs_reset |= cur->needs_reset;
2504 	return needs_reset;
2505 }
2506 
2507 /*
2508  * If a bus or slot reset is available for the provided dev_set and:
2509  *  - All of the devices affected by that bus or slot reset are unused
2510  *  - At least one of the affected devices is marked dirty via
2511  *    needs_reset (such as by lack of FLR support)
2512  * Then attempt to perform that bus or slot reset.
2513  */
2514 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
2515 {
2516 	struct vfio_pci_core_device *cur;
2517 	struct pci_dev *pdev;
2518 	bool reset_done = false;
2519 
2520 	if (!vfio_pci_dev_set_needs_reset(dev_set))
2521 		return;
2522 
2523 	pdev = vfio_pci_dev_set_resettable(dev_set);
2524 	if (!pdev)
2525 		return;
2526 
2527 	/*
2528 	 * Some of the devices in the bus can be in the runtime suspended
2529 	 * state. Increment the usage count for all the devices in the dev_set
2530 	 * before reset and decrement the same after reset.
2531 	 */
2532 	if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
2533 		return;
2534 
2535 	if (!pci_reset_bus(pdev))
2536 		reset_done = true;
2537 
2538 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2539 		if (reset_done)
2540 			cur->needs_reset = false;
2541 
2542 		if (!disable_idle_d3)
2543 			pm_runtime_put(&cur->pdev->dev);
2544 	}
2545 }
2546 
2547 void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
2548 			      bool is_disable_idle_d3)
2549 {
2550 	nointxmask = is_nointxmask;
2551 	disable_vga = is_disable_vga;
2552 	disable_idle_d3 = is_disable_idle_d3;
2553 }
2554 EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
2555 
2556 static void vfio_pci_core_cleanup(void)
2557 {
2558 	vfio_pci_uninit_perm_bits();
2559 }
2560 
2561 static int __init vfio_pci_core_init(void)
2562 {
2563 	/* Allocate shared config space permission data used by all devices */
2564 	return vfio_pci_init_perm_bits();
2565 }
2566 
2567 module_init(vfio_pci_core_init);
2568 module_exit(vfio_pci_core_cleanup);
2569 
2570 MODULE_LICENSE("GPL v2");
2571 MODULE_AUTHOR(DRIVER_AUTHOR);
2572 MODULE_DESCRIPTION(DRIVER_DESC);
2573