xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision ca9e45b4)
10f3e72b5SJason Gunthorpe // SPDX-License-Identifier: GPL-2.0-only
20f3e72b5SJason Gunthorpe /*
30f3e72b5SJason Gunthorpe  * VFIO core
40f3e72b5SJason Gunthorpe  *
50f3e72b5SJason Gunthorpe  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
60f3e72b5SJason Gunthorpe  *     Author: Alex Williamson <alex.williamson@redhat.com>
70f3e72b5SJason Gunthorpe  *
80f3e72b5SJason Gunthorpe  * Derived from original vfio:
90f3e72b5SJason Gunthorpe  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
100f3e72b5SJason Gunthorpe  * Author: Tom Lyon, pugs@cisco.com
110f3e72b5SJason Gunthorpe  */
120f3e72b5SJason Gunthorpe 
130f3e72b5SJason Gunthorpe #include <linux/cdev.h>
140f3e72b5SJason Gunthorpe #include <linux/compat.h>
150f3e72b5SJason Gunthorpe #include <linux/device.h>
160f3e72b5SJason Gunthorpe #include <linux/fs.h>
170f3e72b5SJason Gunthorpe #include <linux/idr.h>
180f3e72b5SJason Gunthorpe #include <linux/iommu.h>
192b48f52fSMatthew Rosato #ifdef CONFIG_HAVE_KVM
202b48f52fSMatthew Rosato #include <linux/kvm_host.h>
212b48f52fSMatthew Rosato #endif
220f3e72b5SJason Gunthorpe #include <linux/list.h>
230f3e72b5SJason Gunthorpe #include <linux/miscdevice.h>
240f3e72b5SJason Gunthorpe #include <linux/module.h>
250f3e72b5SJason Gunthorpe #include <linux/mutex.h>
260f3e72b5SJason Gunthorpe #include <linux/pci.h>
270f3e72b5SJason Gunthorpe #include <linux/rwsem.h>
280f3e72b5SJason Gunthorpe #include <linux/sched.h>
290f3e72b5SJason Gunthorpe #include <linux/slab.h>
300f3e72b5SJason Gunthorpe #include <linux/stat.h>
310f3e72b5SJason Gunthorpe #include <linux/string.h>
320f3e72b5SJason Gunthorpe #include <linux/uaccess.h>
330f3e72b5SJason Gunthorpe #include <linux/vfio.h>
340f3e72b5SJason Gunthorpe #include <linux/wait.h>
350f3e72b5SJason Gunthorpe #include <linux/sched/signal.h>
368e5c6995SAbhishek Sahu #include <linux/pm_runtime.h>
3780c4b92aSYishai Hadas #include <linux/interval_tree.h>
3880c4b92aSYishai Hadas #include <linux/iova_bitmap.h>
392a3dab19SJason Gunthorpe #include <linux/iommufd.h>
400f3e72b5SJason Gunthorpe #include "vfio.h"
410f3e72b5SJason Gunthorpe 
420f3e72b5SJason Gunthorpe #define DRIVER_VERSION	"0.3"
430f3e72b5SJason Gunthorpe #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
440f3e72b5SJason Gunthorpe #define DRIVER_DESC	"VFIO - User Level meta-driver"
450f3e72b5SJason Gunthorpe 
460f3e72b5SJason Gunthorpe static struct vfio {
473c28a761SYi Liu 	struct class			*device_class;
483c28a761SYi Liu 	struct ida			device_ida;
490f3e72b5SJason Gunthorpe } vfio;
500f3e72b5SJason Gunthorpe 
51c9a397ceSJason Gunthorpe #ifdef CONFIG_VFIO_NOIOMMU
52c9a397ceSJason Gunthorpe bool vfio_noiommu __read_mostly;
53c9a397ceSJason Gunthorpe module_param_named(enable_unsafe_noiommu_mode,
54c9a397ceSJason Gunthorpe 		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55c9a397ceSJason Gunthorpe MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
56c9a397ceSJason Gunthorpe #endif
57c9a397ceSJason Gunthorpe 
580f3e72b5SJason Gunthorpe static DEFINE_XARRAY(vfio_device_set_xa);
590f3e72b5SJason Gunthorpe 
600f3e72b5SJason Gunthorpe int vfio_assign_device_set(struct vfio_device *device, void *set_id)
610f3e72b5SJason Gunthorpe {
620f3e72b5SJason Gunthorpe 	unsigned long idx = (unsigned long)set_id;
630f3e72b5SJason Gunthorpe 	struct vfio_device_set *new_dev_set;
640f3e72b5SJason Gunthorpe 	struct vfio_device_set *dev_set;
650f3e72b5SJason Gunthorpe 
660f3e72b5SJason Gunthorpe 	if (WARN_ON(!set_id))
670f3e72b5SJason Gunthorpe 		return -EINVAL;
680f3e72b5SJason Gunthorpe 
690f3e72b5SJason Gunthorpe 	/*
700f3e72b5SJason Gunthorpe 	 * Atomically acquire a singleton object in the xarray for this set_id
710f3e72b5SJason Gunthorpe 	 */
720f3e72b5SJason Gunthorpe 	xa_lock(&vfio_device_set_xa);
730f3e72b5SJason Gunthorpe 	dev_set = xa_load(&vfio_device_set_xa, idx);
740f3e72b5SJason Gunthorpe 	if (dev_set)
750f3e72b5SJason Gunthorpe 		goto found_get_ref;
760f3e72b5SJason Gunthorpe 	xa_unlock(&vfio_device_set_xa);
770f3e72b5SJason Gunthorpe 
780f3e72b5SJason Gunthorpe 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
790f3e72b5SJason Gunthorpe 	if (!new_dev_set)
800f3e72b5SJason Gunthorpe 		return -ENOMEM;
810f3e72b5SJason Gunthorpe 	mutex_init(&new_dev_set->lock);
820f3e72b5SJason Gunthorpe 	INIT_LIST_HEAD(&new_dev_set->device_list);
830f3e72b5SJason Gunthorpe 	new_dev_set->set_id = set_id;
840f3e72b5SJason Gunthorpe 
850f3e72b5SJason Gunthorpe 	xa_lock(&vfio_device_set_xa);
860f3e72b5SJason Gunthorpe 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
870f3e72b5SJason Gunthorpe 			       GFP_KERNEL);
880f3e72b5SJason Gunthorpe 	if (!dev_set) {
890f3e72b5SJason Gunthorpe 		dev_set = new_dev_set;
900f3e72b5SJason Gunthorpe 		goto found_get_ref;
910f3e72b5SJason Gunthorpe 	}
920f3e72b5SJason Gunthorpe 
930f3e72b5SJason Gunthorpe 	kfree(new_dev_set);
940f3e72b5SJason Gunthorpe 	if (xa_is_err(dev_set)) {
950f3e72b5SJason Gunthorpe 		xa_unlock(&vfio_device_set_xa);
960f3e72b5SJason Gunthorpe 		return xa_err(dev_set);
970f3e72b5SJason Gunthorpe 	}
980f3e72b5SJason Gunthorpe 
990f3e72b5SJason Gunthorpe found_get_ref:
1000f3e72b5SJason Gunthorpe 	dev_set->device_count++;
1010f3e72b5SJason Gunthorpe 	xa_unlock(&vfio_device_set_xa);
1020f3e72b5SJason Gunthorpe 	mutex_lock(&dev_set->lock);
1030f3e72b5SJason Gunthorpe 	device->dev_set = dev_set;
1040f3e72b5SJason Gunthorpe 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
1050f3e72b5SJason Gunthorpe 	mutex_unlock(&dev_set->lock);
1060f3e72b5SJason Gunthorpe 	return 0;
1070f3e72b5SJason Gunthorpe }
1080f3e72b5SJason Gunthorpe EXPORT_SYMBOL_GPL(vfio_assign_device_set);
1090f3e72b5SJason Gunthorpe 
1100f3e72b5SJason Gunthorpe static void vfio_release_device_set(struct vfio_device *device)
1110f3e72b5SJason Gunthorpe {
1120f3e72b5SJason Gunthorpe 	struct vfio_device_set *dev_set = device->dev_set;
1130f3e72b5SJason Gunthorpe 
1140f3e72b5SJason Gunthorpe 	if (!dev_set)
1150f3e72b5SJason Gunthorpe 		return;
1160f3e72b5SJason Gunthorpe 
1170f3e72b5SJason Gunthorpe 	mutex_lock(&dev_set->lock);
1180f3e72b5SJason Gunthorpe 	list_del(&device->dev_set_list);
1190f3e72b5SJason Gunthorpe 	mutex_unlock(&dev_set->lock);
1200f3e72b5SJason Gunthorpe 
1210f3e72b5SJason Gunthorpe 	xa_lock(&vfio_device_set_xa);
1220f3e72b5SJason Gunthorpe 	if (!--dev_set->device_count) {
1230f3e72b5SJason Gunthorpe 		__xa_erase(&vfio_device_set_xa,
1240f3e72b5SJason Gunthorpe 			   (unsigned long)dev_set->set_id);
1250f3e72b5SJason Gunthorpe 		mutex_destroy(&dev_set->lock);
1260f3e72b5SJason Gunthorpe 		kfree(dev_set);
1270f3e72b5SJason Gunthorpe 	}
1280f3e72b5SJason Gunthorpe 	xa_unlock(&vfio_device_set_xa);
1290f3e72b5SJason Gunthorpe }
1300f3e72b5SJason Gunthorpe 
1315cd189e4SAnthony DeRossi unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
1325cd189e4SAnthony DeRossi {
1335cd189e4SAnthony DeRossi 	struct vfio_device *cur;
1345cd189e4SAnthony DeRossi 	unsigned int open_count = 0;
1355cd189e4SAnthony DeRossi 
1365cd189e4SAnthony DeRossi 	lockdep_assert_held(&dev_set->lock);
1375cd189e4SAnthony DeRossi 
1385cd189e4SAnthony DeRossi 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
1395cd189e4SAnthony DeRossi 		open_count += cur->open_count;
1405cd189e4SAnthony DeRossi 	return open_count;
1415cd189e4SAnthony DeRossi }
1425cd189e4SAnthony DeRossi EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
1435cd189e4SAnthony DeRossi 
144a80e1de9SYi Liu struct vfio_device *
145a80e1de9SYi Liu vfio_find_device_in_devset(struct vfio_device_set *dev_set,
146a80e1de9SYi Liu 			   struct device *dev)
147a80e1de9SYi Liu {
148a80e1de9SYi Liu 	struct vfio_device *cur;
149a80e1de9SYi Liu 
150a80e1de9SYi Liu 	lockdep_assert_held(&dev_set->lock);
151a80e1de9SYi Liu 
152a80e1de9SYi Liu 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
153a80e1de9SYi Liu 		if (cur->dev == dev)
154a80e1de9SYi Liu 			return cur;
155a80e1de9SYi Liu 	return NULL;
156a80e1de9SYi Liu }
157a80e1de9SYi Liu EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
158a80e1de9SYi Liu 
1590f3e72b5SJason Gunthorpe /*
1600f3e72b5SJason Gunthorpe  * Device objects - create, release, get, put, search
1610f3e72b5SJason Gunthorpe  */
1620f3e72b5SJason Gunthorpe /* Device reference always implies a group reference */
1639eefba80SYi Liu void vfio_device_put_registration(struct vfio_device *device)
1640f3e72b5SJason Gunthorpe {
1650f3e72b5SJason Gunthorpe 	if (refcount_dec_and_test(&device->refcount))
1660f3e72b5SJason Gunthorpe 		complete(&device->comp);
1670f3e72b5SJason Gunthorpe }
1680f3e72b5SJason Gunthorpe 
1699eefba80SYi Liu bool vfio_device_try_get_registration(struct vfio_device *device)
1700f3e72b5SJason Gunthorpe {
1710f3e72b5SJason Gunthorpe 	return refcount_inc_not_zero(&device->refcount);
1720f3e72b5SJason Gunthorpe }
1730f3e72b5SJason Gunthorpe 
1740f3e72b5SJason Gunthorpe /*
1750f3e72b5SJason Gunthorpe  * VFIO driver API
1760f3e72b5SJason Gunthorpe  */
177cb9ff3f3SKevin Tian /* Release helper called by vfio_put_device() */
1783c28a761SYi Liu static void vfio_device_release(struct device *dev)
179cb9ff3f3SKevin Tian {
180cb9ff3f3SKevin Tian 	struct vfio_device *device =
1813c28a761SYi Liu 			container_of(dev, struct vfio_device, device);
182cb9ff3f3SKevin Tian 
183ebb72b76SKevin Tian 	vfio_release_device_set(device);
1843c28a761SYi Liu 	ida_free(&vfio.device_ida, device->index);
185cb9ff3f3SKevin Tian 
186913447d0SEric Farman 	if (device->ops->release)
187cb9ff3f3SKevin Tian 		device->ops->release(device);
188913447d0SEric Farman 
189913447d0SEric Farman 	kvfree(device);
190cb9ff3f3SKevin Tian }
191cb9ff3f3SKevin Tian 
192d1104f93SEric Farman static int vfio_init_device(struct vfio_device *device, struct device *dev,
193d1104f93SEric Farman 			    const struct vfio_device_ops *ops);
194d1104f93SEric Farman 
195cb9ff3f3SKevin Tian /*
196cb9ff3f3SKevin Tian  * Allocate and initialize vfio_device so it can be registered to vfio
197cb9ff3f3SKevin Tian  * core.
198cb9ff3f3SKevin Tian  *
199cb9ff3f3SKevin Tian  * Drivers should use the wrapper vfio_alloc_device() for allocation.
200cb9ff3f3SKevin Tian  * @size is the size of the structure to be allocated, including any
201cb9ff3f3SKevin Tian  * private data used by the driver.
202cb9ff3f3SKevin Tian  *
203cb9ff3f3SKevin Tian  * Driver may provide an @init callback to cover device private data.
204cb9ff3f3SKevin Tian  *
205cb9ff3f3SKevin Tian  * Use vfio_put_device() to release the structure after success return.
206cb9ff3f3SKevin Tian  */
207cb9ff3f3SKevin Tian struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
208cb9ff3f3SKevin Tian 				       const struct vfio_device_ops *ops)
209cb9ff3f3SKevin Tian {
210cb9ff3f3SKevin Tian 	struct vfio_device *device;
211cb9ff3f3SKevin Tian 	int ret;
212cb9ff3f3SKevin Tian 
213cb9ff3f3SKevin Tian 	if (WARN_ON(size < sizeof(struct vfio_device)))
214cb9ff3f3SKevin Tian 		return ERR_PTR(-EINVAL);
215cb9ff3f3SKevin Tian 
216cb9ff3f3SKevin Tian 	device = kvzalloc(size, GFP_KERNEL);
217cb9ff3f3SKevin Tian 	if (!device)
218cb9ff3f3SKevin Tian 		return ERR_PTR(-ENOMEM);
219cb9ff3f3SKevin Tian 
220cb9ff3f3SKevin Tian 	ret = vfio_init_device(device, dev, ops);
221cb9ff3f3SKevin Tian 	if (ret)
222cb9ff3f3SKevin Tian 		goto out_free;
223cb9ff3f3SKevin Tian 	return device;
224cb9ff3f3SKevin Tian 
225cb9ff3f3SKevin Tian out_free:
226cb9ff3f3SKevin Tian 	kvfree(device);
227cb9ff3f3SKevin Tian 	return ERR_PTR(ret);
228cb9ff3f3SKevin Tian }
229cb9ff3f3SKevin Tian EXPORT_SYMBOL_GPL(_vfio_alloc_device);
230cb9ff3f3SKevin Tian 
231cb9ff3f3SKevin Tian /*
232cb9ff3f3SKevin Tian  * Initialize a vfio_device so it can be registered to vfio core.
233cb9ff3f3SKevin Tian  */
234d1104f93SEric Farman static int vfio_init_device(struct vfio_device *device, struct device *dev,
235cb9ff3f3SKevin Tian 			    const struct vfio_device_ops *ops)
236cb9ff3f3SKevin Tian {
237cb9ff3f3SKevin Tian 	int ret;
238cb9ff3f3SKevin Tian 
2393c28a761SYi Liu 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
2403c28a761SYi Liu 	if (ret < 0) {
2413c28a761SYi Liu 		dev_dbg(dev, "Error to alloc index\n");
2423c28a761SYi Liu 		return ret;
2433c28a761SYi Liu 	}
2443c28a761SYi Liu 
2453c28a761SYi Liu 	device->index = ret;
246ebb72b76SKevin Tian 	init_completion(&device->comp);
247ebb72b76SKevin Tian 	device->dev = dev;
248ebb72b76SKevin Tian 	device->ops = ops;
249cb9ff3f3SKevin Tian 
250cb9ff3f3SKevin Tian 	if (ops->init) {
251cb9ff3f3SKevin Tian 		ret = ops->init(device);
252cb9ff3f3SKevin Tian 		if (ret)
253cb9ff3f3SKevin Tian 			goto out_uninit;
254cb9ff3f3SKevin Tian 	}
255cb9ff3f3SKevin Tian 
2563c28a761SYi Liu 	device_initialize(&device->device);
2573c28a761SYi Liu 	device->device.release = vfio_device_release;
2583c28a761SYi Liu 	device->device.class = vfio.device_class;
2593c28a761SYi Liu 	device->device.parent = device->dev;
260cb9ff3f3SKevin Tian 	return 0;
261cb9ff3f3SKevin Tian 
262cb9ff3f3SKevin Tian out_uninit:
263ebb72b76SKevin Tian 	vfio_release_device_set(device);
2643c28a761SYi Liu 	ida_free(&vfio.device_ida, device->index);
265cb9ff3f3SKevin Tian 	return ret;
266cb9ff3f3SKevin Tian }
267cb9ff3f3SKevin Tian 
26849ea02d3SYi Liu static int __vfio_register_dev(struct vfio_device *device,
26949ea02d3SYi Liu 			       enum vfio_group_type type)
27049ea02d3SYi Liu {
27149ea02d3SYi Liu 	int ret;
27249ea02d3SYi Liu 
2737d12578cSYi Liu 	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
2747d12578cSYi Liu 		    (!device->ops->bind_iommufd ||
2757d12578cSYi Liu 		     !device->ops->unbind_iommufd ||
2769048c734SYi Liu 		     !device->ops->attach_ioas ||
2779048c734SYi Liu 		     !device->ops->detach_ioas)))
278a4d1f91dSJason Gunthorpe 		return -EINVAL;
279a4d1f91dSJason Gunthorpe 
2800f3e72b5SJason Gunthorpe 	/*
2810f3e72b5SJason Gunthorpe 	 * If the driver doesn't specify a set then the device is added to a
2820f3e72b5SJason Gunthorpe 	 * singleton set just for itself.
2830f3e72b5SJason Gunthorpe 	 */
2840f3e72b5SJason Gunthorpe 	if (!device->dev_set)
2850f3e72b5SJason Gunthorpe 		vfio_assign_device_set(device, device);
2860f3e72b5SJason Gunthorpe 
2873c28a761SYi Liu 	ret = dev_set_name(&device->device, "vfio%d", device->index);
2883c28a761SYi Liu 	if (ret)
28949ea02d3SYi Liu 		return ret;
29049ea02d3SYi Liu 
29149ea02d3SYi Liu 	ret = vfio_device_set_group(device, type);
29249ea02d3SYi Liu 	if (ret)
29349ea02d3SYi Liu 		return ret;
2943c28a761SYi Liu 
2958b6f173aSYi Liu 	ret = vfio_device_add(device);
2963c28a761SYi Liu 	if (ret)
2973c28a761SYi Liu 		goto err_out;
2983c28a761SYi Liu 
2990f3e72b5SJason Gunthorpe 	/* Refcounting can't start until the driver calls register */
3000f3e72b5SJason Gunthorpe 	refcount_set(&device->refcount, 1);
3010f3e72b5SJason Gunthorpe 
30232e09228SYi Liu 	vfio_device_group_register(device);
3030f3e72b5SJason Gunthorpe 
3040f3e72b5SJason Gunthorpe 	return 0;
3053c28a761SYi Liu err_out:
306ca5f21b2SJason Gunthorpe 	vfio_device_remove_group(device);
3073c28a761SYi Liu 	return ret;
3080f3e72b5SJason Gunthorpe }
3090f3e72b5SJason Gunthorpe 
3100f3e72b5SJason Gunthorpe int vfio_register_group_dev(struct vfio_device *device)
3110f3e72b5SJason Gunthorpe {
31249ea02d3SYi Liu 	return __vfio_register_dev(device, VFIO_IOMMU);
3130f3e72b5SJason Gunthorpe }
3140f3e72b5SJason Gunthorpe EXPORT_SYMBOL_GPL(vfio_register_group_dev);
3150f3e72b5SJason Gunthorpe 
3160f3e72b5SJason Gunthorpe /*
3170f3e72b5SJason Gunthorpe  * Register a virtual device without IOMMU backing.  The user of this
3180f3e72b5SJason Gunthorpe  * device must not be able to directly trigger unmediated DMA.
3190f3e72b5SJason Gunthorpe  */
3200f3e72b5SJason Gunthorpe int vfio_register_emulated_iommu_dev(struct vfio_device *device)
3210f3e72b5SJason Gunthorpe {
32249ea02d3SYi Liu 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
3230f3e72b5SJason Gunthorpe }
3240f3e72b5SJason Gunthorpe EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
3250f3e72b5SJason Gunthorpe 
3260f3e72b5SJason Gunthorpe /*
3270f3e72b5SJason Gunthorpe  * Decrement the device reference count and wait for the device to be
3280f3e72b5SJason Gunthorpe  * removed.  Open file descriptors for the device... */
3290f3e72b5SJason Gunthorpe void vfio_unregister_group_dev(struct vfio_device *device)
3300f3e72b5SJason Gunthorpe {
3310f3e72b5SJason Gunthorpe 	unsigned int i = 0;
3320f3e72b5SJason Gunthorpe 	bool interrupted = false;
3330f3e72b5SJason Gunthorpe 	long rc;
3340f3e72b5SJason Gunthorpe 
335291872a5SYi Liu 	/*
336291872a5SYi Liu 	 * Prevent new device opened by userspace via the
337291872a5SYi Liu 	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
338291872a5SYi Liu 	 */
339291872a5SYi Liu 	vfio_device_group_unregister(device);
340291872a5SYi Liu 
3418b6f173aSYi Liu 	/*
3428b6f173aSYi Liu 	 * Balances vfio_device_add() in register path, also prevents
3438b6f173aSYi Liu 	 * new device opened by userspace in the cdev path.
3448b6f173aSYi Liu 	 */
3458b6f173aSYi Liu 	vfio_device_del(device);
34638c24544SYi Liu 
3474a725b8dSKevin Tian 	vfio_device_put_registration(device);
3480f3e72b5SJason Gunthorpe 	rc = try_wait_for_completion(&device->comp);
3490f3e72b5SJason Gunthorpe 	while (rc <= 0) {
3500f3e72b5SJason Gunthorpe 		if (device->ops->request)
3510f3e72b5SJason Gunthorpe 			device->ops->request(device, i++);
3520f3e72b5SJason Gunthorpe 
3530f3e72b5SJason Gunthorpe 		if (interrupted) {
3540f3e72b5SJason Gunthorpe 			rc = wait_for_completion_timeout(&device->comp,
3550f3e72b5SJason Gunthorpe 							 HZ * 10);
3560f3e72b5SJason Gunthorpe 		} else {
3570f3e72b5SJason Gunthorpe 			rc = wait_for_completion_interruptible_timeout(
3580f3e72b5SJason Gunthorpe 				&device->comp, HZ * 10);
3590f3e72b5SJason Gunthorpe 			if (rc < 0) {
3600f3e72b5SJason Gunthorpe 				interrupted = true;
3610f3e72b5SJason Gunthorpe 				dev_warn(device->dev,
3620f3e72b5SJason Gunthorpe 					 "Device is currently in use, task"
3630f3e72b5SJason Gunthorpe 					 " \"%s\" (%d) "
3640f3e72b5SJason Gunthorpe 					 "blocked until device is released",
3650f3e72b5SJason Gunthorpe 					 current->comm, task_pid_nr(current));
3660f3e72b5SJason Gunthorpe 			}
3670f3e72b5SJason Gunthorpe 		}
3680f3e72b5SJason Gunthorpe 	}
3690f3e72b5SJason Gunthorpe 
37049ea02d3SYi Liu 	/* Balances vfio_device_set_group in register path */
371ca5f21b2SJason Gunthorpe 	vfio_device_remove_group(device);
3720f3e72b5SJason Gunthorpe }
3730f3e72b5SJason Gunthorpe EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
3740f3e72b5SJason Gunthorpe 
3752b48f52fSMatthew Rosato #ifdef CONFIG_HAVE_KVM
3765c6de3eaSYi Liu void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
3772b48f52fSMatthew Rosato {
3782b48f52fSMatthew Rosato 	void (*pfn)(struct kvm *kvm);
3792b48f52fSMatthew Rosato 	bool (*fn)(struct kvm *kvm);
3802b48f52fSMatthew Rosato 	bool ret;
3812b48f52fSMatthew Rosato 
3822b48f52fSMatthew Rosato 	lockdep_assert_held(&device->dev_set->lock);
3832b48f52fSMatthew Rosato 
3845c6de3eaSYi Liu 	if (!kvm)
3855c6de3eaSYi Liu 		return;
3865c6de3eaSYi Liu 
3872b48f52fSMatthew Rosato 	pfn = symbol_get(kvm_put_kvm);
3882b48f52fSMatthew Rosato 	if (WARN_ON(!pfn))
3892b48f52fSMatthew Rosato 		return;
3902b48f52fSMatthew Rosato 
3912b48f52fSMatthew Rosato 	fn = symbol_get(kvm_get_kvm_safe);
3922b48f52fSMatthew Rosato 	if (WARN_ON(!fn)) {
3932b48f52fSMatthew Rosato 		symbol_put(kvm_put_kvm);
3942b48f52fSMatthew Rosato 		return;
3952b48f52fSMatthew Rosato 	}
3962b48f52fSMatthew Rosato 
3972b48f52fSMatthew Rosato 	ret = fn(kvm);
3982b48f52fSMatthew Rosato 	symbol_put(kvm_get_kvm_safe);
3992b48f52fSMatthew Rosato 	if (!ret) {
4002b48f52fSMatthew Rosato 		symbol_put(kvm_put_kvm);
4012b48f52fSMatthew Rosato 		return;
4022b48f52fSMatthew Rosato 	}
4032b48f52fSMatthew Rosato 
4042b48f52fSMatthew Rosato 	device->put_kvm = pfn;
4052b48f52fSMatthew Rosato 	device->kvm = kvm;
4062b48f52fSMatthew Rosato }
4072b48f52fSMatthew Rosato 
4082b48f52fSMatthew Rosato void vfio_device_put_kvm(struct vfio_device *device)
4092b48f52fSMatthew Rosato {
4102b48f52fSMatthew Rosato 	lockdep_assert_held(&device->dev_set->lock);
4112b48f52fSMatthew Rosato 
4122b48f52fSMatthew Rosato 	if (!device->kvm)
4132b48f52fSMatthew Rosato 		return;
4142b48f52fSMatthew Rosato 
4152b48f52fSMatthew Rosato 	if (WARN_ON(!device->put_kvm))
4162b48f52fSMatthew Rosato 		goto clear;
4172b48f52fSMatthew Rosato 
4182b48f52fSMatthew Rosato 	device->put_kvm(device->kvm);
4192b48f52fSMatthew Rosato 	device->put_kvm = NULL;
4202b48f52fSMatthew Rosato 	symbol_put(kvm_put_kvm);
4212b48f52fSMatthew Rosato 
4222b48f52fSMatthew Rosato clear:
4232b48f52fSMatthew Rosato 	device->kvm = NULL;
4242b48f52fSMatthew Rosato }
4252b48f52fSMatthew Rosato #endif
4262b48f52fSMatthew Rosato 
4270f3e72b5SJason Gunthorpe /* true if the vfio_device has open_device() called but not close_device() */
4284741f2e9SJason Gunthorpe static bool vfio_assert_device_open(struct vfio_device *device)
4290f3e72b5SJason Gunthorpe {
4300f3e72b5SJason Gunthorpe 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
4310f3e72b5SJason Gunthorpe }
4320f3e72b5SJason Gunthorpe 
433b1a3b5c6SYi Liu struct vfio_device_file *
434b1a3b5c6SYi Liu vfio_allocate_device_file(struct vfio_device *device)
435b1a3b5c6SYi Liu {
436b1a3b5c6SYi Liu 	struct vfio_device_file *df;
437b1a3b5c6SYi Liu 
438b1a3b5c6SYi Liu 	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
439b1a3b5c6SYi Liu 	if (!df)
440b1a3b5c6SYi Liu 		return ERR_PTR(-ENOMEM);
441b1a3b5c6SYi Liu 
442b1a3b5c6SYi Liu 	df->device = device;
44334aeeecdSYi Liu 	spin_lock_init(&df->kvm_ref_lock);
444b1a3b5c6SYi Liu 
445b1a3b5c6SYi Liu 	return df;
446b1a3b5c6SYi Liu }
447b1a3b5c6SYi Liu 
44805f37e1cSYi Liu static int vfio_df_device_first_open(struct vfio_device_file *df)
449294aaccbSJason Gunthorpe {
45005f37e1cSYi Liu 	struct vfio_device *device = df->device;
45105f37e1cSYi Liu 	struct iommufd_ctx *iommufd = df->iommufd;
452294aaccbSJason Gunthorpe 	int ret;
453294aaccbSJason Gunthorpe 
454294aaccbSJason Gunthorpe 	lockdep_assert_held(&device->dev_set->lock);
455294aaccbSJason Gunthorpe 
456294aaccbSJason Gunthorpe 	if (!try_module_get(device->dev->driver->owner))
457294aaccbSJason Gunthorpe 		return -ENODEV;
458294aaccbSJason Gunthorpe 
4595c8d3d93SYi Liu 	if (iommufd)
46031014aefSYi Liu 		ret = vfio_df_iommufd_bind(df);
4615c8d3d93SYi Liu 	else
4625c8d3d93SYi Liu 		ret = vfio_device_group_use_iommu(device);
463bab6fabcSJason Gunthorpe 	if (ret)
464bab6fabcSJason Gunthorpe 		goto err_module_put;
465bab6fabcSJason Gunthorpe 
466294aaccbSJason Gunthorpe 	if (device->ops->open_device) {
467294aaccbSJason Gunthorpe 		ret = device->ops->open_device(device);
468294aaccbSJason Gunthorpe 		if (ret)
4695c8d3d93SYi Liu 			goto err_unuse_iommu;
470294aaccbSJason Gunthorpe 	}
471294aaccbSJason Gunthorpe 	return 0;
472294aaccbSJason Gunthorpe 
4735c8d3d93SYi Liu err_unuse_iommu:
4745c8d3d93SYi Liu 	if (iommufd)
47531014aefSYi Liu 		vfio_df_iommufd_unbind(df);
4765c8d3d93SYi Liu 	else
4775c8d3d93SYi Liu 		vfio_device_group_unuse_iommu(device);
478bab6fabcSJason Gunthorpe err_module_put:
479294aaccbSJason Gunthorpe 	module_put(device->dev->driver->owner);
480294aaccbSJason Gunthorpe 	return ret;
481294aaccbSJason Gunthorpe }
482294aaccbSJason Gunthorpe 
48305f37e1cSYi Liu static void vfio_df_device_last_close(struct vfio_device_file *df)
484294aaccbSJason Gunthorpe {
48505f37e1cSYi Liu 	struct vfio_device *device = df->device;
48605f37e1cSYi Liu 	struct iommufd_ctx *iommufd = df->iommufd;
48705f37e1cSYi Liu 
488294aaccbSJason Gunthorpe 	lockdep_assert_held(&device->dev_set->lock);
489294aaccbSJason Gunthorpe 
490294aaccbSJason Gunthorpe 	if (device->ops->close_device)
491294aaccbSJason Gunthorpe 		device->ops->close_device(device);
4925c8d3d93SYi Liu 	if (iommufd)
49331014aefSYi Liu 		vfio_df_iommufd_unbind(df);
4945c8d3d93SYi Liu 	else
4955c8d3d93SYi Liu 		vfio_device_group_unuse_iommu(device);
496294aaccbSJason Gunthorpe 	module_put(device->dev->driver->owner);
497294aaccbSJason Gunthorpe }
498294aaccbSJason Gunthorpe 
49905f37e1cSYi Liu int vfio_df_open(struct vfio_device_file *df)
5000f3e72b5SJason Gunthorpe {
50105f37e1cSYi Liu 	struct vfio_device *device = df->device;
5025cfff077SYi Liu 	int ret = 0;
5030f3e72b5SJason Gunthorpe 
5042b48f52fSMatthew Rosato 	lockdep_assert_held(&device->dev_set->lock);
5052b48f52fSMatthew Rosato 
506839e692fSYi Liu 	/*
507839e692fSYi Liu 	 * Only the group path allows the device to be opened multiple
508839e692fSYi Liu 	 * times.  The device cdev path doesn't have a secure way for it.
509839e692fSYi Liu 	 */
510839e692fSYi Liu 	if (device->open_count != 0 && !df->group)
511839e692fSYi Liu 		return -EINVAL;
512839e692fSYi Liu 
5130f3e72b5SJason Gunthorpe 	device->open_count++;
5140f3e72b5SJason Gunthorpe 	if (device->open_count == 1) {
51505f37e1cSYi Liu 		ret = vfio_df_device_first_open(df);
5160f3e72b5SJason Gunthorpe 		if (ret)
5175cfff077SYi Liu 			device->open_count--;
5180f3e72b5SJason Gunthorpe 	}
5190f3e72b5SJason Gunthorpe 
5205cfff077SYi Liu 	return ret;
5215cfff077SYi Liu }
5225cfff077SYi Liu 
52305f37e1cSYi Liu void vfio_df_close(struct vfio_device_file *df)
5245cfff077SYi Liu {
52505f37e1cSYi Liu 	struct vfio_device *device = df->device;
52605f37e1cSYi Liu 
5272b48f52fSMatthew Rosato 	lockdep_assert_held(&device->dev_set->lock);
5282b48f52fSMatthew Rosato 
5295cfff077SYi Liu 	vfio_assert_device_open(device);
5305cfff077SYi Liu 	if (device->open_count == 1)
53105f37e1cSYi Liu 		vfio_df_device_last_close(df);
5325cfff077SYi Liu 	device->open_count--;
5335cfff077SYi Liu }
5345cfff077SYi Liu 
5350f3e72b5SJason Gunthorpe /*
5368e5c6995SAbhishek Sahu  * Wrapper around pm_runtime_resume_and_get().
5378e5c6995SAbhishek Sahu  * Return error code on failure or 0 on success.
5388e5c6995SAbhishek Sahu  */
5398e5c6995SAbhishek Sahu static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
5408e5c6995SAbhishek Sahu {
5418e5c6995SAbhishek Sahu 	struct device *dev = device->dev;
5428e5c6995SAbhishek Sahu 
5438e5c6995SAbhishek Sahu 	if (dev->driver && dev->driver->pm) {
5448e5c6995SAbhishek Sahu 		int ret;
5458e5c6995SAbhishek Sahu 
5468e5c6995SAbhishek Sahu 		ret = pm_runtime_resume_and_get(dev);
5478e5c6995SAbhishek Sahu 		if (ret) {
5488e5c6995SAbhishek Sahu 			dev_info_ratelimited(dev,
5498e5c6995SAbhishek Sahu 				"vfio: runtime resume failed %d\n", ret);
5508e5c6995SAbhishek Sahu 			return -EIO;
5518e5c6995SAbhishek Sahu 		}
5528e5c6995SAbhishek Sahu 	}
5538e5c6995SAbhishek Sahu 
5548e5c6995SAbhishek Sahu 	return 0;
5558e5c6995SAbhishek Sahu }
5568e5c6995SAbhishek Sahu 
5578e5c6995SAbhishek Sahu /*
5588e5c6995SAbhishek Sahu  * Wrapper around pm_runtime_put().
5598e5c6995SAbhishek Sahu  */
5608e5c6995SAbhishek Sahu static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
5618e5c6995SAbhishek Sahu {
5628e5c6995SAbhishek Sahu 	struct device *dev = device->dev;
5638e5c6995SAbhishek Sahu 
5648e5c6995SAbhishek Sahu 	if (dev->driver && dev->driver->pm)
5658e5c6995SAbhishek Sahu 		pm_runtime_put(dev);
5668e5c6995SAbhishek Sahu }
5678e5c6995SAbhishek Sahu 
5688e5c6995SAbhishek Sahu /*
5690f3e72b5SJason Gunthorpe  * VFIO Device fd
5700f3e72b5SJason Gunthorpe  */
5710f3e72b5SJason Gunthorpe static int vfio_device_fops_release(struct inode *inode, struct file *filep)
5720f3e72b5SJason Gunthorpe {
573b1a3b5c6SYi Liu 	struct vfio_device_file *df = filep->private_data;
574b1a3b5c6SYi Liu 	struct vfio_device *device = df->device;
5750f3e72b5SJason Gunthorpe 
5768b6f173aSYi Liu 	if (df->group)
57705f37e1cSYi Liu 		vfio_df_group_close(df);
5780f3e72b5SJason Gunthorpe 
5794a725b8dSKevin Tian 	vfio_device_put_registration(device);
5800f3e72b5SJason Gunthorpe 
581b1a3b5c6SYi Liu 	kfree(df);
582b1a3b5c6SYi Liu 
5830f3e72b5SJason Gunthorpe 	return 0;
5840f3e72b5SJason Gunthorpe }
5850f3e72b5SJason Gunthorpe 
5860f3e72b5SJason Gunthorpe /*
5870f3e72b5SJason Gunthorpe  * vfio_mig_get_next_state - Compute the next step in the FSM
5880f3e72b5SJason Gunthorpe  * @cur_fsm - The current state the device is in
5890f3e72b5SJason Gunthorpe  * @new_fsm - The target state to reach
5900f3e72b5SJason Gunthorpe  * @next_fsm - Pointer to the next step to get to new_fsm
5910f3e72b5SJason Gunthorpe  *
5920f3e72b5SJason Gunthorpe  * Return 0 upon success, otherwise -errno
5930f3e72b5SJason Gunthorpe  * Upon success the next step in the state progression between cur_fsm and
5940f3e72b5SJason Gunthorpe  * new_fsm will be set in next_fsm.
5950f3e72b5SJason Gunthorpe  *
5960f3e72b5SJason Gunthorpe  * This breaks down requests for combination transitions into smaller steps and
5970f3e72b5SJason Gunthorpe  * returns the next step to get to new_fsm. The function may need to be called
5980f3e72b5SJason Gunthorpe  * multiple times before reaching new_fsm.
5990f3e72b5SJason Gunthorpe  *
6000f3e72b5SJason Gunthorpe  */
6010f3e72b5SJason Gunthorpe int vfio_mig_get_next_state(struct vfio_device *device,
6020f3e72b5SJason Gunthorpe 			    enum vfio_device_mig_state cur_fsm,
6030f3e72b5SJason Gunthorpe 			    enum vfio_device_mig_state new_fsm,
6040f3e72b5SJason Gunthorpe 			    enum vfio_device_mig_state *next_fsm)
6050f3e72b5SJason Gunthorpe {
6064db52602SJason Gunthorpe 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
6070f3e72b5SJason Gunthorpe 	/*
6080f3e72b5SJason Gunthorpe 	 * The coding in this table requires the driver to implement the
6090f3e72b5SJason Gunthorpe 	 * following FSM arcs:
6100f3e72b5SJason Gunthorpe 	 *         RESUMING -> STOP
6110f3e72b5SJason Gunthorpe 	 *         STOP -> RESUMING
6120f3e72b5SJason Gunthorpe 	 *         STOP -> STOP_COPY
6130f3e72b5SJason Gunthorpe 	 *         STOP_COPY -> STOP
6140f3e72b5SJason Gunthorpe 	 *
6150f3e72b5SJason Gunthorpe 	 * If P2P is supported then the driver must also implement these FSM
6160f3e72b5SJason Gunthorpe 	 * arcs:
6170f3e72b5SJason Gunthorpe 	 *         RUNNING -> RUNNING_P2P
6180f3e72b5SJason Gunthorpe 	 *         RUNNING_P2P -> RUNNING
6190f3e72b5SJason Gunthorpe 	 *         RUNNING_P2P -> STOP
6200f3e72b5SJason Gunthorpe 	 *         STOP -> RUNNING_P2P
6214db52602SJason Gunthorpe 	 *
6224db52602SJason Gunthorpe 	 * If precopy is supported then the driver must support these additional
6234db52602SJason Gunthorpe 	 * FSM arcs:
6244db52602SJason Gunthorpe 	 *         RUNNING -> PRE_COPY
6254db52602SJason Gunthorpe 	 *         PRE_COPY -> RUNNING
6264db52602SJason Gunthorpe 	 *         PRE_COPY -> STOP_COPY
6274db52602SJason Gunthorpe 	 * However, if precopy and P2P are supported together then the driver
6284db52602SJason Gunthorpe 	 * must support these additional arcs beyond the P2P arcs above:
6294db52602SJason Gunthorpe 	 *         PRE_COPY -> RUNNING
6304db52602SJason Gunthorpe 	 *         PRE_COPY -> PRE_COPY_P2P
6314db52602SJason Gunthorpe 	 *         PRE_COPY_P2P -> PRE_COPY
6324db52602SJason Gunthorpe 	 *         PRE_COPY_P2P -> RUNNING_P2P
6334db52602SJason Gunthorpe 	 *         PRE_COPY_P2P -> STOP_COPY
6344db52602SJason Gunthorpe 	 *         RUNNING -> PRE_COPY
6354db52602SJason Gunthorpe 	 *         RUNNING_P2P -> PRE_COPY_P2P
6364db52602SJason Gunthorpe 	 *
6374db52602SJason Gunthorpe 	 * Without P2P and precopy the driver must implement:
6380f3e72b5SJason Gunthorpe 	 *         RUNNING -> STOP
6390f3e72b5SJason Gunthorpe 	 *         STOP -> RUNNING
6400f3e72b5SJason Gunthorpe 	 *
6410f3e72b5SJason Gunthorpe 	 * The coding will step through multiple states for some combination
6420f3e72b5SJason Gunthorpe 	 * transitions; if all optional features are supported, this means the
6430f3e72b5SJason Gunthorpe 	 * following ones:
6444db52602SJason Gunthorpe 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
6454db52602SJason Gunthorpe 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
6464db52602SJason Gunthorpe 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
6474db52602SJason Gunthorpe 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
6484db52602SJason Gunthorpe 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
6494db52602SJason Gunthorpe 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
6504db52602SJason Gunthorpe 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
6510f3e72b5SJason Gunthorpe 	 *         RESUMING -> STOP -> RUNNING_P2P
6524db52602SJason Gunthorpe 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
6530f3e72b5SJason Gunthorpe 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
6544db52602SJason Gunthorpe 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
6550f3e72b5SJason Gunthorpe 	 *         RESUMING -> STOP -> STOP_COPY
6564db52602SJason Gunthorpe 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
6570f3e72b5SJason Gunthorpe 	 *         RUNNING -> RUNNING_P2P -> STOP
6580f3e72b5SJason Gunthorpe 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
6590f3e72b5SJason Gunthorpe 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
6604db52602SJason Gunthorpe 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
6610f3e72b5SJason Gunthorpe 	 *         RUNNING_P2P -> STOP -> RESUMING
6620f3e72b5SJason Gunthorpe 	 *         RUNNING_P2P -> STOP -> STOP_COPY
6634db52602SJason Gunthorpe 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
6640f3e72b5SJason Gunthorpe 	 *         STOP -> RUNNING_P2P -> RUNNING
6654db52602SJason Gunthorpe 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
6660f3e72b5SJason Gunthorpe 	 *         STOP_COPY -> STOP -> RESUMING
6670f3e72b5SJason Gunthorpe 	 *         STOP_COPY -> STOP -> RUNNING_P2P
6680f3e72b5SJason Gunthorpe 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
6694db52602SJason Gunthorpe 	 *
6704db52602SJason Gunthorpe 	 *  The following transitions are blocked:
6714db52602SJason Gunthorpe 	 *         STOP_COPY -> PRE_COPY
6724db52602SJason Gunthorpe 	 *         STOP_COPY -> PRE_COPY_P2P
6730f3e72b5SJason Gunthorpe 	 */
6740f3e72b5SJason Gunthorpe 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
6750f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_STOP] = {
6760f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
6770f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
6784db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
6794db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
6800f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
6810f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
6820f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
6830f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
6840f3e72b5SJason Gunthorpe 		},
6850f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_RUNNING] = {
6860f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
6870f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
6884db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
6894db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
6900f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
6910f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
6920f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
6930f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
6940f3e72b5SJason Gunthorpe 		},
6954db52602SJason Gunthorpe 		[VFIO_DEVICE_STATE_PRE_COPY] = {
6964db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
6974db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
6984db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
6994db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
7004db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
7014db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
7024db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
7034db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
7044db52602SJason Gunthorpe 		},
7054db52602SJason Gunthorpe 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
7064db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
7074db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
7084db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
7094db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
7104db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
7114db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
7124db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
7134db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
7144db52602SJason Gunthorpe 		},
7150f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_STOP_COPY] = {
7160f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
7170f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
7184db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
7194db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
7200f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
7210f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
7220f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
7230f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
7240f3e72b5SJason Gunthorpe 		},
7250f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_RESUMING] = {
7260f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
7270f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
7284db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
7294db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
7300f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
7310f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
7320f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
7330f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
7340f3e72b5SJason Gunthorpe 		},
7350f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
7360f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
7370f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
7384db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
7394db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
7400f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
7410f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
7420f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
7430f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
7440f3e72b5SJason Gunthorpe 		},
7450f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_ERROR] = {
7460f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
7470f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
7484db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
7494db52602SJason Gunthorpe 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
7500f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
7510f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
7520f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
7530f3e72b5SJason Gunthorpe 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
7540f3e72b5SJason Gunthorpe 		},
7550f3e72b5SJason Gunthorpe 	};
7560f3e72b5SJason Gunthorpe 
7570f3e72b5SJason Gunthorpe 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
7580f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
7590f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
7604db52602SJason Gunthorpe 		[VFIO_DEVICE_STATE_PRE_COPY] =
7614db52602SJason Gunthorpe 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
7624db52602SJason Gunthorpe 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
7634db52602SJason Gunthorpe 						   VFIO_MIGRATION_P2P |
7644db52602SJason Gunthorpe 						   VFIO_MIGRATION_PRE_COPY,
7650f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
7660f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
7670f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
7680f3e72b5SJason Gunthorpe 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
7690f3e72b5SJason Gunthorpe 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
7700f3e72b5SJason Gunthorpe 	};
7710f3e72b5SJason Gunthorpe 
7720f3e72b5SJason Gunthorpe 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
7730f3e72b5SJason Gunthorpe 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
7740f3e72b5SJason Gunthorpe 			state_flags_table[cur_fsm]))
7750f3e72b5SJason Gunthorpe 		return -EINVAL;
7760f3e72b5SJason Gunthorpe 
7770f3e72b5SJason Gunthorpe 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
7780f3e72b5SJason Gunthorpe 	   (state_flags_table[new_fsm] & device->migration_flags) !=
7790f3e72b5SJason Gunthorpe 			state_flags_table[new_fsm])
7800f3e72b5SJason Gunthorpe 		return -EINVAL;
7810f3e72b5SJason Gunthorpe 
7820f3e72b5SJason Gunthorpe 	/*
7830f3e72b5SJason Gunthorpe 	 * Arcs touching optional and unsupported states are skipped over. The
7840f3e72b5SJason Gunthorpe 	 * driver will instead see an arc from the original state to the next
7850f3e72b5SJason Gunthorpe 	 * logical state, as per the above comment.
7860f3e72b5SJason Gunthorpe 	 */
7870f3e72b5SJason Gunthorpe 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
7880f3e72b5SJason Gunthorpe 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
7890f3e72b5SJason Gunthorpe 			state_flags_table[*next_fsm])
7900f3e72b5SJason Gunthorpe 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
7910f3e72b5SJason Gunthorpe 
7920f3e72b5SJason Gunthorpe 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
7930f3e72b5SJason Gunthorpe }
7940f3e72b5SJason Gunthorpe EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
7950f3e72b5SJason Gunthorpe 
7960f3e72b5SJason Gunthorpe /*
7970f3e72b5SJason Gunthorpe  * Convert the drivers's struct file into a FD number and return it to userspace
7980f3e72b5SJason Gunthorpe  */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	/*
	 * Copy the fd number out to userspace before fd_install() makes
	 * the file reachable; if the copy fails the reserved fd can
	 * still be recalled with put_unused_fd().
	 */
	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	/* On any failure this function consumes the caller's filp reference. */
	fput(filp);
	return ret;
}
8250f3e72b5SJason Gunthorpe 
/*
 * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE handler.  GET reports the current
 * migration state; SET asks the driver to move to mig.device_state, which
 * may return a data transfer file whose fd is handed back to userspace.
 */
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	/*
	 * No file to return: report data_fd = -1.  A driver error
	 * (IS_ERR(filp)) is still propagated, but only after the result
	 * structure has been copied back to userspace.
	 */
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
8750f3e72b5SJason Gunthorpe 
8764e016f96SYishai Hadas static int
8774e016f96SYishai Hadas vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
8784e016f96SYishai Hadas 					      u32 flags, void __user *arg,
8794e016f96SYishai Hadas 					      size_t argsz)
8804e016f96SYishai Hadas {
8814e016f96SYishai Hadas 	struct vfio_device_feature_mig_data_size data_size = {};
8824e016f96SYishai Hadas 	unsigned long stop_copy_length;
8834e016f96SYishai Hadas 	int ret;
8844e016f96SYishai Hadas 
8854e016f96SYishai Hadas 	if (!device->mig_ops)
8864e016f96SYishai Hadas 		return -ENOTTY;
8874e016f96SYishai Hadas 
8884e016f96SYishai Hadas 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
8894e016f96SYishai Hadas 				 sizeof(data_size));
8904e016f96SYishai Hadas 	if (ret != 1)
8914e016f96SYishai Hadas 		return ret;
8924e016f96SYishai Hadas 
8934e016f96SYishai Hadas 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
8944e016f96SYishai Hadas 	if (ret)
8954e016f96SYishai Hadas 		return ret;
8964e016f96SYishai Hadas 
8974e016f96SYishai Hadas 	data_size.stop_copy_length = stop_copy_length;
8984e016f96SYishai Hadas 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
8994e016f96SYishai Hadas 		return -EFAULT;
9004e016f96SYishai Hadas 
9014e016f96SYishai Hadas 	return 0;
9024e016f96SYishai Hadas }
9034e016f96SYishai Hadas 
9040f3e72b5SJason Gunthorpe static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
9050f3e72b5SJason Gunthorpe 					       u32 flags, void __user *arg,
9060f3e72b5SJason Gunthorpe 					       size_t argsz)
9070f3e72b5SJason Gunthorpe {
9080f3e72b5SJason Gunthorpe 	struct vfio_device_feature_migration mig = {
9090f3e72b5SJason Gunthorpe 		.flags = device->migration_flags,
9100f3e72b5SJason Gunthorpe 	};
9110f3e72b5SJason Gunthorpe 	int ret;
9120f3e72b5SJason Gunthorpe 
9130f3e72b5SJason Gunthorpe 	if (!device->mig_ops)
9140f3e72b5SJason Gunthorpe 		return -ENOTTY;
9150f3e72b5SJason Gunthorpe 
9160f3e72b5SJason Gunthorpe 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
9170f3e72b5SJason Gunthorpe 				 sizeof(mig));
9180f3e72b5SJason Gunthorpe 	if (ret != 1)
9190f3e72b5SJason Gunthorpe 		return ret;
9200f3e72b5SJason Gunthorpe 	if (copy_to_user(arg, &mig, sizeof(mig)))
9210f3e72b5SJason Gunthorpe 		return -EFAULT;
9220f3e72b5SJason Gunthorpe 	return 0;
9230f3e72b5SJason Gunthorpe }
9240f3e72b5SJason Gunthorpe 
92580c4b92aSYishai Hadas /* Ranges should fit into a single kernel page */
92680c4b92aSYishai Hadas #define LOG_MAX_RANGES \
92780c4b92aSYishai Hadas 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
92880c4b92aSYishai Hadas 
92980c4b92aSYishai Hadas static int
93080c4b92aSYishai Hadas vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
93180c4b92aSYishai Hadas 					u32 flags, void __user *arg,
93280c4b92aSYishai Hadas 					size_t argsz)
93380c4b92aSYishai Hadas {
93480c4b92aSYishai Hadas 	size_t minsz =
93580c4b92aSYishai Hadas 		offsetofend(struct vfio_device_feature_dma_logging_control,
93680c4b92aSYishai Hadas 			    ranges);
93780c4b92aSYishai Hadas 	struct vfio_device_feature_dma_logging_range __user *ranges;
93880c4b92aSYishai Hadas 	struct vfio_device_feature_dma_logging_control control;
93980c4b92aSYishai Hadas 	struct vfio_device_feature_dma_logging_range range;
94080c4b92aSYishai Hadas 	struct rb_root_cached root = RB_ROOT_CACHED;
94180c4b92aSYishai Hadas 	struct interval_tree_node *nodes;
94280c4b92aSYishai Hadas 	u64 iova_end;
94380c4b92aSYishai Hadas 	u32 nnodes;
94480c4b92aSYishai Hadas 	int i, ret;
94580c4b92aSYishai Hadas 
94680c4b92aSYishai Hadas 	if (!device->log_ops)
94780c4b92aSYishai Hadas 		return -ENOTTY;
94880c4b92aSYishai Hadas 
94980c4b92aSYishai Hadas 	ret = vfio_check_feature(flags, argsz,
95080c4b92aSYishai Hadas 				 VFIO_DEVICE_FEATURE_SET,
95180c4b92aSYishai Hadas 				 sizeof(control));
95280c4b92aSYishai Hadas 	if (ret != 1)
95380c4b92aSYishai Hadas 		return ret;
95480c4b92aSYishai Hadas 
95580c4b92aSYishai Hadas 	if (copy_from_user(&control, arg, minsz))
95680c4b92aSYishai Hadas 		return -EFAULT;
95780c4b92aSYishai Hadas 
95880c4b92aSYishai Hadas 	nnodes = control.num_ranges;
95980c4b92aSYishai Hadas 	if (!nnodes)
96080c4b92aSYishai Hadas 		return -EINVAL;
96180c4b92aSYishai Hadas 
96280c4b92aSYishai Hadas 	if (nnodes > LOG_MAX_RANGES)
96380c4b92aSYishai Hadas 		return -E2BIG;
96480c4b92aSYishai Hadas 
96580c4b92aSYishai Hadas 	ranges = u64_to_user_ptr(control.ranges);
96680c4b92aSYishai Hadas 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
96780c4b92aSYishai Hadas 			      GFP_KERNEL);
96880c4b92aSYishai Hadas 	if (!nodes)
96980c4b92aSYishai Hadas 		return -ENOMEM;
97080c4b92aSYishai Hadas 
97180c4b92aSYishai Hadas 	for (i = 0; i < nnodes; i++) {
97280c4b92aSYishai Hadas 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
97380c4b92aSYishai Hadas 			ret = -EFAULT;
97480c4b92aSYishai Hadas 			goto end;
97580c4b92aSYishai Hadas 		}
97680c4b92aSYishai Hadas 		if (!IS_ALIGNED(range.iova, control.page_size) ||
97780c4b92aSYishai Hadas 		    !IS_ALIGNED(range.length, control.page_size)) {
97880c4b92aSYishai Hadas 			ret = -EINVAL;
97980c4b92aSYishai Hadas 			goto end;
98080c4b92aSYishai Hadas 		}
98180c4b92aSYishai Hadas 
98280c4b92aSYishai Hadas 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
98380c4b92aSYishai Hadas 		    iova_end > ULONG_MAX) {
98480c4b92aSYishai Hadas 			ret = -EOVERFLOW;
98580c4b92aSYishai Hadas 			goto end;
98680c4b92aSYishai Hadas 		}
98780c4b92aSYishai Hadas 
98880c4b92aSYishai Hadas 		nodes[i].start = range.iova;
98980c4b92aSYishai Hadas 		nodes[i].last = range.iova + range.length - 1;
99080c4b92aSYishai Hadas 		if (interval_tree_iter_first(&root, nodes[i].start,
99180c4b92aSYishai Hadas 					     nodes[i].last)) {
99280c4b92aSYishai Hadas 			/* Range overlapping */
99380c4b92aSYishai Hadas 			ret = -EINVAL;
99480c4b92aSYishai Hadas 			goto end;
99580c4b92aSYishai Hadas 		}
99680c4b92aSYishai Hadas 		interval_tree_insert(nodes + i, &root);
99780c4b92aSYishai Hadas 	}
99880c4b92aSYishai Hadas 
99980c4b92aSYishai Hadas 	ret = device->log_ops->log_start(device, &root, nnodes,
100080c4b92aSYishai Hadas 					 &control.page_size);
100180c4b92aSYishai Hadas 	if (ret)
100280c4b92aSYishai Hadas 		goto end;
100380c4b92aSYishai Hadas 
100480c4b92aSYishai Hadas 	if (copy_to_user(arg, &control, sizeof(control))) {
100580c4b92aSYishai Hadas 		ret = -EFAULT;
100680c4b92aSYishai Hadas 		device->log_ops->log_stop(device);
100780c4b92aSYishai Hadas 	}
100880c4b92aSYishai Hadas 
100980c4b92aSYishai Hadas end:
101080c4b92aSYishai Hadas 	kfree(nodes);
101180c4b92aSYishai Hadas 	return ret;
101280c4b92aSYishai Hadas }
101380c4b92aSYishai Hadas 
101480c4b92aSYishai Hadas static int
101580c4b92aSYishai Hadas vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
101680c4b92aSYishai Hadas 				       u32 flags, void __user *arg,
101780c4b92aSYishai Hadas 				       size_t argsz)
101880c4b92aSYishai Hadas {
101980c4b92aSYishai Hadas 	int ret;
102080c4b92aSYishai Hadas 
102180c4b92aSYishai Hadas 	if (!device->log_ops)
102280c4b92aSYishai Hadas 		return -ENOTTY;
102380c4b92aSYishai Hadas 
102480c4b92aSYishai Hadas 	ret = vfio_check_feature(flags, argsz,
102580c4b92aSYishai Hadas 				 VFIO_DEVICE_FEATURE_SET, 0);
102680c4b92aSYishai Hadas 	if (ret != 1)
102780c4b92aSYishai Hadas 		return ret;
102880c4b92aSYishai Hadas 
102980c4b92aSYishai Hadas 	return device->log_ops->log_stop(device);
103080c4b92aSYishai Hadas }
103180c4b92aSYishai Hadas 
103280c4b92aSYishai Hadas static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
103380c4b92aSYishai Hadas 					  unsigned long iova, size_t length,
103480c4b92aSYishai Hadas 					  void *opaque)
103580c4b92aSYishai Hadas {
103680c4b92aSYishai Hadas 	struct vfio_device *device = opaque;
103780c4b92aSYishai Hadas 
103880c4b92aSYishai Hadas 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
103980c4b92aSYishai Hadas }
104080c4b92aSYishai Hadas 
104180c4b92aSYishai Hadas static int
104280c4b92aSYishai Hadas vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
104380c4b92aSYishai Hadas 					 u32 flags, void __user *arg,
104480c4b92aSYishai Hadas 					 size_t argsz)
104580c4b92aSYishai Hadas {
104680c4b92aSYishai Hadas 	size_t minsz =
104780c4b92aSYishai Hadas 		offsetofend(struct vfio_device_feature_dma_logging_report,
104880c4b92aSYishai Hadas 			    bitmap);
104980c4b92aSYishai Hadas 	struct vfio_device_feature_dma_logging_report report;
105080c4b92aSYishai Hadas 	struct iova_bitmap *iter;
105180c4b92aSYishai Hadas 	u64 iova_end;
105280c4b92aSYishai Hadas 	int ret;
105380c4b92aSYishai Hadas 
105480c4b92aSYishai Hadas 	if (!device->log_ops)
105580c4b92aSYishai Hadas 		return -ENOTTY;
105680c4b92aSYishai Hadas 
105780c4b92aSYishai Hadas 	ret = vfio_check_feature(flags, argsz,
105880c4b92aSYishai Hadas 				 VFIO_DEVICE_FEATURE_GET,
105980c4b92aSYishai Hadas 				 sizeof(report));
106080c4b92aSYishai Hadas 	if (ret != 1)
106180c4b92aSYishai Hadas 		return ret;
106280c4b92aSYishai Hadas 
106380c4b92aSYishai Hadas 	if (copy_from_user(&report, arg, minsz))
106480c4b92aSYishai Hadas 		return -EFAULT;
106580c4b92aSYishai Hadas 
106680c4b92aSYishai Hadas 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
106780c4b92aSYishai Hadas 		return -EINVAL;
106880c4b92aSYishai Hadas 
106980c4b92aSYishai Hadas 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
107080c4b92aSYishai Hadas 	    iova_end > ULONG_MAX)
107180c4b92aSYishai Hadas 		return -EOVERFLOW;
107280c4b92aSYishai Hadas 
107380c4b92aSYishai Hadas 	iter = iova_bitmap_alloc(report.iova, report.length,
107480c4b92aSYishai Hadas 				 report.page_size,
107580c4b92aSYishai Hadas 				 u64_to_user_ptr(report.bitmap));
107680c4b92aSYishai Hadas 	if (IS_ERR(iter))
107780c4b92aSYishai Hadas 		return PTR_ERR(iter);
107880c4b92aSYishai Hadas 
107980c4b92aSYishai Hadas 	ret = iova_bitmap_for_each(iter, device,
108080c4b92aSYishai Hadas 				   vfio_device_log_read_and_clear);
108180c4b92aSYishai Hadas 
108280c4b92aSYishai Hadas 	iova_bitmap_free(iter);
108380c4b92aSYishai Hadas 	return ret;
108480c4b92aSYishai Hadas }
108580c4b92aSYishai Hadas 
10860f3e72b5SJason Gunthorpe static int vfio_ioctl_device_feature(struct vfio_device *device,
10870f3e72b5SJason Gunthorpe 				     struct vfio_device_feature __user *arg)
10880f3e72b5SJason Gunthorpe {
10890f3e72b5SJason Gunthorpe 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
10900f3e72b5SJason Gunthorpe 	struct vfio_device_feature feature;
10910f3e72b5SJason Gunthorpe 
10920f3e72b5SJason Gunthorpe 	if (copy_from_user(&feature, arg, minsz))
10930f3e72b5SJason Gunthorpe 		return -EFAULT;
10940f3e72b5SJason Gunthorpe 
10950f3e72b5SJason Gunthorpe 	if (feature.argsz < minsz)
10960f3e72b5SJason Gunthorpe 		return -EINVAL;
10970f3e72b5SJason Gunthorpe 
10980f3e72b5SJason Gunthorpe 	/* Check unknown flags */
10990f3e72b5SJason Gunthorpe 	if (feature.flags &
11000f3e72b5SJason Gunthorpe 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
11010f3e72b5SJason Gunthorpe 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
11020f3e72b5SJason Gunthorpe 		return -EINVAL;
11030f3e72b5SJason Gunthorpe 
11040f3e72b5SJason Gunthorpe 	/* GET & SET are mutually exclusive except with PROBE */
11050f3e72b5SJason Gunthorpe 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
11060f3e72b5SJason Gunthorpe 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
11070f3e72b5SJason Gunthorpe 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
11080f3e72b5SJason Gunthorpe 		return -EINVAL;
11090f3e72b5SJason Gunthorpe 
11100f3e72b5SJason Gunthorpe 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
11110f3e72b5SJason Gunthorpe 	case VFIO_DEVICE_FEATURE_MIGRATION:
11120f3e72b5SJason Gunthorpe 		return vfio_ioctl_device_feature_migration(
11130f3e72b5SJason Gunthorpe 			device, feature.flags, arg->data,
11140f3e72b5SJason Gunthorpe 			feature.argsz - minsz);
11150f3e72b5SJason Gunthorpe 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
11160f3e72b5SJason Gunthorpe 		return vfio_ioctl_device_feature_mig_device_state(
11170f3e72b5SJason Gunthorpe 			device, feature.flags, arg->data,
11180f3e72b5SJason Gunthorpe 			feature.argsz - minsz);
111980c4b92aSYishai Hadas 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
112080c4b92aSYishai Hadas 		return vfio_ioctl_device_feature_logging_start(
112180c4b92aSYishai Hadas 			device, feature.flags, arg->data,
112280c4b92aSYishai Hadas 			feature.argsz - minsz);
112380c4b92aSYishai Hadas 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
112480c4b92aSYishai Hadas 		return vfio_ioctl_device_feature_logging_stop(
112580c4b92aSYishai Hadas 			device, feature.flags, arg->data,
112680c4b92aSYishai Hadas 			feature.argsz - minsz);
112780c4b92aSYishai Hadas 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
112880c4b92aSYishai Hadas 		return vfio_ioctl_device_feature_logging_report(
112980c4b92aSYishai Hadas 			device, feature.flags, arg->data,
113080c4b92aSYishai Hadas 			feature.argsz - minsz);
11314e016f96SYishai Hadas 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
11324e016f96SYishai Hadas 		return vfio_ioctl_device_feature_migration_data_size(
11334e016f96SYishai Hadas 			device, feature.flags, arg->data,
11344e016f96SYishai Hadas 			feature.argsz - minsz);
11350f3e72b5SJason Gunthorpe 	default:
11360f3e72b5SJason Gunthorpe 		if (unlikely(!device->ops->device_feature))
11370f3e72b5SJason Gunthorpe 			return -EINVAL;
11380f3e72b5SJason Gunthorpe 		return device->ops->device_feature(device, feature.flags,
11390f3e72b5SJason Gunthorpe 						   arg->data,
11400f3e72b5SJason Gunthorpe 						   feature.argsz - minsz);
11410f3e72b5SJason Gunthorpe 	}
11420f3e72b5SJason Gunthorpe }
11430f3e72b5SJason Gunthorpe 
/*
 * Main ioctl entry for an open vfio device file.  Rejects access until
 * the open handshake has completed, then dispatches VFIO_DEVICE_FEATURE
 * in the core and everything else to the driver.
 */
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	/* Keep the device runtime-resumed for the duration of the ioctl. */
	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}
11760f3e72b5SJason Gunthorpe 
11770f3e72b5SJason Gunthorpe static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
11780f3e72b5SJason Gunthorpe 				     size_t count, loff_t *ppos)
11790f3e72b5SJason Gunthorpe {
1180b1a3b5c6SYi Liu 	struct vfio_device_file *df = filep->private_data;
1181b1a3b5c6SYi Liu 	struct vfio_device *device = df->device;
11820f3e72b5SJason Gunthorpe 
118382d93f58SYi Liu 	/* Paired with smp_store_release() following vfio_df_open() */
118482d93f58SYi Liu 	if (!smp_load_acquire(&df->access_granted))
118582d93f58SYi Liu 		return -EINVAL;
118682d93f58SYi Liu 
11870f3e72b5SJason Gunthorpe 	if (unlikely(!device->ops->read))
11880f3e72b5SJason Gunthorpe 		return -EINVAL;
11890f3e72b5SJason Gunthorpe 
11900f3e72b5SJason Gunthorpe 	return device->ops->read(device, buf, count, ppos);
11910f3e72b5SJason Gunthorpe }
11920f3e72b5SJason Gunthorpe 
11930f3e72b5SJason Gunthorpe static ssize_t vfio_device_fops_write(struct file *filep,
11940f3e72b5SJason Gunthorpe 				      const char __user *buf,
11950f3e72b5SJason Gunthorpe 				      size_t count, loff_t *ppos)
11960f3e72b5SJason Gunthorpe {
1197b1a3b5c6SYi Liu 	struct vfio_device_file *df = filep->private_data;
1198b1a3b5c6SYi Liu 	struct vfio_device *device = df->device;
11990f3e72b5SJason Gunthorpe 
120082d93f58SYi Liu 	/* Paired with smp_store_release() following vfio_df_open() */
120182d93f58SYi Liu 	if (!smp_load_acquire(&df->access_granted))
120282d93f58SYi Liu 		return -EINVAL;
120382d93f58SYi Liu 
12040f3e72b5SJason Gunthorpe 	if (unlikely(!device->ops->write))
12050f3e72b5SJason Gunthorpe 		return -EINVAL;
12060f3e72b5SJason Gunthorpe 
12070f3e72b5SJason Gunthorpe 	return device->ops->write(device, buf, count, ppos);
12080f3e72b5SJason Gunthorpe }
12090f3e72b5SJason Gunthorpe 
12100f3e72b5SJason Gunthorpe static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
12110f3e72b5SJason Gunthorpe {
1212b1a3b5c6SYi Liu 	struct vfio_device_file *df = filep->private_data;
1213b1a3b5c6SYi Liu 	struct vfio_device *device = df->device;
12140f3e72b5SJason Gunthorpe 
121582d93f58SYi Liu 	/* Paired with smp_store_release() following vfio_df_open() */
121682d93f58SYi Liu 	if (!smp_load_acquire(&df->access_granted))
121782d93f58SYi Liu 		return -EINVAL;
121882d93f58SYi Liu 
12190f3e72b5SJason Gunthorpe 	if (unlikely(!device->ops->mmap))
12200f3e72b5SJason Gunthorpe 		return -EINVAL;
12210f3e72b5SJason Gunthorpe 
12220f3e72b5SJason Gunthorpe 	return device->ops->mmap(device, vma);
12230f3e72b5SJason Gunthorpe }
12240f3e72b5SJason Gunthorpe 
12259eefba80SYi Liu const struct file_operations vfio_device_fops = {
12260f3e72b5SJason Gunthorpe 	.owner		= THIS_MODULE,
12278b6f173aSYi Liu 	.open		= vfio_device_fops_cdev_open,
12280f3e72b5SJason Gunthorpe 	.release	= vfio_device_fops_release,
12290f3e72b5SJason Gunthorpe 	.read		= vfio_device_fops_read,
12300f3e72b5SJason Gunthorpe 	.write		= vfio_device_fops_write,
12310f3e72b5SJason Gunthorpe 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
12320f3e72b5SJason Gunthorpe 	.compat_ioctl	= compat_ptr_ioctl,
12330f3e72b5SJason Gunthorpe 	.mmap		= vfio_device_fops_mmap,
12340f3e72b5SJason Gunthorpe };
12350f3e72b5SJason Gunthorpe 
123634aeeecdSYi Liu static struct vfio_device *vfio_device_from_file(struct file *file)
123734aeeecdSYi Liu {
123834aeeecdSYi Liu 	struct vfio_device_file *df = file->private_data;
123934aeeecdSYi Liu 
124034aeeecdSYi Liu 	if (file->f_op != &vfio_device_fops)
124134aeeecdSYi Liu 		return NULL;
124234aeeecdSYi Liu 	return df->device;
124334aeeecdSYi Liu }
124434aeeecdSYi Liu 
1245b1a59be8SYi Liu /**
1246b1a59be8SYi Liu  * vfio_file_is_valid - True if the file is valid vfio file
1247b1a59be8SYi Liu  * @file: VFIO group file or VFIO device file
1248b1a59be8SYi Liu  */
1249b1a59be8SYi Liu bool vfio_file_is_valid(struct file *file)
1250b1a59be8SYi Liu {
125134aeeecdSYi Liu 	return vfio_group_from_file(file) ||
125234aeeecdSYi Liu 	       vfio_device_from_file(file);
1253b1a59be8SYi Liu }
1254b1a59be8SYi Liu EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1255b1a59be8SYi Liu 
1256b1a59be8SYi Liu /**
1257b1a59be8SYi Liu  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1258b1a59be8SYi Liu  *        is always CPU cache coherent
1259b1a59be8SYi Liu  * @file: VFIO group file or VFIO device file
1260b1a59be8SYi Liu  *
1261b1a59be8SYi Liu  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1262b1a59be8SYi Liu  * bit in DMA transactions. A return of false indicates that the user has
1263b1a59be8SYi Liu  * rights to access additional instructions such as wbinvd on x86.
1264b1a59be8SYi Liu  */
1265b1a59be8SYi Liu bool vfio_file_enforced_coherent(struct file *file)
1266b1a59be8SYi Liu {
126734aeeecdSYi Liu 	struct vfio_device *device;
1268b1a59be8SYi Liu 	struct vfio_group *group;
1269b1a59be8SYi Liu 
1270b1a59be8SYi Liu 	group = vfio_group_from_file(file);
1271b1a59be8SYi Liu 	if (group)
1272b1a59be8SYi Liu 		return vfio_group_enforced_coherent(group);
1273b1a59be8SYi Liu 
127434aeeecdSYi Liu 	device = vfio_device_from_file(file);
127534aeeecdSYi Liu 	if (device)
127634aeeecdSYi Liu 		return device_iommu_capable(device->dev,
127734aeeecdSYi Liu 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
127834aeeecdSYi Liu 
1279b1a59be8SYi Liu 	return true;
1280b1a59be8SYi Liu }
1281b1a59be8SYi Liu EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1282b1a59be8SYi Liu 
128334aeeecdSYi Liu static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
128434aeeecdSYi Liu {
128534aeeecdSYi Liu 	struct vfio_device_file *df = file->private_data;
128634aeeecdSYi Liu 
128734aeeecdSYi Liu 	/*
128834aeeecdSYi Liu 	 * The kvm is first recorded in the vfio_device_file, and will
128934aeeecdSYi Liu 	 * be propagated to vfio_device::kvm when the file is bound to
129034aeeecdSYi Liu 	 * iommufd successfully in the vfio device cdev path.
129134aeeecdSYi Liu 	 */
129234aeeecdSYi Liu 	spin_lock(&df->kvm_ref_lock);
129334aeeecdSYi Liu 	df->kvm = kvm;
129434aeeecdSYi Liu 	spin_unlock(&df->kvm_ref_lock);
129534aeeecdSYi Liu }
129634aeeecdSYi Liu 
/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = vfio_group_from_file(file);

	/* A file is either a group file or a device cdev file, never both. */
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1317b1a59be8SYi Liu 
13180f3e72b5SJason Gunthorpe /*
13190f3e72b5SJason Gunthorpe  * Sub-module support
13200f3e72b5SJason Gunthorpe  */
13210f3e72b5SJason Gunthorpe /*
13220f3e72b5SJason Gunthorpe  * Helper for managing a buffer of info chain capabilities, allocate or
13230f3e72b5SJason Gunthorpe  * reallocate a buffer with additional @size, filling in @id and @version
13240f3e72b5SJason Gunthorpe  * of the capability.  A pointer to the new capability is returned.
13250f3e72b5SJason Gunthorpe  *
13260f3e72b5SJason Gunthorpe  * NB. The chain is based at the head of the buffer, so new entries are
13270f3e72b5SJason Gunthorpe  * added to the tail, vfio_info_cap_shift() should be called to fixup the
13280f3e72b5SJason Gunthorpe  * next offsets prior to copying to the user buffer.
13290f3e72b5SJason Gunthorpe  */
13300f3e72b5SJason Gunthorpe struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
13310f3e72b5SJason Gunthorpe 					       size_t size, u16 id, u16 version)
13320f3e72b5SJason Gunthorpe {
13330f3e72b5SJason Gunthorpe 	void *buf;
13340f3e72b5SJason Gunthorpe 	struct vfio_info_cap_header *header, *tmp;
13350f3e72b5SJason Gunthorpe 
13360f3e72b5SJason Gunthorpe 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
13370f3e72b5SJason Gunthorpe 	if (!buf) {
13380f3e72b5SJason Gunthorpe 		kfree(caps->buf);
13390f3e72b5SJason Gunthorpe 		caps->buf = NULL;
13400f3e72b5SJason Gunthorpe 		caps->size = 0;
13410f3e72b5SJason Gunthorpe 		return ERR_PTR(-ENOMEM);
13420f3e72b5SJason Gunthorpe 	}
13430f3e72b5SJason Gunthorpe 
13440f3e72b5SJason Gunthorpe 	caps->buf = buf;
13450f3e72b5SJason Gunthorpe 	header = buf + caps->size;
13460f3e72b5SJason Gunthorpe 
13470f3e72b5SJason Gunthorpe 	/* Eventually copied to user buffer, zero */
13480f3e72b5SJason Gunthorpe 	memset(header, 0, size);
13490f3e72b5SJason Gunthorpe 
13500f3e72b5SJason Gunthorpe 	header->id = id;
13510f3e72b5SJason Gunthorpe 	header->version = version;
13520f3e72b5SJason Gunthorpe 
13530f3e72b5SJason Gunthorpe 	/* Add to the end of the capability chain */
13540f3e72b5SJason Gunthorpe 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
13550f3e72b5SJason Gunthorpe 		; /* nothing */
13560f3e72b5SJason Gunthorpe 
13570f3e72b5SJason Gunthorpe 	tmp->next = caps->size;
13580f3e72b5SJason Gunthorpe 	caps->size += size;
13590f3e72b5SJason Gunthorpe 
13600f3e72b5SJason Gunthorpe 	return header;
13610f3e72b5SJason Gunthorpe }
13620f3e72b5SJason Gunthorpe EXPORT_SYMBOL_GPL(vfio_info_cap_add);
13630f3e72b5SJason Gunthorpe 
13640f3e72b5SJason Gunthorpe void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
13650f3e72b5SJason Gunthorpe {
13660f3e72b5SJason Gunthorpe 	struct vfio_info_cap_header *tmp;
13670f3e72b5SJason Gunthorpe 	void *buf = (void *)caps->buf;
13680f3e72b5SJason Gunthorpe 
13690f3e72b5SJason Gunthorpe 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
13700f3e72b5SJason Gunthorpe 		tmp->next += offset;
13710f3e72b5SJason Gunthorpe }
13720f3e72b5SJason Gunthorpe EXPORT_SYMBOL(vfio_info_cap_shift);
13730f3e72b5SJason Gunthorpe 
13740f3e72b5SJason Gunthorpe int vfio_info_add_capability(struct vfio_info_cap *caps,
13750f3e72b5SJason Gunthorpe 			     struct vfio_info_cap_header *cap, size_t size)
13760f3e72b5SJason Gunthorpe {
13770f3e72b5SJason Gunthorpe 	struct vfio_info_cap_header *header;
13780f3e72b5SJason Gunthorpe 
13790f3e72b5SJason Gunthorpe 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
13800f3e72b5SJason Gunthorpe 	if (IS_ERR(header))
13810f3e72b5SJason Gunthorpe 		return PTR_ERR(header);
13820f3e72b5SJason Gunthorpe 
13830f3e72b5SJason Gunthorpe 	memcpy(header + 1, cap + 1, size - sizeof(*header));
13840f3e72b5SJason Gunthorpe 
13850f3e72b5SJason Gunthorpe 	return 0;
13860f3e72b5SJason Gunthorpe }
13870f3e72b5SJason Gunthorpe EXPORT_SYMBOL(vfio_info_add_capability);
13880f3e72b5SJason Gunthorpe 
13890f3e72b5SJason Gunthorpe int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
13900f3e72b5SJason Gunthorpe 				       int max_irq_type, size_t *data_size)
13910f3e72b5SJason Gunthorpe {
13920f3e72b5SJason Gunthorpe 	unsigned long minsz;
13930f3e72b5SJason Gunthorpe 	size_t size;
13940f3e72b5SJason Gunthorpe 
13950f3e72b5SJason Gunthorpe 	minsz = offsetofend(struct vfio_irq_set, count);
13960f3e72b5SJason Gunthorpe 
13970f3e72b5SJason Gunthorpe 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
13980f3e72b5SJason Gunthorpe 	    (hdr->count >= (U32_MAX - hdr->start)) ||
13990f3e72b5SJason Gunthorpe 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
14000f3e72b5SJason Gunthorpe 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
14010f3e72b5SJason Gunthorpe 		return -EINVAL;
14020f3e72b5SJason Gunthorpe 
14030f3e72b5SJason Gunthorpe 	if (data_size)
14040f3e72b5SJason Gunthorpe 		*data_size = 0;
14050f3e72b5SJason Gunthorpe 
14060f3e72b5SJason Gunthorpe 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
14070f3e72b5SJason Gunthorpe 		return -EINVAL;
14080f3e72b5SJason Gunthorpe 
14090f3e72b5SJason Gunthorpe 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
14100f3e72b5SJason Gunthorpe 	case VFIO_IRQ_SET_DATA_NONE:
14110f3e72b5SJason Gunthorpe 		size = 0;
14120f3e72b5SJason Gunthorpe 		break;
14130f3e72b5SJason Gunthorpe 	case VFIO_IRQ_SET_DATA_BOOL:
14140f3e72b5SJason Gunthorpe 		size = sizeof(uint8_t);
14150f3e72b5SJason Gunthorpe 		break;
14160f3e72b5SJason Gunthorpe 	case VFIO_IRQ_SET_DATA_EVENTFD:
14170f3e72b5SJason Gunthorpe 		size = sizeof(int32_t);
14180f3e72b5SJason Gunthorpe 		break;
14190f3e72b5SJason Gunthorpe 	default:
14200f3e72b5SJason Gunthorpe 		return -EINVAL;
14210f3e72b5SJason Gunthorpe 	}
14220f3e72b5SJason Gunthorpe 
14230f3e72b5SJason Gunthorpe 	if (size) {
14240f3e72b5SJason Gunthorpe 		if (hdr->argsz - minsz < hdr->count * size)
14250f3e72b5SJason Gunthorpe 			return -EINVAL;
14260f3e72b5SJason Gunthorpe 
14270f3e72b5SJason Gunthorpe 		if (!data_size)
14280f3e72b5SJason Gunthorpe 			return -EINVAL;
14290f3e72b5SJason Gunthorpe 
14300f3e72b5SJason Gunthorpe 		*data_size = hdr->count * size;
14310f3e72b5SJason Gunthorpe 	}
14320f3e72b5SJason Gunthorpe 
14330f3e72b5SJason Gunthorpe 	return 0;
14340f3e72b5SJason Gunthorpe }
14350f3e72b5SJason Gunthorpe EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
14360f3e72b5SJason Gunthorpe 
14370f3e72b5SJason Gunthorpe /*
14384741f2e9SJason Gunthorpe  * Pin contiguous user pages and return their associated host pages for local
14394741f2e9SJason Gunthorpe  * domain only.
14404741f2e9SJason Gunthorpe  * @device [in]  : device
14414741f2e9SJason Gunthorpe  * @iova [in]    : starting IOVA of user pages to be pinned.
14424741f2e9SJason Gunthorpe  * @npage [in]   : count of pages to be pinned.  This count should not
14434741f2e9SJason Gunthorpe  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
14444741f2e9SJason Gunthorpe  * @prot [in]    : protection flags
14454741f2e9SJason Gunthorpe  * @pages[out]   : array of host pages
14464741f2e9SJason Gunthorpe  * Return error or number of pages pinned.
14474741f2e9SJason Gunthorpe  *
14484741f2e9SJason Gunthorpe  * A driver may only call this function if the vfio_device was created
14498da7a0e7SYi Liu  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
14504741f2e9SJason Gunthorpe  */
14514741f2e9SJason Gunthorpe int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
14524741f2e9SJason Gunthorpe 		   int npage, int prot, struct page **pages)
14534741f2e9SJason Gunthorpe {
14544741f2e9SJason Gunthorpe 	/* group->container cannot change while a vfio device is open */
14554741f2e9SJason Gunthorpe 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
14564741f2e9SJason Gunthorpe 		return -EINVAL;
14578da7a0e7SYi Liu 	if (vfio_device_has_container(device))
14588da7a0e7SYi Liu 		return vfio_device_container_pin_pages(device, iova,
14598da7a0e7SYi Liu 						       npage, prot, pages);
14604741f2e9SJason Gunthorpe 	if (device->iommufd_access) {
14614741f2e9SJason Gunthorpe 		int ret;
14624741f2e9SJason Gunthorpe 
14634741f2e9SJason Gunthorpe 		if (iova > ULONG_MAX)
14644741f2e9SJason Gunthorpe 			return -EINVAL;
14654741f2e9SJason Gunthorpe 		/*
14664741f2e9SJason Gunthorpe 		 * VFIO ignores the sub page offset, npages is from the start of
14674741f2e9SJason Gunthorpe 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
14684741f2e9SJason Gunthorpe 		 * the sub page offset by doing:
14694741f2e9SJason Gunthorpe 		 *     pages[0] + (iova % PAGE_SIZE)
14704741f2e9SJason Gunthorpe 		 */
14714741f2e9SJason Gunthorpe 		ret = iommufd_access_pin_pages(
14724741f2e9SJason Gunthorpe 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
14734741f2e9SJason Gunthorpe 			npage * PAGE_SIZE, pages,
14744741f2e9SJason Gunthorpe 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
14754741f2e9SJason Gunthorpe 		if (ret)
14764741f2e9SJason Gunthorpe 			return ret;
14774741f2e9SJason Gunthorpe 		return npage;
14784741f2e9SJason Gunthorpe 	}
14794741f2e9SJason Gunthorpe 	return -EINVAL;
14804741f2e9SJason Gunthorpe }
14814741f2e9SJason Gunthorpe EXPORT_SYMBOL(vfio_pin_pages);
14824741f2e9SJason Gunthorpe 
14834741f2e9SJason Gunthorpe /*
14844741f2e9SJason Gunthorpe  * Unpin contiguous host pages for local domain only.
14854741f2e9SJason Gunthorpe  * @device [in]  : device
14864741f2e9SJason Gunthorpe  * @iova [in]    : starting address of user pages to be unpinned.
14874741f2e9SJason Gunthorpe  * @npage [in]   : count of pages to be unpinned.  This count should not
14884741f2e9SJason Gunthorpe  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
14894741f2e9SJason Gunthorpe  */
14904741f2e9SJason Gunthorpe void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
14914741f2e9SJason Gunthorpe {
14924741f2e9SJason Gunthorpe 	if (WARN_ON(!vfio_assert_device_open(device)))
14934741f2e9SJason Gunthorpe 		return;
14944741f2e9SJason Gunthorpe 
14958da7a0e7SYi Liu 	if (vfio_device_has_container(device)) {
14968da7a0e7SYi Liu 		vfio_device_container_unpin_pages(device, iova, npage);
14974741f2e9SJason Gunthorpe 		return;
14984741f2e9SJason Gunthorpe 	}
14994741f2e9SJason Gunthorpe 	if (device->iommufd_access) {
15004741f2e9SJason Gunthorpe 		if (WARN_ON(iova > ULONG_MAX))
15014741f2e9SJason Gunthorpe 			return;
15024741f2e9SJason Gunthorpe 		iommufd_access_unpin_pages(device->iommufd_access,
15034741f2e9SJason Gunthorpe 					   ALIGN_DOWN(iova, PAGE_SIZE),
15044741f2e9SJason Gunthorpe 					   npage * PAGE_SIZE);
15054741f2e9SJason Gunthorpe 		return;
15064741f2e9SJason Gunthorpe 	}
15074741f2e9SJason Gunthorpe }
15084741f2e9SJason Gunthorpe EXPORT_SYMBOL(vfio_unpin_pages);
15094741f2e9SJason Gunthorpe 
15104741f2e9SJason Gunthorpe /*
15114741f2e9SJason Gunthorpe  * This interface allows the CPUs to perform some sort of virtual DMA on
15124741f2e9SJason Gunthorpe  * behalf of the device.
15134741f2e9SJason Gunthorpe  *
15144741f2e9SJason Gunthorpe  * CPUs read/write from/into a range of IOVAs pointing to user space memory
15154741f2e9SJason Gunthorpe  * into/from a kernel buffer.
15164741f2e9SJason Gunthorpe  *
15174741f2e9SJason Gunthorpe  * As the read/write of user space memory is conducted via the CPUs and is
15184741f2e9SJason Gunthorpe  * not a real device DMA, it is not necessary to pin the user space memory.
15194741f2e9SJason Gunthorpe  *
15204741f2e9SJason Gunthorpe  * @device [in]		: VFIO device
15214741f2e9SJason Gunthorpe  * @iova [in]		: base IOVA of a user space buffer
15224741f2e9SJason Gunthorpe  * @data [in]		: pointer to kernel buffer
15234741f2e9SJason Gunthorpe  * @len [in]		: kernel buffer length
15244741f2e9SJason Gunthorpe  * @write		: indicate read or write
15254741f2e9SJason Gunthorpe  * Return error code on failure or 0 on success.
15264741f2e9SJason Gunthorpe  */
15274741f2e9SJason Gunthorpe int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
15284741f2e9SJason Gunthorpe 		size_t len, bool write)
15294741f2e9SJason Gunthorpe {
15304741f2e9SJason Gunthorpe 	if (!data || len <= 0 || !vfio_assert_device_open(device))
15314741f2e9SJason Gunthorpe 		return -EINVAL;
15324741f2e9SJason Gunthorpe 
15338da7a0e7SYi Liu 	if (vfio_device_has_container(device))
15348da7a0e7SYi Liu 		return vfio_device_container_dma_rw(device, iova,
15354741f2e9SJason Gunthorpe 						    data, len, write);
15364741f2e9SJason Gunthorpe 
15374741f2e9SJason Gunthorpe 	if (device->iommufd_access) {
15384741f2e9SJason Gunthorpe 		unsigned int flags = 0;
15394741f2e9SJason Gunthorpe 
15404741f2e9SJason Gunthorpe 		if (iova > ULONG_MAX)
15414741f2e9SJason Gunthorpe 			return -EINVAL;
15424741f2e9SJason Gunthorpe 
15434741f2e9SJason Gunthorpe 		/* VFIO historically tries to auto-detect a kthread */
15444741f2e9SJason Gunthorpe 		if (!current->mm)
15454741f2e9SJason Gunthorpe 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
15464741f2e9SJason Gunthorpe 		if (write)
15474741f2e9SJason Gunthorpe 			flags |= IOMMUFD_ACCESS_RW_WRITE;
15484741f2e9SJason Gunthorpe 		return iommufd_access_rw(device->iommufd_access, iova, data,
15494741f2e9SJason Gunthorpe 					 len, flags);
15504741f2e9SJason Gunthorpe 	}
15514741f2e9SJason Gunthorpe 	return -EINVAL;
15524741f2e9SJason Gunthorpe }
15534741f2e9SJason Gunthorpe EXPORT_SYMBOL(vfio_dma_rw);
15544741f2e9SJason Gunthorpe 
15554741f2e9SJason Gunthorpe /*
15560f3e72b5SJason Gunthorpe  * Module/class support
15570f3e72b5SJason Gunthorpe  */
15581334e47eSYi Liu static int __init vfio_init(void)
15591334e47eSYi Liu {
15601334e47eSYi Liu 	int ret;
15611334e47eSYi Liu 
15621334e47eSYi Liu 	ida_init(&vfio.device_ida);
15631334e47eSYi Liu 
15641334e47eSYi Liu 	ret = vfio_group_init();
15651334e47eSYi Liu 	if (ret)
15661334e47eSYi Liu 		return ret;
15671334e47eSYi Liu 
1568e2d55709SJason Gunthorpe 	ret = vfio_virqfd_init();
1569e2d55709SJason Gunthorpe 	if (ret)
1570e2d55709SJason Gunthorpe 		goto err_virqfd;
1571e2d55709SJason Gunthorpe 
15721334e47eSYi Liu 	/* /sys/class/vfio-dev/vfioX */
15731aaba11dSGreg Kroah-Hartman 	vfio.device_class = class_create("vfio-dev");
15741334e47eSYi Liu 	if (IS_ERR(vfio.device_class)) {
15751334e47eSYi Liu 		ret = PTR_ERR(vfio.device_class);
15761334e47eSYi Liu 		goto err_dev_class;
15771334e47eSYi Liu 	}
15781334e47eSYi Liu 
15798b6f173aSYi Liu 	ret = vfio_cdev_init(vfio.device_class);
15808b6f173aSYi Liu 	if (ret)
15818b6f173aSYi Liu 		goto err_alloc_dev_chrdev;
15828b6f173aSYi Liu 
15831334e47eSYi Liu 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
15841334e47eSYi Liu 	return 0;
15851334e47eSYi Liu 
15868b6f173aSYi Liu err_alloc_dev_chrdev:
15878b6f173aSYi Liu 	class_destroy(vfio.device_class);
15888b6f173aSYi Liu 	vfio.device_class = NULL;
15891334e47eSYi Liu err_dev_class:
1590e2d55709SJason Gunthorpe 	vfio_virqfd_exit();
1591e2d55709SJason Gunthorpe err_virqfd:
15921334e47eSYi Liu 	vfio_group_cleanup();
15931334e47eSYi Liu 	return ret;
15941334e47eSYi Liu }
15951334e47eSYi Liu 
15961334e47eSYi Liu static void __exit vfio_cleanup(void)
15971334e47eSYi Liu {
15981334e47eSYi Liu 	ida_destroy(&vfio.device_ida);
15998b6f173aSYi Liu 	vfio_cdev_cleanup();
16003c28a761SYi Liu 	class_destroy(vfio.device_class);
16013c28a761SYi Liu 	vfio.device_class = NULL;
1602e2d55709SJason Gunthorpe 	vfio_virqfd_exit();
16031334e47eSYi Liu 	vfio_group_cleanup();
16040f3e72b5SJason Gunthorpe 	xa_destroy(&vfio_device_set_xa);
16050f3e72b5SJason Gunthorpe }
16060f3e72b5SJason Gunthorpe 
16070f3e72b5SJason Gunthorpe module_init(vfio_init);
16080f3e72b5SJason Gunthorpe module_exit(vfio_cleanup);
16090f3e72b5SJason Gunthorpe 
16100f3e72b5SJason Gunthorpe MODULE_VERSION(DRIVER_VERSION);
16110f3e72b5SJason Gunthorpe MODULE_LICENSE("GPL v2");
16120f3e72b5SJason Gunthorpe MODULE_AUTHOR(DRIVER_AUTHOR);
16130f3e72b5SJason Gunthorpe MODULE_DESCRIPTION(DRIVER_DESC);
16140f3e72b5SJason Gunthorpe MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1615