xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision 603c09f2)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 #include <linux/pm_runtime.h>
36 #include <linux/interval_tree.h>
37 #include <linux/iova_bitmap.h>
38 #include "vfio.h"
39 
40 #define DRIVER_VERSION	"0.3"
41 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
42 #define DRIVER_DESC	"VFIO - User Level meta-driver"
43 
44 static struct vfio {
45 	struct class			*class;
46 	struct list_head		iommu_drivers_list;
47 	struct mutex			iommu_drivers_lock;
48 	struct list_head		group_list;
49 	struct mutex			group_lock; /* locks group_list */
50 	struct ida			group_ida;
51 	dev_t				group_devt;
52 } vfio;
53 
54 struct vfio_iommu_driver {
55 	const struct vfio_iommu_driver_ops	*ops;
56 	struct list_head			vfio_next;
57 };
58 
59 struct vfio_container {
60 	struct kref			kref;
61 	struct list_head		group_list;
62 	struct rw_semaphore		group_lock;
63 	struct vfio_iommu_driver	*iommu_driver;
64 	void				*iommu_data;
65 	bool				noiommu;
66 };
67 
68 struct vfio_group {
69 	struct device			dev;
70 	struct cdev			cdev;
71 	refcount_t			users;
72 	unsigned int			container_users;
73 	struct iommu_group		*iommu_group;
74 	struct vfio_container		*container;
75 	struct list_head		device_list;
76 	struct mutex			device_lock;
77 	struct list_head		vfio_next;
78 	struct list_head		container_next;
79 	enum vfio_group_type		type;
80 	struct rw_semaphore		group_rwsem;
81 	struct kvm			*kvm;
82 	struct file			*opened_file;
83 	struct blocking_notifier_head	notifier;
84 };
85 
86 #ifdef CONFIG_VFIO_NOIOMMU
87 static bool noiommu __read_mostly;
88 module_param_named(enable_unsafe_noiommu_mode,
89 		   noiommu, bool, S_IRUGO | S_IWUSR);
90 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
91 #endif
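
/*
 * Illustrative usage (editor's note, not part of this file): since the
 * parameter above is writable (S_IRUGO | S_IWUSR), noiommu mode can be
 * enabled either at module load time or afterwards through sysfs:
 *
 *	modprobe vfio enable_unsafe_noiommu_mode=1
 *	echo 1 > /sys/module/vfio/parameters/enable_unsafe_noiommu_mode
 */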
92 
93 static DEFINE_XARRAY(vfio_device_set_xa);
94 static const struct file_operations vfio_group_fops;
95 
96 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
97 {
98 	unsigned long idx = (unsigned long)set_id;
99 	struct vfio_device_set *new_dev_set;
100 	struct vfio_device_set *dev_set;
101 
102 	if (WARN_ON(!set_id))
103 		return -EINVAL;
104 
105 	/*
106 	 * Atomically acquire a singleton object in the xarray for this set_id
107 	 */
108 	xa_lock(&vfio_device_set_xa);
109 	dev_set = xa_load(&vfio_device_set_xa, idx);
110 	if (dev_set)
111 		goto found_get_ref;
112 	xa_unlock(&vfio_device_set_xa);
113 
114 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
115 	if (!new_dev_set)
116 		return -ENOMEM;
117 	mutex_init(&new_dev_set->lock);
118 	INIT_LIST_HEAD(&new_dev_set->device_list);
119 	new_dev_set->set_id = set_id;
120 
121 	xa_lock(&vfio_device_set_xa);
122 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
123 			       GFP_KERNEL);
124 	if (!dev_set) {
125 		dev_set = new_dev_set;
126 		goto found_get_ref;
127 	}
128 
129 	kfree(new_dev_set);
130 	if (xa_is_err(dev_set)) {
131 		xa_unlock(&vfio_device_set_xa);
132 		return xa_err(dev_set);
133 	}
134 
135 found_get_ref:
136 	dev_set->device_count++;
137 	xa_unlock(&vfio_device_set_xa);
138 	mutex_lock(&dev_set->lock);
139 	device->dev_set = dev_set;
140 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
141 	mutex_unlock(&dev_set->lock);
142 	return 0;
143 }
144 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
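
/*
 * Example (editor's sketch, not part of this file): any stable pointer
 * works as @set_id for devices that must be handled together.  A PCI
 * driver, for instance, might key the set on the widest scope it can
 * reset, falling back from the slot to the whole bus:
 *
 *	if (pci_is_root_bus(pdev->bus))
 *		ret = vfio_assign_device_set(&vdev->vdev, vdev);
 *	else if (!pci_probe_reset_slot(pdev->slot))
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *	else
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 */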
145 
146 static void vfio_release_device_set(struct vfio_device *device)
147 {
148 	struct vfio_device_set *dev_set = device->dev_set;
149 
150 	if (!dev_set)
151 		return;
152 
153 	mutex_lock(&dev_set->lock);
154 	list_del(&device->dev_set_list);
155 	mutex_unlock(&dev_set->lock);
156 
157 	xa_lock(&vfio_device_set_xa);
158 	if (!--dev_set->device_count) {
159 		__xa_erase(&vfio_device_set_xa,
160 			   (unsigned long)dev_set->set_id);
161 		mutex_destroy(&dev_set->lock);
162 		kfree(dev_set);
163 	}
164 	xa_unlock(&vfio_device_set_xa);
165 }
166 
167 #ifdef CONFIG_VFIO_NOIOMMU
168 static void *vfio_noiommu_open(unsigned long arg)
169 {
170 	if (arg != VFIO_NOIOMMU_IOMMU)
171 		return ERR_PTR(-EINVAL);
172 	if (!capable(CAP_SYS_RAWIO))
173 		return ERR_PTR(-EPERM);
174 
175 	return NULL;
176 }
177 
178 static void vfio_noiommu_release(void *iommu_data)
179 {
180 }
181 
182 static long vfio_noiommu_ioctl(void *iommu_data,
183 			       unsigned int cmd, unsigned long arg)
184 {
185 	if (cmd == VFIO_CHECK_EXTENSION)
186 		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
187 
188 	return -ENOTTY;
189 }
190 
191 static int vfio_noiommu_attach_group(void *iommu_data,
192 		struct iommu_group *iommu_group, enum vfio_group_type type)
193 {
194 	return 0;
195 }
196 
197 static void vfio_noiommu_detach_group(void *iommu_data,
198 				      struct iommu_group *iommu_group)
199 {
200 }
201 
202 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
203 	.name = "vfio-noiommu",
204 	.owner = THIS_MODULE,
205 	.open = vfio_noiommu_open,
206 	.release = vfio_noiommu_release,
207 	.ioctl = vfio_noiommu_ioctl,
208 	.attach_group = vfio_noiommu_attach_group,
209 	.detach_group = vfio_noiommu_detach_group,
210 };
211 
212 /*
213  * Only noiommu containers can use vfio-noiommu and noiommu containers can only
214  * use vfio-noiommu.
215  */
216 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
217 		const struct vfio_iommu_driver *driver)
218 {
219 	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
220 }
221 #else
222 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
223 		const struct vfio_iommu_driver *driver)
224 {
225 	return true;
226 }
227 #endif /* CONFIG_VFIO_NOIOMMU */
228 
229 /*
230  * IOMMU driver registration
231  */
232 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
233 {
234 	struct vfio_iommu_driver *driver, *tmp;
235 
236 	if (WARN_ON(!ops->register_device != !ops->unregister_device))
237 		return -EINVAL;
238 
239 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
240 	if (!driver)
241 		return -ENOMEM;
242 
243 	driver->ops = ops;
244 
245 	mutex_lock(&vfio.iommu_drivers_lock);
246 
247 	/* Check for duplicates */
248 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
249 		if (tmp->ops == ops) {
250 			mutex_unlock(&vfio.iommu_drivers_lock);
251 			kfree(driver);
252 			return -EINVAL;
253 		}
254 	}
255 
256 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
257 
258 	mutex_unlock(&vfio.iommu_drivers_lock);
259 
260 	return 0;
261 }
262 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
263 
264 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
265 {
266 	struct vfio_iommu_driver *driver;
267 
268 	mutex_lock(&vfio.iommu_drivers_lock);
269 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
270 		if (driver->ops == ops) {
271 			list_del(&driver->vfio_next);
272 			mutex_unlock(&vfio.iommu_drivers_lock);
273 			kfree(driver);
274 			return;
275 		}
276 	}
277 	mutex_unlock(&vfio.iommu_drivers_lock);
278 }
279 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
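
/*
 * Example (editor's sketch, names hypothetical): an IOMMU backend module
 * pairs the two calls above in its init/exit paths, the way the type1
 * backend does:
 *
 *	static int __init my_iommu_backend_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 *	static void __exit my_iommu_backend_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_driver_ops);
 *	}
 */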
280 
281 static void vfio_group_get(struct vfio_group *group);
282 
283 /*
284  * Container objects - containers are created when /dev/vfio/vfio is
285  * opened, but their lifecycle extends until the last user is done, so
286  * it's freed via kref.  Must support container/group/device being
287  * closed in any order.
288  */
289 static void vfio_container_get(struct vfio_container *container)
290 {
291 	kref_get(&container->kref);
292 }
293 
294 static void vfio_container_release(struct kref *kref)
295 {
296 	struct vfio_container *container;

297 	container = container_of(kref, struct vfio_container, kref);
298 
299 	kfree(container);
300 }
301 
302 static void vfio_container_put(struct vfio_container *container)
303 {
304 	kref_put(&container->kref, vfio_container_release);
305 }
306 
307 /*
308  * Group objects - create, release, get, put, search
309  */
310 static struct vfio_group *
311 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
312 {
313 	struct vfio_group *group;
314 
315 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
316 		if (group->iommu_group == iommu_group) {
317 			vfio_group_get(group);
318 			return group;
319 		}
320 	}
321 	return NULL;
322 }
323 
324 static struct vfio_group *
325 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
326 {
327 	struct vfio_group *group;
328 
329 	mutex_lock(&vfio.group_lock);
330 	group = __vfio_group_get_from_iommu(iommu_group);
331 	mutex_unlock(&vfio.group_lock);
332 	return group;
333 }
334 
335 static void vfio_group_release(struct device *dev)
336 {
337 	struct vfio_group *group = container_of(dev, struct vfio_group, dev);
338 
339 	mutex_destroy(&group->device_lock);
340 	iommu_group_put(group->iommu_group);
341 	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
342 	kfree(group);
343 }
344 
345 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
346 					   enum vfio_group_type type)
347 {
348 	struct vfio_group *group;
349 	int minor;
350 
351 	group = kzalloc(sizeof(*group), GFP_KERNEL);
352 	if (!group)
353 		return ERR_PTR(-ENOMEM);
354 
355 	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
356 	if (minor < 0) {
357 		kfree(group);
358 		return ERR_PTR(minor);
359 	}
360 
361 	device_initialize(&group->dev);
362 	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
363 	group->dev.class = vfio.class;
364 	group->dev.release = vfio_group_release;
365 	cdev_init(&group->cdev, &vfio_group_fops);
366 	group->cdev.owner = THIS_MODULE;
367 
368 	refcount_set(&group->users, 1);
369 	init_rwsem(&group->group_rwsem);
370 	INIT_LIST_HEAD(&group->device_list);
371 	mutex_init(&group->device_lock);
372 	group->iommu_group = iommu_group;
373 	/* put in vfio_group_release() */
374 	iommu_group_ref_get(iommu_group);
375 	group->type = type;
376 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
377 
378 	return group;
379 }
380 
381 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
382 		enum vfio_group_type type)
383 {
384 	struct vfio_group *group;
385 	struct vfio_group *ret;
386 	int err;
387 
388 	group = vfio_group_alloc(iommu_group, type);
389 	if (IS_ERR(group))
390 		return group;
391 
392 	err = dev_set_name(&group->dev, "%s%d",
393 			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
394 			   iommu_group_id(iommu_group));
395 	if (err) {
396 		ret = ERR_PTR(err);
397 		goto err_put;
398 	}
399 
400 	mutex_lock(&vfio.group_lock);
401 
402 	/* Did we race creating this group? */
403 	ret = __vfio_group_get_from_iommu(iommu_group);
404 	if (ret)
405 		goto err_unlock;
406 
407 	err = cdev_device_add(&group->cdev, &group->dev);
408 	if (err) {
409 		ret = ERR_PTR(err);
410 		goto err_unlock;
411 	}
412 
413 	list_add(&group->vfio_next, &vfio.group_list);
414 
415 	mutex_unlock(&vfio.group_lock);
416 	return group;
417 
418 err_unlock:
419 	mutex_unlock(&vfio.group_lock);
420 err_put:
421 	put_device(&group->dev);
422 	return ret;
423 }
424 
425 static void vfio_group_put(struct vfio_group *group)
426 {
427 	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
428 		return;
429 
430 	/*
431 	 * These data structures all have paired operations that can only be
432 	 * undone when the caller holds a live reference on the group. Since all
433 	 * pairs must be undone these WARN_ON's indicate some caller did not
434 	 * properly hold the group reference.
435 	 */
436 	WARN_ON(!list_empty(&group->device_list));
437 	WARN_ON(group->container || group->container_users);
438 	WARN_ON(group->notifier.head);
439 
440 	list_del(&group->vfio_next);
441 	cdev_device_del(&group->cdev, &group->dev);
442 	mutex_unlock(&vfio.group_lock);
443 
444 	put_device(&group->dev);
445 }
446 
447 static void vfio_group_get(struct vfio_group *group)
448 {
449 	refcount_inc(&group->users);
450 }
451 
452 /*
453  * Device objects - create, release, get, put, search
454  */
455 /* Device reference always implies a group reference */
456 static void vfio_device_put(struct vfio_device *device)
457 {
458 	if (refcount_dec_and_test(&device->refcount))
459 		complete(&device->comp);
460 }
461 
462 static bool vfio_device_try_get(struct vfio_device *device)
463 {
464 	return refcount_inc_not_zero(&device->refcount);
465 }
466 
467 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
468 						 struct device *dev)
469 {
470 	struct vfio_device *device;
471 
472 	mutex_lock(&group->device_lock);
473 	list_for_each_entry(device, &group->device_list, group_next) {
474 		if (device->dev == dev && vfio_device_try_get(device)) {
475 			mutex_unlock(&group->device_lock);
476 			return device;
477 		}
478 	}
479 	mutex_unlock(&group->device_lock);
480 	return NULL;
481 }
482 
483 /*
484  * VFIO driver API
485  */
486 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
487 			 const struct vfio_device_ops *ops)
488 {
489 	init_completion(&device->comp);
490 	device->dev = dev;
491 	device->ops = ops;
492 }
493 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
494 
495 void vfio_uninit_group_dev(struct vfio_device *device)
496 {
497 	vfio_release_device_set(device);
498 }
499 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
500 
501 /* Release helper called by vfio_put_device() */
502 void vfio_device_release(struct kref *kref)
503 {
504 	struct vfio_device *device =
505 			container_of(kref, struct vfio_device, kref);
506 
507 	vfio_uninit_group_dev(device);
508 
509 	/*
510 	 * kvfree() cannot be done here due to a life cycle mess in
511 	 * vfio-ccw. Before the ccw part is fixed all drivers are
512 	 * required to support @release and call vfio_free_device()
513 	 * from there.
514 	 */
515 	device->ops->release(device);
516 }
517 EXPORT_SYMBOL_GPL(vfio_device_release);
518 
519 /*
520  * Allocate and initialize vfio_device so it can be registered to vfio
521  * core.
522  *
523  * Drivers should use the wrapper vfio_alloc_device() for allocation.
524  * @size is the size of the structure to be allocated, including any
525  * private data used by the driver.
526  *
527  * The driver may provide an @init callback to initialize device private data.
528  *
529  * Use vfio_put_device() to release the structure on a successful return.
530  */
531 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
532 				       const struct vfio_device_ops *ops)
533 {
534 	struct vfio_device *device;
535 	int ret;
536 
537 	if (WARN_ON(size < sizeof(struct vfio_device)))
538 		return ERR_PTR(-EINVAL);
539 
540 	device = kvzalloc(size, GFP_KERNEL);
541 	if (!device)
542 		return ERR_PTR(-ENOMEM);
543 
544 	ret = vfio_init_device(device, dev, ops);
545 	if (ret)
546 		goto out_free;
547 	return device;
548 
549 out_free:
550 	kvfree(device);
551 	return ERR_PTR(ret);
552 }
553 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
554 
555 /*
556  * Initialize a vfio_device so it can be registered to vfio core.
557  *
558  * Only the vfio-ccw driver should call this interface.
559  */
560 int vfio_init_device(struct vfio_device *device, struct device *dev,
561 		     const struct vfio_device_ops *ops)
562 {
563 	int ret;
564 
565 	vfio_init_group_dev(device, dev, ops);
566 
567 	if (ops->init) {
568 		ret = ops->init(device);
569 		if (ret)
570 			goto out_uninit;
571 	}
572 
573 	kref_init(&device->kref);
574 	return 0;
575 
576 out_uninit:
577 	vfio_uninit_group_dev(device);
578 	return ret;
579 }
580 EXPORT_SYMBOL_GPL(vfio_init_device);
581 
582 /*
583  * The helper called by a driver's @release callback to free the device
584  * structure. Drivers that have no private data to clean up can
585  * simply use this helper as their @release.
586  */
587 void vfio_free_device(struct vfio_device *device)
588 {
589 	kvfree(device);
590 }
591 EXPORT_SYMBOL_GPL(vfio_free_device);
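
/*
 * Example (editor's sketch, names hypothetical): a driver embeds
 * struct vfio_device in its own state, allocates through the
 * vfio_alloc_device() wrapper, and pairs that with a @release callback
 * ending in vfio_free_device():
 *
 *	struct my_vfio_device {
 *		struct vfio_device vdev;
 *		void __iomem *bar;
 *	};
 *
 *	my = vfio_alloc_device(my_vfio_device, vdev, &pdev->dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *
 *	static void my_release(struct vfio_device *vdev)
 *	{
 *		vfio_free_device(vdev);
 *	}
 */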
592 
593 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
594 		enum vfio_group_type type)
595 {
596 	struct iommu_group *iommu_group;
597 	struct vfio_group *group;
598 	int ret;
599 
600 	iommu_group = iommu_group_alloc();
601 	if (IS_ERR(iommu_group))
602 		return ERR_CAST(iommu_group);
603 
604 	ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
605 	if (ret)
606 		goto out_put_group;
607 	ret = iommu_group_add_device(iommu_group, dev);
608 	if (ret)
609 		goto out_put_group;
610 
611 	group = vfio_create_group(iommu_group, type);
612 	if (IS_ERR(group)) {
613 		ret = PTR_ERR(group);
614 		goto out_remove_device;
615 	}
616 	iommu_group_put(iommu_group);
617 	return group;
618 
619 out_remove_device:
620 	iommu_group_remove_device(dev);
621 out_put_group:
622 	iommu_group_put(iommu_group);
623 	return ERR_PTR(ret);
624 }
625 
626 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
627 {
628 	struct iommu_group *iommu_group;
629 	struct vfio_group *group;
630 
631 	iommu_group = iommu_group_get(dev);
632 #ifdef CONFIG_VFIO_NOIOMMU
633 	if (!iommu_group && noiommu) {
634 		/*
635 		 * With noiommu enabled, create an IOMMU group for devices that
636 		 * don't already have one, implying no IOMMU hardware/driver
637 		 * exists.  Taint the kernel because we're about to give a DMA
638 		 * capable device to a user without IOMMU protection.
639 		 */
640 		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
641 		if (!IS_ERR(group)) {
642 			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
643 			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
644 		}
645 		return group;
646 	}
647 #endif
648 	if (!iommu_group)
649 		return ERR_PTR(-EINVAL);
650 
651 	/*
652 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
653 	 * restore cache coherency. It has to be checked here because it is only
654 	 * valid for cases where we are using iommu groups.
655 	 */
656 	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
657 		iommu_group_put(iommu_group);
658 		return ERR_PTR(-EINVAL);
659 	}
660 
661 	group = vfio_group_get_from_iommu(iommu_group);
662 	if (!group)
663 		group = vfio_create_group(iommu_group, VFIO_IOMMU);
664 
665 	/* The vfio_group holds a reference to the iommu_group */
666 	iommu_group_put(iommu_group);
667 	return group;
668 }
669 
670 static int __vfio_register_dev(struct vfio_device *device,
671 		struct vfio_group *group)
672 {
673 	struct vfio_device *existing_device;
674 
675 	if (IS_ERR(group))
676 		return PTR_ERR(group);
677 
678 	/*
679 	 * If the driver doesn't specify a set then the device is added to a
680 	 * singleton set just for itself.
681 	 */
682 	if (!device->dev_set)
683 		vfio_assign_device_set(device, device);
684 
685 	existing_device = vfio_group_get_device(group, device->dev);
686 	if (existing_device) {
687 		dev_WARN(device->dev, "Device already exists on group %d\n",
688 			 iommu_group_id(group->iommu_group));
689 		vfio_device_put(existing_device);
690 		if (group->type == VFIO_NO_IOMMU ||
691 		    group->type == VFIO_EMULATED_IOMMU)
692 			iommu_group_remove_device(device->dev);
693 		vfio_group_put(group);
694 		return -EBUSY;
695 	}
696 
697 	/* Our reference on group is moved to the device */
698 	device->group = group;
699 
700 	/* Refcounting can't start until the driver calls register */
701 	refcount_set(&device->refcount, 1);
702 
703 	mutex_lock(&group->device_lock);
704 	list_add(&device->group_next, &group->device_list);
705 	mutex_unlock(&group->device_lock);
706 
707 	return 0;
708 }
709 
710 int vfio_register_group_dev(struct vfio_device *device)
711 {
712 	return __vfio_register_dev(device,
713 		vfio_group_find_or_alloc(device->dev));
714 }
715 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
716 
717 /*
718  * Register a virtual device without IOMMU backing.  The user of this
719  * device must not be able to directly trigger unmediated DMA.
720  */
721 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
722 {
723 	return __vfio_register_dev(device,
724 		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
725 }
726 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
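
/*
 * Example (editor's sketch, names hypothetical): registration is the
 * last step of a typical probe and is undone from remove; the device's
 * refcount only becomes live once it is registered:
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		...
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret)
 *			goto err_put;
 *		pci_set_drvdata(pdev, my);
 *		return 0;
 *	}
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_vfio_device *my = pci_get_drvdata(pdev);
 *
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_put_device(&my->vdev);
 *	}
 */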
727 
728 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
729 						     char *buf)
730 {
731 	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
732 
733 	mutex_lock(&group->device_lock);
734 	list_for_each_entry(it, &group->device_list, group_next) {
735 		int ret;
736 
737 		if (it->ops->match) {
738 			ret = it->ops->match(it, buf);
739 			if (ret < 0) {
740 				device = ERR_PTR(ret);
741 				break;
742 			}
743 		} else {
744 			ret = !strcmp(dev_name(it->dev), buf);
745 		}
746 
747 		if (ret && vfio_device_try_get(it)) {
748 			device = it;
749 			break;
750 		}
751 	}
752 	mutex_unlock(&group->device_lock);
753 
754 	return device;
755 }
756 
757 /*
758  * Decrement the device reference count and wait for the device to be
759  * removed.  Open file descriptors for the device keep it busy; this
 * function blocks, periodically re-issuing the ->request() callback,
 * until all such users release the device. */
760 void vfio_unregister_group_dev(struct vfio_device *device)
761 {
762 	struct vfio_group *group = device->group;
763 	unsigned int i = 0;
764 	bool interrupted = false;
765 	long rc;
766 
767 	vfio_device_put(device);
768 	rc = try_wait_for_completion(&device->comp);
769 	while (rc <= 0) {
770 		if (device->ops->request)
771 			device->ops->request(device, i++);
772 
773 		if (interrupted) {
774 			rc = wait_for_completion_timeout(&device->comp,
775 							 HZ * 10);
776 		} else {
777 			rc = wait_for_completion_interruptible_timeout(
778 				&device->comp, HZ * 10);
779 			if (rc < 0) {
780 				interrupted = true;
781 				dev_warn(device->dev,
782 					 "Device is currently in use, task \"%s\" (%d) blocked until device is released",
785 					 current->comm, task_pid_nr(current));
786 			}
787 		}
788 	}
789 
790 	mutex_lock(&group->device_lock);
791 	list_del(&device->group_next);
792 	mutex_unlock(&group->device_lock);
793 
794 	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
795 		iommu_group_remove_device(device->dev);
796 
797 	/* Matches the get in vfio_register_group_dev() */
798 	vfio_group_put(group);
799 }
800 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
801 
802 /*
803  * VFIO base fd, /dev/vfio/vfio
804  */
805 static long vfio_ioctl_check_extension(struct vfio_container *container,
806 				       unsigned long arg)
807 {
808 	struct vfio_iommu_driver *driver;
809 	long ret = 0;
810 
811 	down_read(&container->group_lock);
812 
813 	driver = container->iommu_driver;
814 
815 	switch (arg) {
816 		/* No base extensions yet */
817 	default:
818 		/*
819 		 * If no driver is set, poll all registered drivers for
820 		 * extensions and return the first positive result.  If
821 		 * a driver is already set, further queries will be passed
822 		 * only to that driver.
823 		 */
824 		if (!driver) {
825 			mutex_lock(&vfio.iommu_drivers_lock);
826 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
827 					    vfio_next) {
828 
829 				if (!list_empty(&container->group_list) &&
830 				    !vfio_iommu_driver_allowed(container,
831 							       driver))
832 					continue;
833 				if (!try_module_get(driver->ops->owner))
834 					continue;
835 
836 				ret = driver->ops->ioctl(NULL,
837 							 VFIO_CHECK_EXTENSION,
838 							 arg);
839 				module_put(driver->ops->owner);
840 				if (ret > 0)
841 					break;
842 			}
843 			mutex_unlock(&vfio.iommu_drivers_lock);
844 		} else {
845 			ret = driver->ops->ioctl(container->iommu_data,
846 						 VFIO_CHECK_EXTENSION, arg);
		}
847 	}
848 
849 	up_read(&container->group_lock);
850 
851 	return ret;
852 }
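
/*
 * Example (editor's sketch of the userspace side, following
 * Documentation/driver-api/vfio.rst): a user probes the container for a
 * supported IOMMU model before adding any group:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		errx(1, "unknown VFIO API version");
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		errx(1, "no type1 IOMMU support");
 */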
853 
854 /* hold write lock on container->group_lock */
855 static int __vfio_container_attach_groups(struct vfio_container *container,
856 					  struct vfio_iommu_driver *driver,
857 					  void *data)
858 {
859 	struct vfio_group *group;
860 	int ret = -ENODEV;
861 
862 	list_for_each_entry(group, &container->group_list, container_next) {
863 		ret = driver->ops->attach_group(data, group->iommu_group,
864 						group->type);
865 		if (ret)
866 			goto unwind;
867 	}
868 
869 	return ret;
870 
871 unwind:
872 	list_for_each_entry_continue_reverse(group, &container->group_list,
873 					     container_next) {
874 		driver->ops->detach_group(data, group->iommu_group);
875 	}
876 
877 	return ret;
878 }
879 
880 static long vfio_ioctl_set_iommu(struct vfio_container *container,
881 				 unsigned long arg)
882 {
883 	struct vfio_iommu_driver *driver;
884 	long ret = -ENODEV;
885 
886 	down_write(&container->group_lock);
887 
888 	/*
889 	 * The container is designed to be an unprivileged interface while
890 	 * the group can be assigned to specific users.  Therefore, only by
891 	 * adding a group to a container does the user get the privilege of
892 	 * enabling the iommu, which may allocate finite resources.  There
893 	 * is no unset_iommu, but by removing all the groups from a container,
894 	 * the container is deprivileged and returns to an unset state.
895 	 */
896 	if (list_empty(&container->group_list) || container->iommu_driver) {
897 		up_write(&container->group_lock);
898 		return -EINVAL;
899 	}
900 
901 	mutex_lock(&vfio.iommu_drivers_lock);
902 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
903 		void *data;
904 
905 		if (!vfio_iommu_driver_allowed(container, driver))
906 			continue;
907 		if (!try_module_get(driver->ops->owner))
908 			continue;
909 
910 		/*
911 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
912 		 * so test which iommu driver reported support for this
913 		 * extension and call open on it.  We also pass it the
914 		 * magic, allowing a single driver to support multiple
915 		 * interfaces if it'd like.
916 		 */
917 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
918 			module_put(driver->ops->owner);
919 			continue;
920 		}
921 
922 		data = driver->ops->open(arg);
923 		if (IS_ERR(data)) {
924 			ret = PTR_ERR(data);
925 			module_put(driver->ops->owner);
926 			continue;
927 		}
928 
929 		ret = __vfio_container_attach_groups(container, driver, data);
930 		if (ret) {
931 			driver->ops->release(data);
932 			module_put(driver->ops->owner);
933 			continue;
934 		}
935 
936 		container->iommu_driver = driver;
937 		container->iommu_data = data;
938 		break;
939 	}
940 
941 	mutex_unlock(&vfio.iommu_drivers_lock);
942 	up_write(&container->group_lock);
943 
944 	return ret;
945 }
946 
947 static long vfio_fops_unl_ioctl(struct file *filep,
948 				unsigned int cmd, unsigned long arg)
949 {
950 	struct vfio_container *container = filep->private_data;
951 	struct vfio_iommu_driver *driver;
952 	void *data;
953 	long ret = -EINVAL;
954 
955 	if (!container)
956 		return ret;
957 
958 	switch (cmd) {
959 	case VFIO_GET_API_VERSION:
960 		ret = VFIO_API_VERSION;
961 		break;
962 	case VFIO_CHECK_EXTENSION:
963 		ret = vfio_ioctl_check_extension(container, arg);
964 		break;
965 	case VFIO_SET_IOMMU:
966 		ret = vfio_ioctl_set_iommu(container, arg);
967 		break;
968 	default:
969 		driver = container->iommu_driver;
970 		data = container->iommu_data;
971 
972 		if (driver) /* passthrough all unrecognized ioctls */
973 			ret = driver->ops->ioctl(data, cmd, arg);
974 	}
975 
976 	return ret;
977 }
978 
979 static int vfio_fops_open(struct inode *inode, struct file *filep)
980 {
981 	struct vfio_container *container;
982 
983 	container = kzalloc(sizeof(*container), GFP_KERNEL);
984 	if (!container)
985 		return -ENOMEM;
986 
987 	INIT_LIST_HEAD(&container->group_list);
988 	init_rwsem(&container->group_lock);
989 	kref_init(&container->kref);
990 
991 	filep->private_data = container;
992 
993 	return 0;
994 }
995 
996 static int vfio_fops_release(struct inode *inode, struct file *filep)
997 {
998 	struct vfio_container *container = filep->private_data;
999 	struct vfio_iommu_driver *driver = container->iommu_driver;
1000 
1001 	if (driver && driver->ops->notify)
1002 		driver->ops->notify(container->iommu_data,
1003 				    VFIO_IOMMU_CONTAINER_CLOSE);
1004 
1005 	filep->private_data = NULL;
1006 
1007 	vfio_container_put(container);
1008 
1009 	return 0;
1010 }
1011 
1012 static const struct file_operations vfio_fops = {
1013 	.owner		= THIS_MODULE,
1014 	.open		= vfio_fops_open,
1015 	.release	= vfio_fops_release,
1016 	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1017 	.compat_ioctl	= compat_ptr_ioctl,
1018 };
1019 
1020 /*
1021  * VFIO Group fd, /dev/vfio/$GROUP
1022  */
1023 static void __vfio_group_unset_container(struct vfio_group *group)
1024 {
1025 	struct vfio_container *container = group->container;
1026 	struct vfio_iommu_driver *driver;
1027 
1028 	lockdep_assert_held_write(&group->group_rwsem);
1029 
1030 	down_write(&container->group_lock);
1031 
1032 	driver = container->iommu_driver;
1033 	if (driver)
1034 		driver->ops->detach_group(container->iommu_data,
1035 					  group->iommu_group);
1036 
1037 	if (group->type == VFIO_IOMMU)
1038 		iommu_group_release_dma_owner(group->iommu_group);
1039 
1040 	group->container = NULL;
1041 	group->container_users = 0;
1042 	list_del(&group->container_next);
1043 
1044 	/* Detaching the last group deprivileges a container, remove iommu */
1045 	if (driver && list_empty(&container->group_list)) {
1046 		driver->ops->release(container->iommu_data);
1047 		module_put(driver->ops->owner);
1048 		container->iommu_driver = NULL;
1049 		container->iommu_data = NULL;
1050 	}
1051 
1052 	up_write(&container->group_lock);
1053 
1054 	vfio_container_put(container);
1055 }
1056 
1057 /*
1058  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1059  * if there was no container to unset.  Since the ioctl is called on
1060  * the group, we know the group still exists, therefore the only valid
1061  * transition here is 1->0.
1062  */
1063 static int vfio_group_ioctl_unset_container(struct vfio_group *group)
1064 {
1065 	int ret = 0;
1066 
1067 	down_write(&group->group_rwsem);
1068 	if (!group->container) {
1069 		ret = -EINVAL;
1070 		goto out_unlock;
1071 	}
1072 	if (group->container_users != 1) {
1073 		ret = -EBUSY;
1074 		goto out_unlock;
1075 	}
1076 	__vfio_group_unset_container(group);
1077 
1078 out_unlock:
1079 	up_write(&group->group_rwsem);
1080 	return ret;
1081 }
1082 
1083 static int vfio_group_ioctl_set_container(struct vfio_group *group,
1084 					  int __user *arg)
1085 {
1086 	struct fd f;
1087 	struct vfio_container *container;
1088 	struct vfio_iommu_driver *driver;
1089 	int container_fd;
1090 	int ret = 0;
1091 
1092 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1093 		return -EPERM;
1094 
1095 	if (get_user(container_fd, arg))
1096 		return -EFAULT;
1097 	if (container_fd < 0)
1098 		return -EINVAL;
1099 	f = fdget(container_fd);
1100 	if (!f.file)
1101 		return -EBADF;
1102 
1103 	/* Sanity check, is this really our fd? */
1104 	if (f.file->f_op != &vfio_fops) {
1105 		ret = -EINVAL;
1106 		goto out_fdput;
1107 	}
1108 	container = f.file->private_data;
1109 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1110 
1111 	down_write(&group->group_rwsem);
1112 
1113 	if (group->container || WARN_ON(group->container_users)) {
1114 		ret = -EINVAL;
1115 		goto out_unlock_group;
1116 	}
1117 
1118 	down_write(&container->group_lock);
1119 
1120 	/* Real groups and fake groups cannot mix */
1121 	if (!list_empty(&container->group_list) &&
1122 	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
1123 		ret = -EPERM;
1124 		goto out_unlock_container;
1125 	}
1126 
1127 	if (group->type == VFIO_IOMMU) {
1128 		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
1129 		if (ret)
1130 			goto out_unlock_container;
1131 	}
1132 
1133 	driver = container->iommu_driver;
1134 	if (driver) {
1135 		ret = driver->ops->attach_group(container->iommu_data,
1136 						group->iommu_group,
1137 						group->type);
1138 		if (ret) {
1139 			if (group->type == VFIO_IOMMU)
1140 				iommu_group_release_dma_owner(
1141 					group->iommu_group);
1142 			goto out_unlock_container;
1143 		}
1144 	}
1145 
1146 	group->container = container;
1147 	group->container_users = 1;
1148 	container->noiommu = (group->type == VFIO_NO_IOMMU);
1149 	list_add(&group->container_next, &container->group_list);
1150 
1151 	/* Get a reference on the container and mark a user within the group */
1152 	vfio_container_get(container);
1153 
1154 out_unlock_container:
1155 	up_write(&container->group_lock);
1156 out_unlock_group:
1157 	up_write(&group->group_rwsem);
1158 out_fdput:
1159 	fdput(f);
1160 	return ret;
1161 }
1162 
1163 static const struct file_operations vfio_device_fops;
1164 
1165 /* true if the vfio_device has open_device() called but not close_device() */
1166 static bool vfio_assert_device_open(struct vfio_device *device)
1167 {
1168 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
1169 }
1170 
1171 static int vfio_device_assign_container(struct vfio_device *device)
1172 {
1173 	struct vfio_group *group = device->group;
1174 
1175 	lockdep_assert_held_write(&group->group_rwsem);
1176 
1177 	if (!group->container || !group->container->iommu_driver ||
1178 	    WARN_ON(!group->container_users))
1179 		return -EINVAL;
1180 
1181 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1182 		return -EPERM;
1183 
1184 	get_file(group->opened_file);
1185 	group->container_users++;
1186 	return 0;
1187 }
1188 
1189 static void vfio_device_unassign_container(struct vfio_device *device)
1190 {
1191 	down_write(&device->group->group_rwsem);
1192 	WARN_ON(device->group->container_users <= 1);
1193 	device->group->container_users--;
1194 	fput(device->group->opened_file);
1195 	up_write(&device->group->group_rwsem);
1196 }
1197 
1198 static struct file *vfio_device_open(struct vfio_device *device)
1199 {
1200 	struct vfio_iommu_driver *iommu_driver;
1201 	struct file *filep;
1202 	int ret;
1203 
1204 	down_write(&device->group->group_rwsem);
1205 	ret = vfio_device_assign_container(device);
1206 	up_write(&device->group->group_rwsem);
1207 	if (ret)
1208 		return ERR_PTR(ret);
1209 
1210 	if (!try_module_get(device->dev->driver->owner)) {
1211 		ret = -ENODEV;
1212 		goto err_unassign_container;
1213 	}
1214 
1215 	mutex_lock(&device->dev_set->lock);
1216 	device->open_count++;
1217 	if (device->open_count == 1) {
1218 		/*
1219 		 * Here we pass the KVM pointer with the group under the read
1220 		 * lock.  If the device driver will use it, it must obtain a
1221 		 * reference and release it during close_device.
1222 		 */
1223 		down_read(&device->group->group_rwsem);
1224 		device->kvm = device->group->kvm;
1225 
1226 		if (device->ops->open_device) {
1227 			ret = device->ops->open_device(device);
1228 			if (ret)
1229 				goto err_undo_count;
1230 		}
1231 
1232 		iommu_driver = device->group->container->iommu_driver;
1233 		if (iommu_driver && iommu_driver->ops->register_device)
1234 			iommu_driver->ops->register_device(
1235 				device->group->container->iommu_data, device);
1236 
1237 		up_read(&device->group->group_rwsem);
1238 	}
1239 	mutex_unlock(&device->dev_set->lock);
1240 
1241 	/*
1242 	 * We can't use anon_inode_getfd() because we need to modify
1243 	 * the f_mode flags directly to allow more than just ioctls
1244 	 */
1245 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1246 				   device, O_RDWR);
1247 	if (IS_ERR(filep)) {
1248 		ret = PTR_ERR(filep);
1249 		goto err_close_device;
1250 	}
1251 
1252 	/*
1253 	 * TODO: add an anon_inode interface to do this.
1254 	 * Appears to be missing by lack of need rather than
1255 	 * explicitly prevented.  Now there's need.
1256 	 */
1257 	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
1258 
1259 	if (device->group->type == VFIO_NO_IOMMU)
1260 		dev_warn(device->dev, "vfio-noiommu device opened by user (%s:%d)\n",
1261 			 current->comm, task_pid_nr(current));
1262 	/*
1263 	 * On success the ref of device is moved to the file and
1264 	 * put in vfio_device_fops_release()
1265 	 */
1266 	return filep;
1267 
1268 err_close_device:
1269 	mutex_lock(&device->dev_set->lock);
1270 	down_read(&device->group->group_rwsem);
1271 	if (device->open_count == 1 && device->ops->close_device) {
1272 		device->ops->close_device(device);
1273 
1274 		iommu_driver = device->group->container->iommu_driver;
1275 		if (iommu_driver && iommu_driver->ops->unregister_device)
1276 			iommu_driver->ops->unregister_device(
1277 				device->group->container->iommu_data, device);
1278 	}
1279 err_undo_count:
1280 	up_read(&device->group->group_rwsem);
1281 	device->open_count--;
1282 	if (device->open_count == 0 && device->kvm)
1283 		device->kvm = NULL;
1284 	mutex_unlock(&device->dev_set->lock);
1285 	module_put(device->dev->driver->owner);
1286 err_unassign_container:
1287 	vfio_device_unassign_container(device);
1288 	return ERR_PTR(ret);
1289 }
1290 
1291 static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
1292 					  char __user *arg)
1293 {
1294 	struct vfio_device *device;
1295 	struct file *filep;
1296 	char *buf;
1297 	int fdno;
1298 	int ret;
1299 
1300 	buf = strndup_user(arg, PAGE_SIZE);
1301 	if (IS_ERR(buf))
1302 		return PTR_ERR(buf);
1303 
1304 	device = vfio_device_get_from_name(group, buf);
1305 	kfree(buf);
1306 	if (IS_ERR(device))
1307 		return PTR_ERR(device);
1308 
1309 	fdno = get_unused_fd_flags(O_CLOEXEC);
1310 	if (fdno < 0) {
1311 		ret = fdno;
1312 		goto err_put_device;
1313 	}
1314 
1315 	filep = vfio_device_open(device);
1316 	if (IS_ERR(filep)) {
1317 		ret = PTR_ERR(filep);
1318 		goto err_put_fdno;
1319 	}
1320 
1321 	fd_install(fdno, filep);
1322 	return fdno;
1323 
1324 err_put_fdno:
1325 	put_unused_fd(fdno);
1326 err_put_device:
1327 	vfio_device_put(device);
1328 	return ret;
1329 }
1330 
1331 static int vfio_group_ioctl_get_status(struct vfio_group *group,
1332 				       struct vfio_group_status __user *arg)
1333 {
1334 	unsigned long minsz = offsetofend(struct vfio_group_status, flags);
1335 	struct vfio_group_status status;
1336 
1337 	if (copy_from_user(&status, arg, minsz))
1338 		return -EFAULT;
1339 
1340 	if (status.argsz < minsz)
1341 		return -EINVAL;
1342 
1343 	status.flags = 0;
1344 
1345 	down_read(&group->group_rwsem);
1346 	if (group->container)
1347 		status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
1348 				VFIO_GROUP_FLAGS_VIABLE;
1349 	else if (!iommu_group_dma_owner_claimed(group->iommu_group))
1350 		status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1351 	up_read(&group->group_rwsem);
1352 
1353 	if (copy_to_user(arg, &status, minsz))
1354 		return -EFAULT;
1355 	return 0;
1356 }
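
/*
 * Example (editor's sketch of the userspace side, following
 * Documentation/driver-api/vfio.rst; the group number and device name
 * are made up): the group is checked for viability, bound to a
 * container, and only then can a device fd be requested:
 *
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		errx(1, "group not viable, are all its devices bound to vfio?");
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */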
1357 
1358 static long vfio_group_fops_unl_ioctl(struct file *filep,
1359 				      unsigned int cmd, unsigned long arg)
1360 {
1361 	struct vfio_group *group = filep->private_data;
1362 	void __user *uarg = (void __user *)arg;
1363 
1364 	switch (cmd) {
1365 	case VFIO_GROUP_GET_DEVICE_FD:
1366 		return vfio_group_ioctl_get_device_fd(group, uarg);
1367 	case VFIO_GROUP_GET_STATUS:
1368 		return vfio_group_ioctl_get_status(group, uarg);
1369 	case VFIO_GROUP_SET_CONTAINER:
1370 		return vfio_group_ioctl_set_container(group, uarg);
1371 	case VFIO_GROUP_UNSET_CONTAINER:
1372 		return vfio_group_ioctl_unset_container(group);
1373 	default:
1374 		return -ENOTTY;
1375 	}
1376 }
1377 
1378 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1379 {
1380 	struct vfio_group *group =
1381 		container_of(inode->i_cdev, struct vfio_group, cdev);
1382 	int ret;
1383 
1384 	down_write(&group->group_rwsem);
1385 
1386 	/* users can be zero if this races with vfio_group_put() */
1387 	if (!refcount_inc_not_zero(&group->users)) {
1388 		ret = -ENODEV;
1389 		goto err_unlock;
1390 	}
1391 
1392 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1393 		ret = -EPERM;
1394 		goto err_put;
1395 	}
1396 
1397 	/*
1398 	 * Do we need multiple instances of the group open?  Seems not.
1399 	 */
1400 	if (group->opened_file) {
1401 		ret = -EBUSY;
1402 		goto err_put;
1403 	}
1404 	group->opened_file = filep;
1405 	filep->private_data = group;
1406 
1407 	up_write(&group->group_rwsem);
1408 	return 0;
1409 err_put:
1410 	vfio_group_put(group);
1411 err_unlock:
1412 	up_write(&group->group_rwsem);
1413 	return ret;
1414 }
1415 
1416 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1417 {
1418 	struct vfio_group *group = filep->private_data;
1419 
1420 	filep->private_data = NULL;
1421 
1422 	down_write(&group->group_rwsem);
1423 	/*
1424 	 * Device FDs hold a group file reference, therefore the group release
1425 	 * is only called when there are no open devices.
1426 	 */
1427 	WARN_ON(group->notifier.head);
1428 	if (group->container) {
1429 		WARN_ON(group->container_users != 1);
1430 		__vfio_group_unset_container(group);
1431 	}
1432 	group->opened_file = NULL;
1433 	up_write(&group->group_rwsem);
1434 
1435 	vfio_group_put(group);
1436 
1437 	return 0;
1438 }
1439 
1440 static const struct file_operations vfio_group_fops = {
1441 	.owner		= THIS_MODULE,
1442 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1443 	.compat_ioctl	= compat_ptr_ioctl,
1444 	.open		= vfio_group_fops_open,
1445 	.release	= vfio_group_fops_release,
1446 };
1447 
1448 /*
1449  * Wrapper around pm_runtime_resume_and_get().
1450  * Return error code on failure or 0 on success.
1451  */
1452 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
1453 {
1454 	struct device *dev = device->dev;
1455 
1456 	if (dev->driver && dev->driver->pm) {
1457 		int ret;
1458 
1459 		ret = pm_runtime_resume_and_get(dev);
1460 		if (ret) {
1461 			dev_info_ratelimited(dev,
1462 				"vfio: runtime resume failed %d\n", ret);
1463 			return -EIO;
1464 		}
1465 	}
1466 
1467 	return 0;
1468 }
1469 
1470 /*
1471  * Wrapper around pm_runtime_put().
1472  */
1473 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
1474 {
1475 	struct device *dev = device->dev;
1476 
1477 	if (dev->driver && dev->driver->pm)
1478 		pm_runtime_put(dev);
1479 }
1480 
1481 /*
1482  * VFIO Device fd
1483  */
1484 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1485 {
1486 	struct vfio_device *device = filep->private_data;
1487 	struct vfio_iommu_driver *iommu_driver;
1488 
1489 	mutex_lock(&device->dev_set->lock);
1490 	vfio_assert_device_open(device);
1491 	down_read(&device->group->group_rwsem);
1492 	if (device->open_count == 1 && device->ops->close_device)
1493 		device->ops->close_device(device);
1494 
1495 	iommu_driver = device->group->container->iommu_driver;
1496 	if (iommu_driver && iommu_driver->ops->unregister_device)
1497 		iommu_driver->ops->unregister_device(
1498 			device->group->container->iommu_data, device);
1499 	up_read(&device->group->group_rwsem);
1500 	device->open_count--;
1501 	if (device->open_count == 0)
1502 		device->kvm = NULL;
1503 	mutex_unlock(&device->dev_set->lock);
1504 
1505 	module_put(device->dev->driver->owner);
1506 
1507 	vfio_device_unassign_container(device);
1508 
1509 	vfio_device_put(device);
1510 
1511 	return 0;
1512 }
1513 
1514 /**
1515  * vfio_mig_get_next_state - Compute the next step in the FSM
1516  * @cur_fsm: The current state the device is in
1517  * @new_fsm: The target state to reach
1518  * @next_fsm: Pointer to the next step to get to new_fsm
1519  *
1520  * Return: 0 upon success, otherwise -errno.
1521  * Upon success the next step in the state progression between cur_fsm and
1522  * new_fsm will be set in next_fsm.
1523  *
1524  * This breaks down requests for combination transitions into smaller steps and
1525  * returns the next step to get to new_fsm. The function may need to be called
1526  * multiple times before reaching new_fsm.
1528  */
1529 int vfio_mig_get_next_state(struct vfio_device *device,
1530 			    enum vfio_device_mig_state cur_fsm,
1531 			    enum vfio_device_mig_state new_fsm,
1532 			    enum vfio_device_mig_state *next_fsm)
1533 {
1534 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1535 	/*
1536 	 * The coding in this table requires the driver to implement the
1537 	 * following FSM arcs:
1538 	 *         RESUMING -> STOP
1539 	 *         STOP -> RESUMING
1540 	 *         STOP -> STOP_COPY
1541 	 *         STOP_COPY -> STOP
1542 	 *
1543 	 * If P2P is supported then the driver must also implement these FSM
1544 	 * arcs:
1545 	 *         RUNNING -> RUNNING_P2P
1546 	 *         RUNNING_P2P -> RUNNING
1547 	 *         RUNNING_P2P -> STOP
1548 	 *         STOP -> RUNNING_P2P
1549 	 * Without P2P the driver must implement:
1550 	 *         RUNNING -> STOP
1551 	 *         STOP -> RUNNING
1552 	 *
1553 	 * The coding will step through multiple states for some combination
1554 	 * transitions; if all optional features are supported, this means the
1555 	 * following ones:
1556 	 *         RESUMING -> STOP -> RUNNING_P2P
1557 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1558 	 *         RESUMING -> STOP -> STOP_COPY
1559 	 *         RUNNING -> RUNNING_P2P -> STOP
1560 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1561 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1562 	 *         RUNNING_P2P -> STOP -> RESUMING
1563 	 *         RUNNING_P2P -> STOP -> STOP_COPY
1564 	 *         STOP -> RUNNING_P2P -> RUNNING
1565 	 *         STOP_COPY -> STOP -> RESUMING
1566 	 *         STOP_COPY -> STOP -> RUNNING_P2P
1567 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1568 	 */
1569 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1570 		[VFIO_DEVICE_STATE_STOP] = {
1571 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1572 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1573 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1574 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1575 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1576 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1577 		},
1578 		[VFIO_DEVICE_STATE_RUNNING] = {
1579 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1580 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1581 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1582 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1583 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1584 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1585 		},
1586 		[VFIO_DEVICE_STATE_STOP_COPY] = {
1587 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1588 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1589 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1590 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1591 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1592 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1593 		},
1594 		[VFIO_DEVICE_STATE_RESUMING] = {
1595 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1596 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1597 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1598 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1599 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1600 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1601 		},
1602 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
1603 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1604 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1605 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1606 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1607 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1608 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1609 		},
1610 		[VFIO_DEVICE_STATE_ERROR] = {
1611 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1612 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1613 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1614 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1615 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1616 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1617 		},
1618 	};
1619 
1620 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1621 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1622 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1623 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1624 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1625 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
1626 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1627 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
1628 	};
1629 
1630 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1631 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
1632 			state_flags_table[cur_fsm]))
1633 		return -EINVAL;
1634 
1635 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1636 	   (state_flags_table[new_fsm] & device->migration_flags) !=
1637 			state_flags_table[new_fsm])
1638 		return -EINVAL;
1639 
1640 	/*
1641 	 * Arcs touching optional and unsupported states are skipped over. The
1642 	 * driver will instead see an arc from the original state to the next
1643 	 * logical state, as per the above comment.
1644 	 */
1645 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1646 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1647 			state_flags_table[*next_fsm])
1648 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1649 
1650 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1651 }
1652 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
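
/*
 * Example (editor's sketch, names hypothetical): a driver's
 * migration_set_state() op walks the FSM one precomputed arc at a time,
 * which is the pattern the table above is built for; each step performs
 * exactly one of the arcs the driver is required to implement:
 *
 *	struct file *filp = NULL;
 *
 *	while (cur != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *
 *		filp = my_step_device_state(vdev, next);
 *		if (IS_ERR(filp))
 *			return filp;
 *		cur = next;
 *	}
 *	return filp;
 */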
1653 
1654 /*
1655  * Convert the driver's struct file into a FD number and return it to userspace.
1656  */
1657 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1658 				   struct vfio_device_feature_mig_state *mig)
1659 {
1660 	int ret;
1661 	int fd;
1662 
1663 	fd = get_unused_fd_flags(O_CLOEXEC);
1664 	if (fd < 0) {
1665 		ret = fd;
1666 		goto out_fput;
1667 	}
1668 
1669 	mig->data_fd = fd;
1670 	if (copy_to_user(arg, mig, sizeof(*mig))) {
1671 		ret = -EFAULT;
1672 		goto out_put_unused;
1673 	}
1674 	fd_install(fd, filp);
1675 	return 0;
1676 
1677 out_put_unused:
1678 	put_unused_fd(fd);
1679 out_fput:
1680 	fput(filp);
1681 	return ret;
1682 }
1683 
1684 static int
1685 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1686 					   u32 flags, void __user *arg,
1687 					   size_t argsz)
1688 {
1689 	size_t minsz =
1690 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
1691 	struct vfio_device_feature_mig_state mig;
1692 	struct file *filp = NULL;
1693 	int ret;
1694 
1695 	if (!device->mig_ops)
1696 		return -ENOTTY;
1697 
1698 	ret = vfio_check_feature(flags, argsz,
1699 				 VFIO_DEVICE_FEATURE_SET |
1700 				 VFIO_DEVICE_FEATURE_GET,
1701 				 sizeof(mig));
1702 	if (ret != 1)
1703 		return ret;
1704 
1705 	if (copy_from_user(&mig, arg, minsz))
1706 		return -EFAULT;
1707 
1708 	if (flags & VFIO_DEVICE_FEATURE_GET) {
1709 		enum vfio_device_mig_state curr_state;
1710 
1711 		ret = device->mig_ops->migration_get_state(device,
1712 							   &curr_state);
1713 		if (ret)
1714 			return ret;
1715 		mig.device_state = curr_state;
1716 		goto out_copy;
1717 	}
1718 
1719 	/* Handle the VFIO_DEVICE_FEATURE_SET */
1720 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
1721 	if (IS_ERR(filp) || !filp)
1722 		goto out_copy;
1723 
1724 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
1725 out_copy:
1726 	mig.data_fd = -1;
1727 	if (copy_to_user(arg, &mig, sizeof(mig)))
1728 		return -EFAULT;
1729 	if (IS_ERR(filp))
1730 		return PTR_ERR(filp);
1731 	return 0;
1732 }
1733 
1734 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1735 					       u32 flags, void __user *arg,
1736 					       size_t argsz)
1737 {
1738 	struct vfio_device_feature_migration mig = {
1739 		.flags = device->migration_flags,
1740 	};
1741 	int ret;
1742 
1743 	if (!device->mig_ops)
1744 		return -ENOTTY;
1745 
1746 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1747 				 sizeof(mig));
1748 	if (ret != 1)
1749 		return ret;
1750 	if (copy_to_user(arg, &mig, sizeof(mig)))
1751 		return -EFAULT;
1752 	return 0;
1753 }
1754 
1755 /* Ranges should fit into a single kernel page */
1756 #define LOG_MAX_RANGES \
1757 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1758 
1759 static int
1760 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1761 					u32 flags, void __user *arg,
1762 					size_t argsz)
1763 {
1764 	size_t minsz =
1765 		offsetofend(struct vfio_device_feature_dma_logging_control,
1766 			    ranges);
1767 	struct vfio_device_feature_dma_logging_range __user *ranges;
1768 	struct vfio_device_feature_dma_logging_control control;
1769 	struct vfio_device_feature_dma_logging_range range;
1770 	struct rb_root_cached root = RB_ROOT_CACHED;
1771 	struct interval_tree_node *nodes;
1772 	u64 iova_end;
1773 	u32 nnodes;
1774 	int i, ret;
1775 
1776 	if (!device->log_ops)
1777 		return -ENOTTY;
1778 
1779 	ret = vfio_check_feature(flags, argsz,
1780 				 VFIO_DEVICE_FEATURE_SET,
1781 				 sizeof(control));
1782 	if (ret != 1)
1783 		return ret;
1784 
1785 	if (copy_from_user(&control, arg, minsz))
1786 		return -EFAULT;
1787 
1788 	nnodes = control.num_ranges;
1789 	if (!nnodes)
1790 		return -EINVAL;
1791 
1792 	if (nnodes > LOG_MAX_RANGES)
1793 		return -E2BIG;
1794 
1795 	ranges = u64_to_user_ptr(control.ranges);
1796 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1797 			      GFP_KERNEL);
1798 	if (!nodes)
1799 		return -ENOMEM;
1800 
1801 	for (i = 0; i < nnodes; i++) {
1802 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1803 			ret = -EFAULT;
1804 			goto end;
1805 		}
1806 		if (!IS_ALIGNED(range.iova, control.page_size) ||
1807 		    !IS_ALIGNED(range.length, control.page_size)) {
1808 			ret = -EINVAL;
1809 			goto end;
1810 		}
1811 
1812 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1813 		    iova_end > ULONG_MAX) {
1814 			ret = -EOVERFLOW;
1815 			goto end;
1816 		}
1817 
1818 		nodes[i].start = range.iova;
1819 		nodes[i].last = range.iova + range.length - 1;
1820 		if (interval_tree_iter_first(&root, nodes[i].start,
1821 					     nodes[i].last)) {
1822 			/* Range overlapping */
1823 			ret = -EINVAL;
1824 			goto end;
1825 		}
1826 		interval_tree_insert(nodes + i, &root);
1827 	}
1828 
1829 	ret = device->log_ops->log_start(device, &root, nnodes,
1830 					 &control.page_size);
1831 	if (ret)
1832 		goto end;
1833 
1834 	if (copy_to_user(arg, &control, sizeof(control))) {
1835 		ret = -EFAULT;
1836 		device->log_ops->log_stop(device);
1837 	}
1838 
1839 end:
1840 	kfree(nodes);
1841 	return ret;
1842 }
1843 
1844 static int
1845 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1846 				       u32 flags, void __user *arg,
1847 				       size_t argsz)
1848 {
1849 	int ret;
1850 
1851 	if (!device->log_ops)
1852 		return -ENOTTY;
1853 
1854 	ret = vfio_check_feature(flags, argsz,
1855 				 VFIO_DEVICE_FEATURE_SET, 0);
1856 	if (ret != 1)
1857 		return ret;
1858 
1859 	return device->log_ops->log_stop(device);
1860 }
1861 
1862 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1863 					  unsigned long iova, size_t length,
1864 					  void *opaque)
1865 {
1866 	struct vfio_device *device = opaque;
1867 
1868 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1869 }
1870 
1871 static int
1872 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1873 					 u32 flags, void __user *arg,
1874 					 size_t argsz)
1875 {
1876 	size_t minsz =
1877 		offsetofend(struct vfio_device_feature_dma_logging_report,
1878 			    bitmap);
1879 	struct vfio_device_feature_dma_logging_report report;
1880 	struct iova_bitmap *iter;
1881 	u64 iova_end;
1882 	int ret;
1883 
1884 	if (!device->log_ops)
1885 		return -ENOTTY;
1886 
1887 	ret = vfio_check_feature(flags, argsz,
1888 				 VFIO_DEVICE_FEATURE_GET,
1889 				 sizeof(report));
1890 	if (ret != 1)
1891 		return ret;
1892 
1893 	if (copy_from_user(&report, arg, minsz))
1894 		return -EFAULT;
1895 
1896 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1897 		return -EINVAL;
1898 
1899 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1900 	    iova_end > ULONG_MAX)
1901 		return -EOVERFLOW;
1902 
1903 	iter = iova_bitmap_alloc(report.iova, report.length,
1904 				 report.page_size,
1905 				 u64_to_user_ptr(report.bitmap));
1906 	if (IS_ERR(iter))
1907 		return PTR_ERR(iter);
1908 
1909 	ret = iova_bitmap_for_each(iter, device,
1910 				   vfio_device_log_read_and_clear);
1911 
1912 	iova_bitmap_free(iter);
1913 	return ret;
1914 }
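
/*
 * Example (illustrative userspace sketch, not part of this file): reading
 * the dirty bitmap for the range tracked above.  The bitmap carries one
 * bit per page_size unit of length, so 0x200000 / 4096 = 512 bits (eight
 * __u64 words) here; "device_fd" is made up.
 *
 *	__u64 bitmap[8] = {};
 *	struct vfio_device_feature_dma_logging_report report = {
 *		.iova = 0x100000,
 *		.length = 0x200000,
 *		.page_size = 4096,
 *		.bitmap = (__u64)(uintptr_t)bitmap,
 *	};
 *	size_t argsz = sizeof(struct vfio_device_feature) + sizeof(report);
 *	struct vfio_device_feature *feature = calloc(1, argsz);
 *
 *	feature->argsz = argsz;
 *	feature->flags = VFIO_DEVICE_FEATURE_GET |
 *			 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
 *	memcpy(feature->data, &report, sizeof(report));
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
 *		perror("DMA_LOGGING_REPORT");
 */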
1915 
1916 static int vfio_ioctl_device_feature(struct vfio_device *device,
1917 				     struct vfio_device_feature __user *arg)
1918 {
1919 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1920 	struct vfio_device_feature feature;
1921 
1922 	if (copy_from_user(&feature, arg, minsz))
1923 		return -EFAULT;
1924 
1925 	if (feature.argsz < minsz)
1926 		return -EINVAL;
1927 
1928 	/* Check unknown flags */
1929 	if (feature.flags &
1930 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1931 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1932 		return -EINVAL;
1933 
1934 	/* GET & SET are mutually exclusive except with PROBE */
1935 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1936 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1937 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1938 		return -EINVAL;
1939 
1940 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1941 	case VFIO_DEVICE_FEATURE_MIGRATION:
1942 		return vfio_ioctl_device_feature_migration(
1943 			device, feature.flags, arg->data,
1944 			feature.argsz - minsz);
1945 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1946 		return vfio_ioctl_device_feature_mig_device_state(
1947 			device, feature.flags, arg->data,
1948 			feature.argsz - minsz);
1949 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1950 		return vfio_ioctl_device_feature_logging_start(
1951 			device, feature.flags, arg->data,
1952 			feature.argsz - minsz);
1953 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1954 		return vfio_ioctl_device_feature_logging_stop(
1955 			device, feature.flags, arg->data,
1956 			feature.argsz - minsz);
1957 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1958 		return vfio_ioctl_device_feature_logging_report(
1959 			device, feature.flags, arg->data,
1960 			feature.argsz - minsz);
1961 	default:
1962 		if (unlikely(!device->ops->device_feature))
1963 			return -EINVAL;
1964 		return device->ops->device_feature(device, feature.flags,
1965 						   arg->data,
1966 						   feature.argsz - minsz);
1967 	}
1968 }
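
/*
 * Example (illustrative userspace sketch, not part of this file): per the
 * flag rules above, PROBE may be combined with SET and/or GET to test
 * whether a feature and access direction are supported without executing
 * it; ioctl() returning 0 indicates support.  "device_fd" is made up.
 *
 *	struct vfio_device_feature probe = {
 *		.argsz = sizeof(probe),
 *		.flags = VFIO_DEVICE_FEATURE_PROBE |
 *			 VFIO_DEVICE_FEATURE_SET |
 *			 VFIO_DEVICE_FEATURE_DMA_LOGGING_START,
 *	};
 *
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, &probe))
 *		printf("DMA logging is supported\n");
 */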
1969 
1970 static long vfio_device_fops_unl_ioctl(struct file *filep,
1971 				       unsigned int cmd, unsigned long arg)
1972 {
1973 	struct vfio_device *device = filep->private_data;
1974 	int ret;
1975 
1976 	ret = vfio_device_pm_runtime_get(device);
1977 	if (ret)
1978 		return ret;
1979 
1980 	switch (cmd) {
1981 	case VFIO_DEVICE_FEATURE:
1982 		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1983 		break;
1984 
1985 	default:
1986 		if (unlikely(!device->ops->ioctl))
1987 			ret = -EINVAL;
1988 		else
1989 			ret = device->ops->ioctl(device, cmd, arg);
1990 		break;
1991 	}
1992 
1993 	vfio_device_pm_runtime_put(device);
1994 	return ret;
1995 }
1996 
1997 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1998 				     size_t count, loff_t *ppos)
1999 {
2000 	struct vfio_device *device = filep->private_data;
2001 
2002 	if (unlikely(!device->ops->read))
2003 		return -EINVAL;
2004 
2005 	return device->ops->read(device, buf, count, ppos);
2006 }
2007 
2008 static ssize_t vfio_device_fops_write(struct file *filep,
2009 				      const char __user *buf,
2010 				      size_t count, loff_t *ppos)
2011 {
2012 	struct vfio_device *device = filep->private_data;
2013 
2014 	if (unlikely(!device->ops->write))
2015 		return -EINVAL;
2016 
2017 	return device->ops->write(device, buf, count, ppos);
2018 }
2019 
2020 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
2021 {
2022 	struct vfio_device *device = filep->private_data;
2023 
2024 	if (unlikely(!device->ops->mmap))
2025 		return -EINVAL;
2026 
2027 	return device->ops->mmap(device, vma);
2028 }
2029 
2030 static const struct file_operations vfio_device_fops = {
2031 	.owner		= THIS_MODULE,
2032 	.release	= vfio_device_fops_release,
2033 	.read		= vfio_device_fops_read,
2034 	.write		= vfio_device_fops_write,
2035 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
2036 	.compat_ioctl	= compat_ptr_ioctl,
2037 	.mmap		= vfio_device_fops_mmap,
2038 };
2039 
2040 /**
2041  * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
2042  * @file: VFIO group file
2043  *
2044  * The returned iommu_group is valid as long as a ref is held on the file.
2045  */
2046 struct iommu_group *vfio_file_iommu_group(struct file *file)
2047 {
2048 	struct vfio_group *group = file->private_data;
2049 
2050 	if (file->f_op != &vfio_group_fops)
2051 		return NULL;
2052 	return group->iommu_group;
2053 }
2054 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
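
/*
 * Example (illustrative kernel-side sketch, not part of this file): an
 * external consumer holding a reference on a file can use the NULL return
 * to reject anything that is not a VFIO group file; "filp" is made up.
 *
 *	struct iommu_group *grp = vfio_file_iommu_group(filp);
 *
 *	if (!grp)
 *		return -EINVAL;	// not a VFIO group file
 */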
2055 
2056 /**
2057  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
2058  *        is always CPU cache coherent
2059  * @file: VFIO group file
2060  *
2061  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
2062  * bit in DMA transactions. A return of false indicates that the user has
2063  * rights to access additional instructions such as wbinvd on x86.
2064  */
2065 bool vfio_file_enforced_coherent(struct file *file)
2066 {
2067 	struct vfio_group *group = file->private_data;
2068 	bool ret;
2069 
2070 	if (file->f_op != &vfio_group_fops)
2071 		return true;
2072 
2073 	down_read(&group->group_rwsem);
2074 	if (group->container) {
2075 		ret = vfio_ioctl_check_extension(group->container,
2076 						 VFIO_DMA_CC_IOMMU);
2077 	} else {
2078 		/*
2079 		 * Since the coherency state is determined only once a container
2080 		 * is attached, the user must attach a container before they can
2081 		 * prove they have permission.
2082 		 */
2083 		ret = true;
2084 	}
2085 	up_read(&group->group_rwsem);
2086 	return ret;
2087 }
2088 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
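
/*
 * Example (illustrative kernel-side sketch, not part of this file): a
 * KVM-like consumer might use the result to decide whether the user may
 * legitimately need non-coherent DMA instructions such as wbinvd; "filp"
 * and the helper below are made up.
 *
 *	if (!vfio_file_enforced_coherent(filp))
 *		allow_noncoherent_dma_ops();	// hypothetical helper
 */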
2089 
2090 /**
2091  * vfio_file_set_kvm - Link a kvm with VFIO drivers
2092  * @file: VFIO group file
2093  * @kvm: KVM to link
2094  *
2095  * When a VFIO device is first opened, the KVM will be available in
2096  * device->kvm if one was associated with the group.
2097  */
2098 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
2099 {
2100 	struct vfio_group *group = file->private_data;
2101 
2102 	if (file->f_op != &vfio_group_fops)
2103 		return;
2104 
2105 	down_write(&group->group_rwsem);
2106 	group->kvm = kvm;
2107 	up_write(&group->group_rwsem);
2108 }
2109 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
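
/*
 * Example (illustrative kernel-side sketch, not part of this file): the
 * association is made before the group's devices are opened so that
 * device->kvm is populated at open time, and cleared when the consumer
 * is done; "filp" and "kvm" are made up.
 *
 *	vfio_file_set_kvm(filp, kvm);	// link
 *	...
 *	vfio_file_set_kvm(filp, NULL);	// unlink
 */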
2110 
2111 /**
2112  * vfio_file_has_dev - True if the VFIO file is a handle for the device
2113  * @file: VFIO file to check
2114  * @device: Device that must be part of the file
2115  *
2116  * Returns true if the given file has permission to manipulate the given device.
2117  */
2118 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
2119 {
2120 	struct vfio_group *group = file->private_data;
2121 
2122 	if (file->f_op != &vfio_group_fops)
2123 		return false;
2124 
2125 	return group == device->group;
2126 }
2127 EXPORT_SYMBOL_GPL(vfio_file_has_dev);
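
/*
 * Example (illustrative kernel-side sketch, not part of this file):
 * validating that a group file handed in from userspace actually covers
 * the device being operated on; "filp" and "vdev" are made up.
 *
 *	if (!vfio_file_has_dev(filp, vdev))
 *		return -EPERM;
 */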
2128 
2129 /*
2130  * Sub-module support
2131  */
2132 /*
2133  * Helper for managing a buffer of info chain capabilities: allocates or
2134  * reallocates a buffer with additional @size, filling in @id and @version
2135  * of the capability.  A pointer to the new capability is returned.
2136  *
2137  * NB. The chain is based at the head of the buffer, so new entries are
2138  * added to the tail; vfio_info_cap_shift() should be called to fix up the
2139  * next offsets prior to copying to the user buffer.
2140  */
2141 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
2142 					       size_t size, u16 id, u16 version)
2143 {
2144 	void *buf;
2145 	struct vfio_info_cap_header *header, *tmp;
2146 
2147 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
2148 	if (!buf) {
2149 		kfree(caps->buf);
2150 		caps->buf = NULL;
2151 		caps->size = 0;
2152 		return ERR_PTR(-ENOMEM);
2153 	}
2154 
2155 	caps->buf = buf;
2156 	header = buf + caps->size;
2157 
2158 	/* Zero the capability; it is eventually copied to the user buffer */
2159 	memset(header, 0, size);
2160 
2161 	header->id = id;
2162 	header->version = version;
2163 
2164 	/* Add to the end of the capability chain */
2165 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
2166 		; /* nothing */
2167 
2168 	tmp->next = caps->size;
2169 	caps->size += size;
2170 
2171 	return header;
2172 }
2173 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
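
/*
 * Example (illustrative driver-side sketch, not part of this file):
 * building a capability chain for an INFO-style ioctl and fixing up the
 * chain offsets before copying it out behind the fixed-size header, as
 * the NB above describes.  "info" (with a cap_offset field), "arg",
 * "struct my_cap" and "MY_CAP_ID" are made up.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *hdr;
 *
 *	hdr = vfio_info_cap_add(&caps, sizeof(struct my_cap), MY_CAP_ID, 1);
 *	if (IS_ERR(hdr))
 *		return PTR_ERR(hdr);
 *	// ... fill in the capability body following *hdr ...
 *
 *	info.cap_offset = sizeof(info);
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	if (copy_to_user(arg + sizeof(info), caps.buf, caps.size))
 *		ret = -EFAULT;
 *	kfree(caps.buf);
 */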
2174 
2175 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
2176 {
2177 	struct vfio_info_cap_header *tmp;
2178 	void *buf = (void *)caps->buf;
2179 
2180 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
2181 		tmp->next += offset;
2182 }
2183 EXPORT_SYMBOL(vfio_info_cap_shift);
2184 
2185 int vfio_info_add_capability(struct vfio_info_cap *caps,
2186 			     struct vfio_info_cap_header *cap, size_t size)
2187 {
2188 	struct vfio_info_cap_header *header;
2189 
2190 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
2191 	if (IS_ERR(header))
2192 		return PTR_ERR(header);
2193 
2194 	memcpy(header + 1, cap + 1, size - sizeof(*header));
2195 
2196 	return 0;
2197 }
2198 EXPORT_SYMBOL(vfio_info_add_capability);
2199 
2200 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
2201 				       int max_irq_type, size_t *data_size)
2202 {
2203 	unsigned long minsz;
2204 	size_t size;
2205 
2206 	minsz = offsetofend(struct vfio_irq_set, count);
2207 
2208 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
2209 	    (hdr->count >= (U32_MAX - hdr->start)) ||
2210 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
2211 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
2212 		return -EINVAL;
2213 
2214 	if (data_size)
2215 		*data_size = 0;
2216 
2217 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
2218 		return -EINVAL;
2219 
2220 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
2221 	case VFIO_IRQ_SET_DATA_NONE:
2222 		size = 0;
2223 		break;
2224 	case VFIO_IRQ_SET_DATA_BOOL:
2225 		size = sizeof(uint8_t);
2226 		break;
2227 	case VFIO_IRQ_SET_DATA_EVENTFD:
2228 		size = sizeof(int32_t);
2229 		break;
2230 	default:
2231 		return -EINVAL;
2232 	}
2233 
2234 	if (size) {
2235 		if (hdr->argsz - minsz < hdr->count * size)
2236 			return -EINVAL;
2237 
2238 		if (!data_size)
2239 			return -EINVAL;
2240 
2241 		*data_size = hdr->count * size;
2242 	}
2243 
2244 	return 0;
2245 }
2246 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
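
/*
 * Example (illustrative driver-side sketch, not part of this file): a
 * typical VFIO_DEVICE_SET_IRQS handler validates the header and sizes
 * the trailing data with this helper before copying it in; "arg" (a
 * void __user *), "num_irqs" and "max_irq_type" are made-up locals.
 *
 *	struct vfio_irq_set hdr;
 *	size_t minsz = offsetofend(struct vfio_irq_set, count);
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	if (copy_from_user(&hdr, arg, minsz))
 *		return -EFAULT;
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 max_irq_type, &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user(arg + minsz, data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */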
2247 
2248 /*
2249  * Pin contiguous user pages and return their associated host pages, for the
2250  * local domain only.
2251  * @device [in]  : device
2252  * @iova [in]    : starting IOVA of user pages to be pinned.
2253  * @npage [in]   : count of pages to be pinned.  This count should not
2254  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2255  * @prot [in]    : protection flags (IOMMU_READ/IOMMU_WRITE)
2256  * @pages [out]  : array of host pages
2257  * Return the number of pages pinned on success or a negative error code.
2258  *
2259  * A driver may only call this function if the vfio_device was created
2260  * by vfio_register_emulated_iommu_dev().
2261  */
2262 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
2263 		   int npage, int prot, struct page **pages)
2264 {
2265 	struct vfio_container *container;
2266 	struct vfio_group *group = device->group;
2267 	struct vfio_iommu_driver *driver;
2268 	int ret;
2269 
2270 	if (!pages || !npage || !vfio_assert_device_open(device))
2271 		return -EINVAL;
2272 
2273 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2274 		return -E2BIG;
2275 
2276 	/* group->container cannot change while a vfio device is open */
2277 	container = group->container;
2278 	driver = container->iommu_driver;
2279 	if (likely(driver && driver->ops->pin_pages))
2280 		ret = driver->ops->pin_pages(container->iommu_data,
2281 					     group->iommu_group, iova,
2282 					     npage, prot, pages);
2283 	else
2284 		ret = -ENOTTY;
2285 
2286 	return ret;
2287 }
2288 EXPORT_SYMBOL(vfio_pin_pages);
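
/*
 * Example (illustrative driver-side sketch, not part of this file): an
 * emulated-IOMMU driver pinning a single guest page before touching it,
 * then releasing it with vfio_unpin_pages() below; "vdev" and "iova" are
 * made up.
 *
 *	struct page *page;
 *	int pinned;
 *
 *	pinned = vfio_pin_pages(vdev, iova, 1,
 *				IOMMU_READ | IOMMU_WRITE, &page);
 *	if (pinned != 1)
 *		return pinned < 0 ? pinned : -EINVAL;
 *	// ... access the page, e.g. via kmap_local_page() ...
 *	vfio_unpin_pages(vdev, iova, 1);
 */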
2289 
2290 /*
2291  * Unpin contiguous host pages for local domain only.
2292  * @device [in]  : device
2293  * @iova [in]    : starting address of user pages to be unpinned.
2294  * @npage [in]   : count of pages to be unpinned.  This count should not
2295  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2296  */
2297 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
2298 {
2299 	struct vfio_container *container;
2300 	struct vfio_iommu_driver *driver;
2301 
2302 	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
2303 		return;
2304 
2305 	if (WARN_ON(!vfio_assert_device_open(device)))
2306 		return;
2307 
2308 	/* group->container cannot change while a vfio device is open */
2309 	container = device->group->container;
2310 	driver = container->iommu_driver;
2311 
2312 	driver->ops->unpin_pages(container->iommu_data, iova, npage);
2313 }
2314 EXPORT_SYMBOL(vfio_unpin_pages);
2315 
2316 /*
2317  * This interface allows the CPUs to perform a form of virtual DMA on
2318  * behalf of the device.
2319  *
2320  * The CPUs read from or write to a range of IOVAs backed by user space
2321  * memory, using a kernel buffer as the destination or source.
2322  *
2323  * As the read/write of user space memory is conducted via the CPUs and is
2324  * not a real device DMA, it is not necessary to pin the user space memory.
2325  *
2326  * @device [in]		: VFIO device
2327  * @iova [in]		: base IOVA of a user space buffer
2328  * @data [in]		: pointer to kernel buffer
2329  * @len [in]		: kernel buffer length
2330  * @write [in]		: true to write to user memory, false to read from it
2331  * Return error code on failure or 0 on success.
2332  */
2333 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
2334 		size_t len, bool write)
2335 {
2336 	struct vfio_container *container;
2337 	struct vfio_iommu_driver *driver;
2338 	int ret = 0;
2339 
2340 	if (!data || len <= 0 || !vfio_assert_device_open(device))
2341 		return -EINVAL;
2342 
2343 	/* group->container cannot change while a vfio device is open */
2344 	container = device->group->container;
2345 	driver = container->iommu_driver;
2346 
2347 	if (likely(driver && driver->ops->dma_rw))
2348 		ret = driver->ops->dma_rw(container->iommu_data,
2349 					  iova, data, len, write);
2350 	else
2351 		ret = -ENOTTY;
2352 	return ret;
2353 }
2354 EXPORT_SYMBOL(vfio_dma_rw);
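
/*
 * Example (illustrative driver-side sketch, not part of this file):
 * reading a guest descriptor through the device's IOVA space without
 * pinning anything; "vdev", "desc_iova" and "struct my_desc" are made up.
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */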
2355 
2356 /*
2357  * Module/class support
2358  */
2359 static char *vfio_devnode(struct device *dev, umode_t *mode)
2360 {
2361 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2362 }
2363 
2364 static struct miscdevice vfio_dev = {
2365 	.minor = VFIO_MINOR,
2366 	.name = "vfio",
2367 	.fops = &vfio_fops,
2368 	.nodename = "vfio/vfio",
2369 	.mode = S_IRUGO | S_IWUGO,
2370 };
2371 
2372 static int __init vfio_init(void)
2373 {
2374 	int ret;
2375 
2376 	ida_init(&vfio.group_ida);
2377 	mutex_init(&vfio.group_lock);
2378 	mutex_init(&vfio.iommu_drivers_lock);
2379 	INIT_LIST_HEAD(&vfio.group_list);
2380 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2381 
2382 	ret = misc_register(&vfio_dev);
2383 	if (ret) {
2384 		pr_err("vfio: misc device register failed\n");
2385 		return ret;
2386 	}
2387 
2388 	/* /dev/vfio/$GROUP */
2389 	vfio.class = class_create(THIS_MODULE, "vfio");
2390 	if (IS_ERR(vfio.class)) {
2391 		ret = PTR_ERR(vfio.class);
2392 		goto err_class;
2393 	}
2394 
2395 	vfio.class->devnode = vfio_devnode;
2396 
2397 	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2398 	if (ret)
2399 		goto err_alloc_chrdev;
2400 
2401 #ifdef CONFIG_VFIO_NOIOMMU
2402 	ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
2403 #endif
2404 	if (ret)
2405 		goto err_driver_register;
2406 
2407 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2408 	return 0;
2409 
2410 err_driver_register:
2411 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2412 err_alloc_chrdev:
2413 	class_destroy(vfio.class);
2414 	vfio.class = NULL;
2415 err_class:
2416 	misc_deregister(&vfio_dev);
2417 	return ret;
2418 }
2419 
2420 static void __exit vfio_cleanup(void)
2421 {
2422 	WARN_ON(!list_empty(&vfio.group_list));
2423 
2424 #ifdef CONFIG_VFIO_NOIOMMU
2425 	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2426 #endif
2427 	ida_destroy(&vfio.group_ida);
2428 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2429 	class_destroy(vfio.class);
2430 	vfio.class = NULL;
2431 	misc_deregister(&vfio_dev);
2432 	xa_destroy(&vfio_device_set_xa);
2433 }
2434 
2435 module_init(vfio_init);
2436 module_exit(vfio_cleanup);
2437 
2438 MODULE_VERSION(DRIVER_VERSION);
2439 MODULE_LICENSE("GPL v2");
2440 MODULE_AUTHOR(DRIVER_AUTHOR);
2441 MODULE_DESCRIPTION(DRIVER_DESC);
2442 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2443 MODULE_ALIAS("devname:vfio/vfio");
2444 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2445