xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision 2dfb62d6ce80b3536d1a915177ae82496bd7ac4a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 #include "vfio.h"
36 
37 #define DRIVER_VERSION	"0.3"
38 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
39 #define DRIVER_DESC	"VFIO - User Level meta-driver"
40 
/*
 * Global VFIO core state.  A single instance, "vfio", is defined here and is
 * shared by everything in this file.
 */
static struct vfio {
	/* Device class assigned to every group's struct device */
	struct class			*class;
	/* Registered IOMMU backends, linked via vfio_iommu_driver.vfio_next */
	struct list_head		iommu_drivers_list;
	/* Held while walking or modifying iommu_drivers_list */
	struct mutex			iommu_drivers_lock;
	/* All live groups, linked via vfio_group.vfio_next */
	struct list_head		group_list;
	struct mutex			group_lock; /* locks group_list */
	/* Allocator for the minor numbers of group character devices */
	struct ida			group_ida;
	/* Its major number is combined with an ida minor for each group */
	dev_t				group_devt;
} vfio;
50 
/* List node wrapping one registered set of IOMMU backend callbacks. */
struct vfio_iommu_driver {
	/* Callback table passed to vfio_register_iommu_driver() */
	const struct vfio_iommu_driver_ops	*ops;
	/* Link in vfio.iommu_drivers_list */
	struct list_head			vfio_next;
};
55 
/*
 * State behind one open container file.  Allocated in vfio_fops_open() and
 * freed by vfio_container_release() once the last kref is dropped.
 */
struct vfio_container {
	/* Initialized at open; every attached group takes one more reference */
	struct kref			kref;
	/* Groups attached to this container, via vfio_group.container_next */
	struct list_head		group_list;
	/* Taken for write when group_list, iommu_driver or iommu_data change */
	struct rw_semaphore		group_lock;
	/* Backend chosen by VFIO_SET_IOMMU; NULL while none is set */
	struct vfio_iommu_driver	*iommu_driver;
	/* Opaque value returned by the backend's ->open() */
	void				*iommu_data;
	/* True when the last attached group was of type VFIO_NO_IOMMU */
	bool				noiommu;
};
64 
/*
 * One VFIO group, wrapping an iommu_group and exposed as a character device.
 * The memory is released from vfio_group_release(), the ->release callback
 * of the embedded struct device.
 */
struct vfio_group {
	/* Embedded device; its release callback frees the group */
	struct device 			dev;
	/* Character device bound to vfio_group_fops */
	struct cdev			cdev;
	/* Reference count; starts at 1, see vfio_group_get()/vfio_group_put() */
	refcount_t			users;
	/*
	 * 0 without a container, 1 once a container is attached, plus one for
	 * every device that went through vfio_device_assign_container().
	 */
	unsigned int			container_users;
	/* Referenced in vfio_group_alloc(), put in vfio_group_release() */
	struct iommu_group		*iommu_group;
	/* Attached container, or NULL */
	struct vfio_container		*container;
	/* Registered devices, linked via vfio_device.group_next */
	struct list_head		device_list;
	/* Protects device_list and dev_counter */
	struct mutex			device_lock;
	/* Link in vfio.group_list */
	struct list_head		vfio_next;
	/* Link in the attached container's group_list */
	struct list_head		container_next;
	enum vfio_group_type		type;
	/* Number of devices currently on device_list */
	unsigned int			dev_counter;
	/* Held for write around container set/unset and container_users updates */
	struct rw_semaphore		group_rwsem;
	/* Copied into vfio_device.kvm on the first open of a device */
	struct kvm			*kvm;
	/*
	 * File that gets an extra get_file() reference per assigned device.
	 * NOTE(review): the assignment itself is not in this part of the file.
	 */
	struct file			*opened_file;
	/* Expected to be empty by the time the last group reference is put */
	struct blocking_notifier_head	notifier;
};
83 
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Backing variable of the "enable_unsafe_noiommu_mode" module parameter.
 * When set, devices without an iommu_group get a vfio-noiommu group created
 * for them and the vfio-noiommu backend reports its extension as supported.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
90 
/* Device sets, indexed by their set_id pointer cast to unsigned long */
static DEFINE_XARRAY(vfio_device_set_xa);
/* Forward declaration: vfio_group_alloc() hands this to cdev_init() */
static const struct file_operations vfio_group_fops;
93 
94 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
95 {
96 	unsigned long idx = (unsigned long)set_id;
97 	struct vfio_device_set *new_dev_set;
98 	struct vfio_device_set *dev_set;
99 
100 	if (WARN_ON(!set_id))
101 		return -EINVAL;
102 
103 	/*
104 	 * Atomically acquire a singleton object in the xarray for this set_id
105 	 */
106 	xa_lock(&vfio_device_set_xa);
107 	dev_set = xa_load(&vfio_device_set_xa, idx);
108 	if (dev_set)
109 		goto found_get_ref;
110 	xa_unlock(&vfio_device_set_xa);
111 
112 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
113 	if (!new_dev_set)
114 		return -ENOMEM;
115 	mutex_init(&new_dev_set->lock);
116 	INIT_LIST_HEAD(&new_dev_set->device_list);
117 	new_dev_set->set_id = set_id;
118 
119 	xa_lock(&vfio_device_set_xa);
120 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
121 			       GFP_KERNEL);
122 	if (!dev_set) {
123 		dev_set = new_dev_set;
124 		goto found_get_ref;
125 	}
126 
127 	kfree(new_dev_set);
128 	if (xa_is_err(dev_set)) {
129 		xa_unlock(&vfio_device_set_xa);
130 		return xa_err(dev_set);
131 	}
132 
133 found_get_ref:
134 	dev_set->device_count++;
135 	xa_unlock(&vfio_device_set_xa);
136 	mutex_lock(&dev_set->lock);
137 	device->dev_set = dev_set;
138 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
139 	mutex_unlock(&dev_set->lock);
140 	return 0;
141 }
142 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
143 
144 static void vfio_release_device_set(struct vfio_device *device)
145 {
146 	struct vfio_device_set *dev_set = device->dev_set;
147 
148 	if (!dev_set)
149 		return;
150 
151 	mutex_lock(&dev_set->lock);
152 	list_del(&device->dev_set_list);
153 	mutex_unlock(&dev_set->lock);
154 
155 	xa_lock(&vfio_device_set_xa);
156 	if (!--dev_set->device_count) {
157 		__xa_erase(&vfio_device_set_xa,
158 			   (unsigned long)dev_set->set_id);
159 		mutex_destroy(&dev_set->lock);
160 		kfree(dev_set);
161 	}
162 	xa_unlock(&vfio_device_set_xa);
163 }
164 
165 #ifdef CONFIG_VFIO_NOIOMMU
166 static void *vfio_noiommu_open(unsigned long arg)
167 {
168 	if (arg != VFIO_NOIOMMU_IOMMU)
169 		return ERR_PTR(-EINVAL);
170 	if (!capable(CAP_SYS_RAWIO))
171 		return ERR_PTR(-EPERM);
172 
173 	return NULL;
174 }
175 
/* ->release() of the vfio-noiommu backend: ->open() allocates nothing. */
static void vfio_noiommu_release(void *iommu_data)
{
}
179 
180 static long vfio_noiommu_ioctl(void *iommu_data,
181 			       unsigned int cmd, unsigned long arg)
182 {
183 	if (cmd == VFIO_CHECK_EXTENSION)
184 		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
185 
186 	return -ENOTTY;
187 }
188 
/* ->attach_group() of the vfio-noiommu backend: always succeeds. */
static int vfio_noiommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	return 0;
}
194 
/* ->detach_group() of the vfio-noiommu backend: nothing to undo. */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}
199 
/*
 * Callback table of the built-in "vfio-noiommu" backend.  Its address is also
 * what vfio_iommu_driver_allowed() compares against to recognise the backend.
 */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
209 
210 /*
211  * Only noiommu containers can use vfio-noiommu and noiommu containers can only
212  * use vfio-noiommu.
213  */
214 static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
215 		const struct vfio_iommu_driver *driver)
216 {
217 	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
218 }
219 #else
/* Without CONFIG_VFIO_NOIOMMU every backend is acceptable. */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return true;
}
225 #endif /* CONFIG_VFIO_NOIOMMU */
226 
227 /*
228  * IOMMU driver registration
229  */
230 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
231 {
232 	struct vfio_iommu_driver *driver, *tmp;
233 
234 	if (WARN_ON(!ops->register_device != !ops->unregister_device))
235 		return -EINVAL;
236 
237 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
238 	if (!driver)
239 		return -ENOMEM;
240 
241 	driver->ops = ops;
242 
243 	mutex_lock(&vfio.iommu_drivers_lock);
244 
245 	/* Check for duplicates */
246 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
247 		if (tmp->ops == ops) {
248 			mutex_unlock(&vfio.iommu_drivers_lock);
249 			kfree(driver);
250 			return -EINVAL;
251 		}
252 	}
253 
254 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
255 
256 	mutex_unlock(&vfio.iommu_drivers_lock);
257 
258 	return 0;
259 }
260 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
261 
262 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
263 {
264 	struct vfio_iommu_driver *driver;
265 
266 	mutex_lock(&vfio.iommu_drivers_lock);
267 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
268 		if (driver->ops == ops) {
269 			list_del(&driver->vfio_next);
270 			mutex_unlock(&vfio.iommu_drivers_lock);
271 			kfree(driver);
272 			return;
273 		}
274 	}
275 	mutex_unlock(&vfio.iommu_drivers_lock);
276 }
277 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
278 
279 static void vfio_group_get(struct vfio_group *group);
280 
281 /*
282  * Container objects - containers are created when /dev/vfio/vfio is
283  * opened, but their lifecycle extends until the last user is done, so
284  * it's freed via kref.  Must support container/group/device being
285  * closed in any order.
286  */
/* Take an additional reference on @container. */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}
291 
292 static void vfio_container_release(struct kref *kref)
293 {
294 	struct vfio_container *container;
295 	container = container_of(kref, struct vfio_container, kref);
296 
297 	kfree(container);
298 }
299 
/*
 * Drop a reference on @container; the final put frees it through
 * vfio_container_release().
 */
static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
304 
305 /*
306  * Group objects - create, release, get, put, search
307  */
308 static struct vfio_group *
309 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
310 {
311 	struct vfio_group *group;
312 
313 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
314 		if (group->iommu_group == iommu_group) {
315 			vfio_group_get(group);
316 			return group;
317 		}
318 	}
319 	return NULL;
320 }
321 
322 static struct vfio_group *
323 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
324 {
325 	struct vfio_group *group;
326 
327 	mutex_lock(&vfio.group_lock);
328 	group = __vfio_group_get_from_iommu(iommu_group);
329 	mutex_unlock(&vfio.group_lock);
330 	return group;
331 }
332 
333 static void vfio_group_release(struct device *dev)
334 {
335 	struct vfio_group *group = container_of(dev, struct vfio_group, dev);
336 
337 	mutex_destroy(&group->device_lock);
338 	iommu_group_put(group->iommu_group);
339 	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
340 	kfree(group);
341 }
342 
343 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
344 					   enum vfio_group_type type)
345 {
346 	struct vfio_group *group;
347 	int minor;
348 
349 	group = kzalloc(sizeof(*group), GFP_KERNEL);
350 	if (!group)
351 		return ERR_PTR(-ENOMEM);
352 
353 	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
354 	if (minor < 0) {
355 		kfree(group);
356 		return ERR_PTR(minor);
357 	}
358 
359 	device_initialize(&group->dev);
360 	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
361 	group->dev.class = vfio.class;
362 	group->dev.release = vfio_group_release;
363 	cdev_init(&group->cdev, &vfio_group_fops);
364 	group->cdev.owner = THIS_MODULE;
365 
366 	refcount_set(&group->users, 1);
367 	init_rwsem(&group->group_rwsem);
368 	INIT_LIST_HEAD(&group->device_list);
369 	mutex_init(&group->device_lock);
370 	group->iommu_group = iommu_group;
371 	/* put in vfio_group_release() */
372 	iommu_group_ref_get(iommu_group);
373 	group->type = type;
374 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
375 
376 	return group;
377 }
378 
379 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
380 		enum vfio_group_type type)
381 {
382 	struct vfio_group *group;
383 	struct vfio_group *ret;
384 	int err;
385 
386 	group = vfio_group_alloc(iommu_group, type);
387 	if (IS_ERR(group))
388 		return group;
389 
390 	err = dev_set_name(&group->dev, "%s%d",
391 			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
392 			   iommu_group_id(iommu_group));
393 	if (err) {
394 		ret = ERR_PTR(err);
395 		goto err_put;
396 	}
397 
398 	mutex_lock(&vfio.group_lock);
399 
400 	/* Did we race creating this group? */
401 	ret = __vfio_group_get_from_iommu(iommu_group);
402 	if (ret)
403 		goto err_unlock;
404 
405 	err = cdev_device_add(&group->cdev, &group->dev);
406 	if (err) {
407 		ret = ERR_PTR(err);
408 		goto err_unlock;
409 	}
410 
411 	list_add(&group->vfio_next, &vfio.group_list);
412 
413 	mutex_unlock(&vfio.group_lock);
414 	return group;
415 
416 err_unlock:
417 	mutex_unlock(&vfio.group_lock);
418 err_put:
419 	put_device(&group->dev);
420 	return ret;
421 }
422 
/*
 * Drop a reference on @group.  When the count reaches zero the group is
 * unlinked from vfio.group_list and its character device removed, both under
 * vfio.group_lock, and the embedded device is put, which ends up in
 * vfio_group_release().
 */
static void vfio_group_put(struct vfio_group *group)
{
	/* Returns with vfio.group_lock held only when the count hit zero */
	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
		return;

	/*
	 * These data structures all have paired operations that can only be
	 * undone when the caller holds a live reference on the group. Since all
	 * pairs must be undone these WARN_ON's indicate some caller did not
	 * properly hold the group reference.
	 */
	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->container || group->container_users);
	WARN_ON(group->notifier.head);

	list_del(&group->vfio_next);
	cdev_device_del(&group->cdev, &group->dev);
	mutex_unlock(&vfio.group_lock);

	put_device(&group->dev);
}
444 
/* Take an additional reference on @group; paired with vfio_group_put(). */
static void vfio_group_get(struct vfio_group *group)
{
	refcount_inc(&group->users);
}
449 
450 /*
451  * Device objects - create, release, get, put, search
452  */
453 /* Device reference always implies a group reference */
454 static void vfio_device_put(struct vfio_device *device)
455 {
456 	if (refcount_dec_and_test(&device->refcount))
457 		complete(&device->comp);
458 }
459 
/*
 * Take a reference on @device unless its refcount has already dropped to
 * zero.  Returns true when the reference was obtained.
 */
static bool vfio_device_try_get(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}
464 
465 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
466 						 struct device *dev)
467 {
468 	struct vfio_device *device;
469 
470 	mutex_lock(&group->device_lock);
471 	list_for_each_entry(device, &group->device_list, group_next) {
472 		if (device->dev == dev && vfio_device_try_get(device)) {
473 			mutex_unlock(&group->device_lock);
474 			return device;
475 		}
476 	}
477 	mutex_unlock(&group->device_lock);
478 	return NULL;
479 }
480 
481 /*
482  * VFIO driver API
483  */
484 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
485 			 const struct vfio_device_ops *ops)
486 {
487 	init_completion(&device->comp);
488 	device->dev = dev;
489 	device->ops = ops;
490 }
491 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
492 
/*
 * vfio_uninit_group_dev - drop @device's membership in its device set
 *
 * Safe to call when no set was ever assigned: vfio_release_device_set()
 * returns early for a NULL dev_set.
 */
void vfio_uninit_group_dev(struct vfio_device *device)
{
	vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
498 
499 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
500 		enum vfio_group_type type)
501 {
502 	struct iommu_group *iommu_group;
503 	struct vfio_group *group;
504 	int ret;
505 
506 	iommu_group = iommu_group_alloc();
507 	if (IS_ERR(iommu_group))
508 		return ERR_CAST(iommu_group);
509 
510 	ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
511 	if (ret)
512 		goto out_put_group;
513 	ret = iommu_group_add_device(iommu_group, dev);
514 	if (ret)
515 		goto out_put_group;
516 
517 	group = vfio_create_group(iommu_group, type);
518 	if (IS_ERR(group)) {
519 		ret = PTR_ERR(group);
520 		goto out_remove_device;
521 	}
522 	iommu_group_put(iommu_group);
523 	return group;
524 
525 out_remove_device:
526 	iommu_group_remove_device(dev);
527 out_put_group:
528 	iommu_group_put(iommu_group);
529 	return ERR_PTR(ret);
530 }
531 
532 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
533 {
534 	struct iommu_group *iommu_group;
535 	struct vfio_group *group;
536 
537 	iommu_group = iommu_group_get(dev);
538 #ifdef CONFIG_VFIO_NOIOMMU
539 	if (!iommu_group && noiommu) {
540 		/*
541 		 * With noiommu enabled, create an IOMMU group for devices that
542 		 * don't already have one, implying no IOMMU hardware/driver
543 		 * exists.  Taint the kernel because we're about to give a DMA
544 		 * capable device to a user without IOMMU protection.
545 		 */
546 		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
547 		if (!IS_ERR(group)) {
548 			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
549 			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
550 		}
551 		return group;
552 	}
553 #endif
554 	if (!iommu_group)
555 		return ERR_PTR(-EINVAL);
556 
557 	/*
558 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
559 	 * restore cache coherency. It has to be checked here because it is only
560 	 * valid for cases where we are using iommu groups.
561 	 */
562 	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
563 		iommu_group_put(iommu_group);
564 		return ERR_PTR(-EINVAL);
565 	}
566 
567 	group = vfio_group_get_from_iommu(iommu_group);
568 	if (!group)
569 		group = vfio_create_group(iommu_group, VFIO_IOMMU);
570 
571 	/* The vfio_group holds a reference to the iommu_group */
572 	iommu_group_put(iommu_group);
573 	return group;
574 }
575 
576 static int __vfio_register_dev(struct vfio_device *device,
577 		struct vfio_group *group)
578 {
579 	struct vfio_device *existing_device;
580 
581 	if (IS_ERR(group))
582 		return PTR_ERR(group);
583 
584 	/*
585 	 * If the driver doesn't specify a set then the device is added to a
586 	 * singleton set just for itself.
587 	 */
588 	if (!device->dev_set)
589 		vfio_assign_device_set(device, device);
590 
591 	existing_device = vfio_group_get_device(group, device->dev);
592 	if (existing_device) {
593 		dev_WARN(device->dev, "Device already exists on group %d\n",
594 			 iommu_group_id(group->iommu_group));
595 		vfio_device_put(existing_device);
596 		if (group->type == VFIO_NO_IOMMU ||
597 		    group->type == VFIO_EMULATED_IOMMU)
598 			iommu_group_remove_device(device->dev);
599 		vfio_group_put(group);
600 		return -EBUSY;
601 	}
602 
603 	/* Our reference on group is moved to the device */
604 	device->group = group;
605 
606 	/* Refcounting can't start until the driver calls register */
607 	refcount_set(&device->refcount, 1);
608 
609 	mutex_lock(&group->device_lock);
610 	list_add(&device->group_next, &group->device_list);
611 	group->dev_counter++;
612 	mutex_unlock(&group->device_lock);
613 
614 	return 0;
615 }
616 
617 int vfio_register_group_dev(struct vfio_device *device)
618 {
619 	return __vfio_register_dev(device,
620 		vfio_group_find_or_alloc(device->dev));
621 }
622 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
623 
624 /*
625  * Register a virtual device without IOMMU backing.  The user of this
626  * device must not be able to directly trigger unmediated DMA.
627  */
628 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
629 {
630 	return __vfio_register_dev(device,
631 		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
632 }
633 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
634 
635 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
636 						     char *buf)
637 {
638 	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
639 
640 	mutex_lock(&group->device_lock);
641 	list_for_each_entry(it, &group->device_list, group_next) {
642 		int ret;
643 
644 		if (it->ops->match) {
645 			ret = it->ops->match(it, buf);
646 			if (ret < 0) {
647 				device = ERR_PTR(ret);
648 				break;
649 			}
650 		} else {
651 			ret = !strcmp(dev_name(it->dev), buf);
652 		}
653 
654 		if (ret && vfio_device_try_get(it)) {
655 			device = it;
656 			break;
657 		}
658 	}
659 	mutex_unlock(&group->device_lock);
660 
661 	return device;
662 }
663 
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Other holders of a device reference keep the count above zero,
 * so this blocks until the last of them calls vfio_device_put().  While
 * waiting, the driver's ->request() callback (if any) is invoked roughly
 * every ten seconds with an incrementing counter.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/* Drops the reference set up by __vfio_register_dev() */
	vfio_device_put(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		/*
		 * Wait interruptibly at first.  Once such a wait has returned
		 * a negative value, warn once and fall back to uninterruptible
		 * waits for the remaining iterations.
		 */
		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	/* These group types had the device added to a fabricated iommu_group */
	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
		iommu_group_remove_device(device->dev);

	/* Matches the get in vfio_register_group_dev() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
709 
710 /*
711  * VFIO base fd, /dev/vfio/vfio
712  */
713 static long vfio_ioctl_check_extension(struct vfio_container *container,
714 				       unsigned long arg)
715 {
716 	struct vfio_iommu_driver *driver;
717 	long ret = 0;
718 
719 	down_read(&container->group_lock);
720 
721 	driver = container->iommu_driver;
722 
723 	switch (arg) {
724 		/* No base extensions yet */
725 	default:
726 		/*
727 		 * If no driver is set, poll all registered drivers for
728 		 * extensions and return the first positive result.  If
729 		 * a driver is already set, further queries will be passed
730 		 * only to that driver.
731 		 */
732 		if (!driver) {
733 			mutex_lock(&vfio.iommu_drivers_lock);
734 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
735 					    vfio_next) {
736 
737 				if (!list_empty(&container->group_list) &&
738 				    !vfio_iommu_driver_allowed(container,
739 							       driver))
740 					continue;
741 				if (!try_module_get(driver->ops->owner))
742 					continue;
743 
744 				ret = driver->ops->ioctl(NULL,
745 							 VFIO_CHECK_EXTENSION,
746 							 arg);
747 				module_put(driver->ops->owner);
748 				if (ret > 0)
749 					break;
750 			}
751 			mutex_unlock(&vfio.iommu_drivers_lock);
752 		} else
753 			ret = driver->ops->ioctl(container->iommu_data,
754 						 VFIO_CHECK_EXTENSION, arg);
755 	}
756 
757 	up_read(&container->group_lock);
758 
759 	return ret;
760 }
761 
/*
 * Attach every group of @container to the backend instance @data of @driver.
 * If one attach fails, the groups attached so far are detached again in
 * reverse order and the error is returned.  Returns -ENODEV when the
 * container has no groups, 0 when all attaches succeeded.
 *
 * hold write lock on container->group_lock
 */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group,
						group->type);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	/* Walk back from the group that failed; it is not itself detached */
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
787 
/*
 * VFIO_SET_IOMMU on the container: pick the first registered backend that is
 * allowed for this container, supports extension @arg, opens successfully and
 * accepts all attached groups, and bind it to the container.
 *
 * Returns 0 on success, -EINVAL if the container has no groups or already has
 * a backend, -ENODEV if no backend was tried, or the error from the last
 * backend that failed in ->open() or while attaching groups.  The module
 * reference on the chosen backend is kept until the backend is released.
 */
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!vfio_iommu_driver_allowed(container, driver))
			continue;
		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			/* Undo ->open() and try the next backend */
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		/* Success: the module reference stays with the container */
		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
854 
855 static long vfio_fops_unl_ioctl(struct file *filep,
856 				unsigned int cmd, unsigned long arg)
857 {
858 	struct vfio_container *container = filep->private_data;
859 	struct vfio_iommu_driver *driver;
860 	void *data;
861 	long ret = -EINVAL;
862 
863 	if (!container)
864 		return ret;
865 
866 	switch (cmd) {
867 	case VFIO_GET_API_VERSION:
868 		ret = VFIO_API_VERSION;
869 		break;
870 	case VFIO_CHECK_EXTENSION:
871 		ret = vfio_ioctl_check_extension(container, arg);
872 		break;
873 	case VFIO_SET_IOMMU:
874 		ret = vfio_ioctl_set_iommu(container, arg);
875 		break;
876 	default:
877 		driver = container->iommu_driver;
878 		data = container->iommu_data;
879 
880 		if (driver) /* passthrough all unrecognized ioctls */
881 			ret = driver->ops->ioctl(data, cmd, arg);
882 	}
883 
884 	return ret;
885 }
886 
887 static int vfio_fops_open(struct inode *inode, struct file *filep)
888 {
889 	struct vfio_container *container;
890 
891 	container = kzalloc(sizeof(*container), GFP_KERNEL);
892 	if (!container)
893 		return -ENOMEM;
894 
895 	INIT_LIST_HEAD(&container->group_list);
896 	init_rwsem(&container->group_lock);
897 	kref_init(&container->kref);
898 
899 	filep->private_data = container;
900 
901 	return 0;
902 }
903 
904 static int vfio_fops_release(struct inode *inode, struct file *filep)
905 {
906 	struct vfio_container *container = filep->private_data;
907 	struct vfio_iommu_driver *driver = container->iommu_driver;
908 
909 	if (driver && driver->ops->notify)
910 		driver->ops->notify(container->iommu_data,
911 				    VFIO_IOMMU_CONTAINER_CLOSE);
912 
913 	filep->private_data = NULL;
914 
915 	vfio_container_put(container);
916 
917 	return 0;
918 }
919 
/*
 * File operations of the container file.  vfio_group_set_container() also
 * compares f_op against this table to verify that a passed-in fd really is a
 * VFIO container.
 */
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};
927 
928 /*
929  * VFIO Group fd, /dev/vfio/$GROUP
930  */
/*
 * Detach @group from its container unconditionally: detach it from the
 * bound backend, release DMA ownership for VFIO_IOMMU groups, unlink it and
 * drop the group's container reference.  If it was the last group, the
 * backend instance is released as well and the container returns to the
 * unset state.
 *
 * The caller must hold group->group_rwsem for write and group->container
 * must be non-NULL.
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	lockdep_assert_held_write(&group->group_rwsem);

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	/* Pairs with the claim in vfio_group_set_container() */
	if (group->type == VFIO_IOMMU)
		iommu_group_release_dma_owner(group->iommu_group);

	group->container = NULL;
	group->container_users = 0;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		/* Drops the module reference taken in vfio_ioctl_set_iommu() */
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	/* Pairs with vfio_container_get() in vfio_group_set_container() */
	vfio_container_put(container);
}
964 
/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 *
 * Returns 0 on success, -EINVAL if no container is attached, or -EBUSY if
 * container_users is not exactly 1 (devices are still assigned).  The caller
 * must hold group->group_rwsem for write.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container)
		return -EINVAL;
	if (group->container_users != 1)
		return -EBUSY;
	__vfio_group_unset_container(group);
	return 0;
}
982 
/*
 * Attach @group to the container behind @container_fd.
 *
 * For VFIO_IOMMU groups DMA ownership of the iommu_group is claimed first.
 * If the container already has a backend, the group is attached to it
 * immediately; otherwise that happens later in vfio_ioctl_set_iommu().  On
 * success the group holds a container reference and container_users is 1.
 *
 * Returns 0 on success; -EINVAL if the group already has a container or the
 * fd is not a VFIO container; -EBADF for an invalid fd; -EPERM for a noiommu
 * group without CAP_SYS_RAWIO or when mixing noiommu and regular groups; or
 * the error from claiming DMA ownership or from the backend's attach_group.
 *
 * The caller must hold group->group_rwsem for write.
 */
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	lockdep_assert_held_write(&group->group_rwsem);

	if (group->container || WARN_ON(group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
		ret = -EPERM;
		goto unlock_out;
	}

	/* Released again in __vfio_group_unset_container() */
	if (group->type == VFIO_IOMMU) {
		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
		if (ret)
			goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group,
						group->type);
		if (ret) {
			/* Give back the DMA ownership claimed just above */
			if (group->type == VFIO_IOMMU)
				iommu_group_release_dma_owner(
					group->iommu_group);
			goto unlock_out;
		}
	}

	group->container = container;
	group->container_users = 1;
	container->noiommu = (group->type == VFIO_NO_IOMMU);
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
1052 
1053 static const struct file_operations vfio_device_fops;
1054 
/*
 * true if the vfio_device has open_device() called but not close_device().
 * Emits a one-time warning and returns false when open_count is zero.
 */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}
1060 
1061 static int vfio_device_assign_container(struct vfio_device *device)
1062 {
1063 	struct vfio_group *group = device->group;
1064 
1065 	lockdep_assert_held_write(&group->group_rwsem);
1066 
1067 	if (!group->container || !group->container->iommu_driver ||
1068 	    WARN_ON(!group->container_users))
1069 		return -EINVAL;
1070 
1071 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
1072 		return -EPERM;
1073 
1074 	get_file(group->opened_file);
1075 	group->container_users++;
1076 	return 0;
1077 }
1078 
1079 static void vfio_device_unassign_container(struct vfio_device *device)
1080 {
1081 	down_write(&device->group->group_rwsem);
1082 	WARN_ON(device->group->container_users <= 1);
1083 	device->group->container_users--;
1084 	fput(device->group->opened_file);
1085 	up_write(&device->group->group_rwsem);
1086 }
1087 
1088 static struct file *vfio_device_open(struct vfio_device *device)
1089 {
1090 	struct vfio_iommu_driver *iommu_driver;
1091 	struct file *filep;
1092 	int ret;
1093 
1094 	down_write(&device->group->group_rwsem);
1095 	ret = vfio_device_assign_container(device);
1096 	up_write(&device->group->group_rwsem);
1097 	if (ret)
1098 		return ERR_PTR(ret);
1099 
1100 	if (!try_module_get(device->dev->driver->owner)) {
1101 		ret = -ENODEV;
1102 		goto err_unassign_container;
1103 	}
1104 
1105 	mutex_lock(&device->dev_set->lock);
1106 	device->open_count++;
1107 	if (device->open_count == 1) {
1108 		/*
1109 		 * Here we pass the KVM pointer with the group under the read
1110 		 * lock.  If the device driver will use it, it must obtain a
1111 		 * reference and release it during close_device.
1112 		 */
1113 		down_read(&device->group->group_rwsem);
1114 		device->kvm = device->group->kvm;
1115 
1116 		if (device->ops->open_device) {
1117 			ret = device->ops->open_device(device);
1118 			if (ret)
1119 				goto err_undo_count;
1120 		}
1121 
1122 		iommu_driver = device->group->container->iommu_driver;
1123 		if (iommu_driver && iommu_driver->ops->register_device)
1124 			iommu_driver->ops->register_device(
1125 				device->group->container->iommu_data, device);
1126 
1127 		up_read(&device->group->group_rwsem);
1128 	}
1129 	mutex_unlock(&device->dev_set->lock);
1130 
1131 	/*
1132 	 * We can't use anon_inode_getfd() because we need to modify
1133 	 * the f_mode flags directly to allow more than just ioctls
1134 	 */
1135 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1136 				   device, O_RDWR);
1137 	if (IS_ERR(filep)) {
1138 		ret = PTR_ERR(filep);
1139 		goto err_close_device;
1140 	}
1141 
1142 	/*
1143 	 * TODO: add an anon_inode interface to do this.
1144 	 * Appears to be missing by lack of need rather than
1145 	 * explicitly prevented.  Now there's need.
1146 	 */
1147 	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
1148 
1149 	if (device->group->type == VFIO_NO_IOMMU)
1150 		dev_warn(device->dev, "vfio-noiommu device opened by user "
1151 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1152 	/*
1153 	 * On success the ref of device is moved to the file and
1154 	 * put in vfio_device_fops_release()
1155 	 */
1156 	return filep;
1157 
1158 err_close_device:
1159 	mutex_lock(&device->dev_set->lock);
1160 	down_read(&device->group->group_rwsem);
1161 	if (device->open_count == 1 && device->ops->close_device) {
1162 		device->ops->close_device(device);
1163 
1164 		iommu_driver = device->group->container->iommu_driver;
1165 		if (iommu_driver && iommu_driver->ops->unregister_device)
1166 			iommu_driver->ops->unregister_device(
1167 				device->group->container->iommu_data, device);
1168 	}
1169 err_undo_count:
1170 	up_read(&device->group->group_rwsem);
1171 	device->open_count--;
1172 	if (device->open_count == 0 && device->kvm)
1173 		device->kvm = NULL;
1174 	mutex_unlock(&device->dev_set->lock);
1175 	module_put(device->dev->driver->owner);
1176 err_unassign_container:
1177 	vfio_device_unassign_container(device);
1178 	return ERR_PTR(ret);
1179 }
1180 
1181 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1182 {
1183 	struct vfio_device *device;
1184 	struct file *filep;
1185 	int fdno;
1186 	int ret;
1187 
1188 	device = vfio_device_get_from_name(group, buf);
1189 	if (IS_ERR(device))
1190 		return PTR_ERR(device);
1191 
1192 	fdno = get_unused_fd_flags(O_CLOEXEC);
1193 	if (fdno < 0) {
1194 		ret = fdno;
1195 		goto err_put_device;
1196 	}
1197 
1198 	filep = vfio_device_open(device);
1199 	if (IS_ERR(filep)) {
1200 		ret = PTR_ERR(filep);
1201 		goto err_put_fdno;
1202 	}
1203 
1204 	fd_install(fdno, filep);
1205 	return fdno;
1206 
1207 err_put_fdno:
1208 	put_unused_fd(fdno);
1209 err_put_device:
1210 	vfio_device_put(device);
1211 	return ret;
1212 }
1213 
1214 static long vfio_group_fops_unl_ioctl(struct file *filep,
1215 				      unsigned int cmd, unsigned long arg)
1216 {
1217 	struct vfio_group *group = filep->private_data;
1218 	long ret = -ENOTTY;
1219 
1220 	switch (cmd) {
1221 	case VFIO_GROUP_GET_STATUS:
1222 	{
1223 		struct vfio_group_status status;
1224 		unsigned long minsz;
1225 
1226 		minsz = offsetofend(struct vfio_group_status, flags);
1227 
1228 		if (copy_from_user(&status, (void __user *)arg, minsz))
1229 			return -EFAULT;
1230 
1231 		if (status.argsz < minsz)
1232 			return -EINVAL;
1233 
1234 		status.flags = 0;
1235 
1236 		down_read(&group->group_rwsem);
1237 		if (group->container)
1238 			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
1239 					VFIO_GROUP_FLAGS_VIABLE;
1240 		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
1241 			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1242 		up_read(&group->group_rwsem);
1243 
1244 		if (copy_to_user((void __user *)arg, &status, minsz))
1245 			return -EFAULT;
1246 
1247 		ret = 0;
1248 		break;
1249 	}
1250 	case VFIO_GROUP_SET_CONTAINER:
1251 	{
1252 		int fd;
1253 
1254 		if (get_user(fd, (int __user *)arg))
1255 			return -EFAULT;
1256 
1257 		if (fd < 0)
1258 			return -EINVAL;
1259 
1260 		down_write(&group->group_rwsem);
1261 		ret = vfio_group_set_container(group, fd);
1262 		up_write(&group->group_rwsem);
1263 		break;
1264 	}
1265 	case VFIO_GROUP_UNSET_CONTAINER:
1266 		down_write(&group->group_rwsem);
1267 		ret = vfio_group_unset_container(group);
1268 		up_write(&group->group_rwsem);
1269 		break;
1270 	case VFIO_GROUP_GET_DEVICE_FD:
1271 	{
1272 		char *buf;
1273 
1274 		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1275 		if (IS_ERR(buf))
1276 			return PTR_ERR(buf);
1277 
1278 		ret = vfio_group_get_device_fd(group, buf);
1279 		kfree(buf);
1280 		break;
1281 	}
1282 	}
1283 
1284 	return ret;
1285 }
1286 
1287 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1288 {
1289 	struct vfio_group *group =
1290 		container_of(inode->i_cdev, struct vfio_group, cdev);
1291 	int ret;
1292 
1293 	down_write(&group->group_rwsem);
1294 
1295 	/* users can be zero if this races with vfio_group_put() */
1296 	if (!refcount_inc_not_zero(&group->users)) {
1297 		ret = -ENODEV;
1298 		goto err_unlock;
1299 	}
1300 
1301 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1302 		ret = -EPERM;
1303 		goto err_put;
1304 	}
1305 
1306 	/*
1307 	 * Do we need multiple instances of the group open?  Seems not.
1308 	 */
1309 	if (group->opened_file) {
1310 		ret = -EBUSY;
1311 		goto err_put;
1312 	}
1313 	group->opened_file = filep;
1314 	filep->private_data = group;
1315 
1316 	up_write(&group->group_rwsem);
1317 	return 0;
1318 err_put:
1319 	vfio_group_put(group);
1320 err_unlock:
1321 	up_write(&group->group_rwsem);
1322 	return ret;
1323 }
1324 
1325 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1326 {
1327 	struct vfio_group *group = filep->private_data;
1328 
1329 	filep->private_data = NULL;
1330 
1331 	down_write(&group->group_rwsem);
1332 	/*
1333 	 * Device FDs hold a group file reference, therefore the group release
1334 	 * is only called when there are no open devices.
1335 	 */
1336 	WARN_ON(group->notifier.head);
1337 	if (group->container) {
1338 		WARN_ON(group->container_users != 1);
1339 		__vfio_group_unset_container(group);
1340 	}
1341 	group->opened_file = NULL;
1342 	up_write(&group->group_rwsem);
1343 
1344 	vfio_group_put(group);
1345 
1346 	return 0;
1347 }
1348 
1349 static const struct file_operations vfio_group_fops = {
1350 	.owner		= THIS_MODULE,
1351 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1352 	.compat_ioctl	= compat_ptr_ioctl,
1353 	.open		= vfio_group_fops_open,
1354 	.release	= vfio_group_fops_release,
1355 };
1356 
1357 /*
1358  * VFIO Device fd
1359  */
1360 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1361 {
1362 	struct vfio_device *device = filep->private_data;
1363 	struct vfio_iommu_driver *iommu_driver;
1364 
1365 	mutex_lock(&device->dev_set->lock);
1366 	vfio_assert_device_open(device);
1367 	down_read(&device->group->group_rwsem);
1368 	if (device->open_count == 1 && device->ops->close_device)
1369 		device->ops->close_device(device);
1370 
1371 	iommu_driver = device->group->container->iommu_driver;
1372 	if (iommu_driver && iommu_driver->ops->unregister_device)
1373 		iommu_driver->ops->unregister_device(
1374 			device->group->container->iommu_data, device);
1375 	up_read(&device->group->group_rwsem);
1376 	device->open_count--;
1377 	if (device->open_count == 0)
1378 		device->kvm = NULL;
1379 	mutex_unlock(&device->dev_set->lock);
1380 
1381 	module_put(device->dev->driver->owner);
1382 
1383 	vfio_device_unassign_container(device);
1384 
1385 	vfio_device_put(device);
1386 
1387 	return 0;
1388 }
1389 
1390 /*
1391  * vfio_mig_get_next_state - Compute the next step in the FSM
1392  * @cur_fsm - The current state the device is in
1393  * @new_fsm - The target state to reach
1394  * @next_fsm - Pointer to the next step to get to new_fsm
1395  *
1396  * Return 0 upon success, otherwise -errno
1397  * Upon success the next step in the state progression between cur_fsm and
1398  * new_fsm will be set in next_fsm.
1399  *
1400  * This breaks down requests for combination transitions into smaller steps and
1401  * returns the next step to get to new_fsm. The function may need to be called
1402  * multiple times before reaching new_fsm.
1403  *
1404  */
1405 int vfio_mig_get_next_state(struct vfio_device *device,
1406 			    enum vfio_device_mig_state cur_fsm,
1407 			    enum vfio_device_mig_state new_fsm,
1408 			    enum vfio_device_mig_state *next_fsm)
1409 {
1410 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1411 	/*
1412 	 * The coding in this table requires the driver to implement the
1413 	 * following FSM arcs:
1414 	 *         RESUMING -> STOP
1415 	 *         STOP -> RESUMING
1416 	 *         STOP -> STOP_COPY
1417 	 *         STOP_COPY -> STOP
1418 	 *
1419 	 * If P2P is supported then the driver must also implement these FSM
1420 	 * arcs:
1421 	 *         RUNNING -> RUNNING_P2P
1422 	 *         RUNNING_P2P -> RUNNING
1423 	 *         RUNNING_P2P -> STOP
1424 	 *         STOP -> RUNNING_P2P
1425 	 * Without P2P the driver must implement:
1426 	 *         RUNNING -> STOP
1427 	 *         STOP -> RUNNING
1428 	 *
1429 	 * The coding will step through multiple states for some combination
1430 	 * transitions; if all optional features are supported, this means the
1431 	 * following ones:
1432 	 *         RESUMING -> STOP -> RUNNING_P2P
1433 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1434 	 *         RESUMING -> STOP -> STOP_COPY
1435 	 *         RUNNING -> RUNNING_P2P -> STOP
1436 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1437 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1438 	 *         RUNNING_P2P -> STOP -> RESUMING
1439 	 *         RUNNING_P2P -> STOP -> STOP_COPY
1440 	 *         STOP -> RUNNING_P2P -> RUNNING
1441 	 *         STOP_COPY -> STOP -> RESUMING
1442 	 *         STOP_COPY -> STOP -> RUNNING_P2P
1443 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1444 	 */
1445 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1446 		[VFIO_DEVICE_STATE_STOP] = {
1447 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1448 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1449 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1450 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1451 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1452 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1453 		},
1454 		[VFIO_DEVICE_STATE_RUNNING] = {
1455 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1456 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1457 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1458 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1459 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1460 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1461 		},
1462 		[VFIO_DEVICE_STATE_STOP_COPY] = {
1463 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1464 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1465 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1466 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1467 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1468 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1469 		},
1470 		[VFIO_DEVICE_STATE_RESUMING] = {
1471 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1472 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1473 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1474 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1475 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1476 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1477 		},
1478 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
1479 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1480 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1481 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1482 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1483 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1484 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1485 		},
1486 		[VFIO_DEVICE_STATE_ERROR] = {
1487 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1488 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1489 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1490 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1491 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1492 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1493 		},
1494 	};
1495 
1496 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1497 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1498 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1499 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1500 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1501 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
1502 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1503 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
1504 	};
1505 
1506 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1507 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
1508 			state_flags_table[cur_fsm]))
1509 		return -EINVAL;
1510 
1511 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1512 	   (state_flags_table[new_fsm] & device->migration_flags) !=
1513 			state_flags_table[new_fsm])
1514 		return -EINVAL;
1515 
1516 	/*
1517 	 * Arcs touching optional and unsupported states are skipped over. The
1518 	 * driver will instead see an arc from the original state to the next
1519 	 * logical state, as per the above comment.
1520 	 */
1521 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1522 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1523 			state_flags_table[*next_fsm])
1524 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1525 
1526 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1527 }
1528 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
1529 
1530 /*
1531  * Convert the drivers's struct file into a FD number and return it to userspace
1532  */
1533 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1534 				   struct vfio_device_feature_mig_state *mig)
1535 {
1536 	int ret;
1537 	int fd;
1538 
1539 	fd = get_unused_fd_flags(O_CLOEXEC);
1540 	if (fd < 0) {
1541 		ret = fd;
1542 		goto out_fput;
1543 	}
1544 
1545 	mig->data_fd = fd;
1546 	if (copy_to_user(arg, mig, sizeof(*mig))) {
1547 		ret = -EFAULT;
1548 		goto out_put_unused;
1549 	}
1550 	fd_install(fd, filp);
1551 	return 0;
1552 
1553 out_put_unused:
1554 	put_unused_fd(fd);
1555 out_fput:
1556 	fput(filp);
1557 	return ret;
1558 }
1559 
1560 static int
1561 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1562 					   u32 flags, void __user *arg,
1563 					   size_t argsz)
1564 {
1565 	size_t minsz =
1566 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
1567 	struct vfio_device_feature_mig_state mig;
1568 	struct file *filp = NULL;
1569 	int ret;
1570 
1571 	if (!device->mig_ops)
1572 		return -ENOTTY;
1573 
1574 	ret = vfio_check_feature(flags, argsz,
1575 				 VFIO_DEVICE_FEATURE_SET |
1576 				 VFIO_DEVICE_FEATURE_GET,
1577 				 sizeof(mig));
1578 	if (ret != 1)
1579 		return ret;
1580 
1581 	if (copy_from_user(&mig, arg, minsz))
1582 		return -EFAULT;
1583 
1584 	if (flags & VFIO_DEVICE_FEATURE_GET) {
1585 		enum vfio_device_mig_state curr_state;
1586 
1587 		ret = device->mig_ops->migration_get_state(device,
1588 							   &curr_state);
1589 		if (ret)
1590 			return ret;
1591 		mig.device_state = curr_state;
1592 		goto out_copy;
1593 	}
1594 
1595 	/* Handle the VFIO_DEVICE_FEATURE_SET */
1596 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
1597 	if (IS_ERR(filp) || !filp)
1598 		goto out_copy;
1599 
1600 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
1601 out_copy:
1602 	mig.data_fd = -1;
1603 	if (copy_to_user(arg, &mig, sizeof(mig)))
1604 		return -EFAULT;
1605 	if (IS_ERR(filp))
1606 		return PTR_ERR(filp);
1607 	return 0;
1608 }
1609 
1610 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1611 					       u32 flags, void __user *arg,
1612 					       size_t argsz)
1613 {
1614 	struct vfio_device_feature_migration mig = {
1615 		.flags = device->migration_flags,
1616 	};
1617 	int ret;
1618 
1619 	if (!device->mig_ops)
1620 		return -ENOTTY;
1621 
1622 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1623 				 sizeof(mig));
1624 	if (ret != 1)
1625 		return ret;
1626 	if (copy_to_user(arg, &mig, sizeof(mig)))
1627 		return -EFAULT;
1628 	return 0;
1629 }
1630 
1631 static int vfio_ioctl_device_feature(struct vfio_device *device,
1632 				     struct vfio_device_feature __user *arg)
1633 {
1634 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1635 	struct vfio_device_feature feature;
1636 
1637 	if (copy_from_user(&feature, arg, minsz))
1638 		return -EFAULT;
1639 
1640 	if (feature.argsz < minsz)
1641 		return -EINVAL;
1642 
1643 	/* Check unknown flags */
1644 	if (feature.flags &
1645 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1646 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1647 		return -EINVAL;
1648 
1649 	/* GET & SET are mutually exclusive except with PROBE */
1650 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1651 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1652 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1653 		return -EINVAL;
1654 
1655 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1656 	case VFIO_DEVICE_FEATURE_MIGRATION:
1657 		return vfio_ioctl_device_feature_migration(
1658 			device, feature.flags, arg->data,
1659 			feature.argsz - minsz);
1660 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1661 		return vfio_ioctl_device_feature_mig_device_state(
1662 			device, feature.flags, arg->data,
1663 			feature.argsz - minsz);
1664 	default:
1665 		if (unlikely(!device->ops->device_feature))
1666 			return -EINVAL;
1667 		return device->ops->device_feature(device, feature.flags,
1668 						   arg->data,
1669 						   feature.argsz - minsz);
1670 	}
1671 }
1672 
1673 static long vfio_device_fops_unl_ioctl(struct file *filep,
1674 				       unsigned int cmd, unsigned long arg)
1675 {
1676 	struct vfio_device *device = filep->private_data;
1677 
1678 	switch (cmd) {
1679 	case VFIO_DEVICE_FEATURE:
1680 		return vfio_ioctl_device_feature(device, (void __user *)arg);
1681 	default:
1682 		if (unlikely(!device->ops->ioctl))
1683 			return -EINVAL;
1684 		return device->ops->ioctl(device, cmd, arg);
1685 	}
1686 }
1687 
1688 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1689 				     size_t count, loff_t *ppos)
1690 {
1691 	struct vfio_device *device = filep->private_data;
1692 
1693 	if (unlikely(!device->ops->read))
1694 		return -EINVAL;
1695 
1696 	return device->ops->read(device, buf, count, ppos);
1697 }
1698 
1699 static ssize_t vfio_device_fops_write(struct file *filep,
1700 				      const char __user *buf,
1701 				      size_t count, loff_t *ppos)
1702 {
1703 	struct vfio_device *device = filep->private_data;
1704 
1705 	if (unlikely(!device->ops->write))
1706 		return -EINVAL;
1707 
1708 	return device->ops->write(device, buf, count, ppos);
1709 }
1710 
1711 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1712 {
1713 	struct vfio_device *device = filep->private_data;
1714 
1715 	if (unlikely(!device->ops->mmap))
1716 		return -EINVAL;
1717 
1718 	return device->ops->mmap(device, vma);
1719 }
1720 
1721 static const struct file_operations vfio_device_fops = {
1722 	.owner		= THIS_MODULE,
1723 	.release	= vfio_device_fops_release,
1724 	.read		= vfio_device_fops_read,
1725 	.write		= vfio_device_fops_write,
1726 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1727 	.compat_ioctl	= compat_ptr_ioctl,
1728 	.mmap		= vfio_device_fops_mmap,
1729 };
1730 
1731 /**
1732  * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
1733  * @file: VFIO group file
1734  *
1735  * The returned iommu_group is valid as long as a ref is held on the file.
1736  */
1737 struct iommu_group *vfio_file_iommu_group(struct file *file)
1738 {
1739 	struct vfio_group *group = file->private_data;
1740 
1741 	if (file->f_op != &vfio_group_fops)
1742 		return NULL;
1743 	return group->iommu_group;
1744 }
1745 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
1746 
1747 /**
1748  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1749  *        is always CPU cache coherent
1750  * @file: VFIO group file
1751  *
1752  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1753  * bit in DMA transactions. A return of false indicates that the user has
1754  * rights to access additional instructions such as wbinvd on x86.
1755  */
1756 bool vfio_file_enforced_coherent(struct file *file)
1757 {
1758 	struct vfio_group *group = file->private_data;
1759 	bool ret;
1760 
1761 	if (file->f_op != &vfio_group_fops)
1762 		return true;
1763 
1764 	down_read(&group->group_rwsem);
1765 	if (group->container) {
1766 		ret = vfio_ioctl_check_extension(group->container,
1767 						 VFIO_DMA_CC_IOMMU);
1768 	} else {
1769 		/*
1770 		 * Since the coherency state is determined only once a container
1771 		 * is attached the user must do so before they can prove they
1772 		 * have permission.
1773 		 */
1774 		ret = true;
1775 	}
1776 	up_read(&group->group_rwsem);
1777 	return ret;
1778 }
1779 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1780 
1781 /**
1782  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1783  * @file: VFIO group file
1784  * @kvm: KVM to link
1785  *
1786  * When a VFIO device is first opened the KVM will be available in
1787  * device->kvm if one was associated with the group.
1788  */
1789 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1790 {
1791 	struct vfio_group *group = file->private_data;
1792 
1793 	if (file->f_op != &vfio_group_fops)
1794 		return;
1795 
1796 	down_write(&group->group_rwsem);
1797 	group->kvm = kvm;
1798 	up_write(&group->group_rwsem);
1799 }
1800 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1801 
1802 /**
1803  * vfio_file_has_dev - True if the VFIO file is a handle for device
1804  * @file: VFIO file to check
1805  * @device: Device that must be part of the file
1806  *
1807  * Returns true if given file has permission to manipulate the given device.
1808  */
1809 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
1810 {
1811 	struct vfio_group *group = file->private_data;
1812 
1813 	if (file->f_op != &vfio_group_fops)
1814 		return false;
1815 
1816 	return group == device->group;
1817 }
1818 EXPORT_SYMBOL_GPL(vfio_file_has_dev);
1819 
1820 /*
1821  * Sub-module support
1822  */
1823 /*
1824  * Helper for managing a buffer of info chain capabilities, allocate or
1825  * reallocate a buffer with additional @size, filling in @id and @version
1826  * of the capability.  A pointer to the new capability is returned.
1827  *
1828  * NB. The chain is based at the head of the buffer, so new entries are
1829  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1830  * next offsets prior to copying to the user buffer.
1831  */
1832 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1833 					       size_t size, u16 id, u16 version)
1834 {
1835 	void *buf;
1836 	struct vfio_info_cap_header *header, *tmp;
1837 
1838 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1839 	if (!buf) {
1840 		kfree(caps->buf);
1841 		caps->buf = NULL;
1842 		caps->size = 0;
1843 		return ERR_PTR(-ENOMEM);
1844 	}
1845 
1846 	caps->buf = buf;
1847 	header = buf + caps->size;
1848 
1849 	/* Eventually copied to user buffer, zero */
1850 	memset(header, 0, size);
1851 
1852 	header->id = id;
1853 	header->version = version;
1854 
1855 	/* Add to the end of the capability chain */
1856 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1857 		; /* nothing */
1858 
1859 	tmp->next = caps->size;
1860 	caps->size += size;
1861 
1862 	return header;
1863 }
1864 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1865 
1866 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1867 {
1868 	struct vfio_info_cap_header *tmp;
1869 	void *buf = (void *)caps->buf;
1870 
1871 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1872 		tmp->next += offset;
1873 }
1874 EXPORT_SYMBOL(vfio_info_cap_shift);
1875 
1876 int vfio_info_add_capability(struct vfio_info_cap *caps,
1877 			     struct vfio_info_cap_header *cap, size_t size)
1878 {
1879 	struct vfio_info_cap_header *header;
1880 
1881 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1882 	if (IS_ERR(header))
1883 		return PTR_ERR(header);
1884 
1885 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1886 
1887 	return 0;
1888 }
1889 EXPORT_SYMBOL(vfio_info_add_capability);
1890 
1891 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1892 				       int max_irq_type, size_t *data_size)
1893 {
1894 	unsigned long minsz;
1895 	size_t size;
1896 
1897 	minsz = offsetofend(struct vfio_irq_set, count);
1898 
1899 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1900 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1901 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1902 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1903 		return -EINVAL;
1904 
1905 	if (data_size)
1906 		*data_size = 0;
1907 
1908 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1909 		return -EINVAL;
1910 
1911 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1912 	case VFIO_IRQ_SET_DATA_NONE:
1913 		size = 0;
1914 		break;
1915 	case VFIO_IRQ_SET_DATA_BOOL:
1916 		size = sizeof(uint8_t);
1917 		break;
1918 	case VFIO_IRQ_SET_DATA_EVENTFD:
1919 		size = sizeof(int32_t);
1920 		break;
1921 	default:
1922 		return -EINVAL;
1923 	}
1924 
1925 	if (size) {
1926 		if (hdr->argsz - minsz < hdr->count * size)
1927 			return -EINVAL;
1928 
1929 		if (!data_size)
1930 			return -EINVAL;
1931 
1932 		*data_size = hdr->count * size;
1933 	}
1934 
1935 	return 0;
1936 }
1937 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1938 
1939 /*
1940  * Pin contiguous user pages and return their associated host pages for local
1941  * domain only.
1942  * @device [in]  : device
1943  * @iova [in]    : starting IOVA of user pages to be pinned.
1944  * @npage [in]   : count of pages to be pinned.  This count should not
1945  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1946  * @prot [in]    : protection flags
1947  * @pages[out]   : array of host pages
1948  * Return error or number of pages pinned.
1949  */
1950 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1951 		   int npage, int prot, struct page **pages)
1952 {
1953 	struct vfio_container *container;
1954 	struct vfio_group *group = device->group;
1955 	struct vfio_iommu_driver *driver;
1956 	int ret;
1957 
1958 	if (!pages || !npage || !vfio_assert_device_open(device))
1959 		return -EINVAL;
1960 
1961 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1962 		return -E2BIG;
1963 
1964 	if (group->dev_counter > 1)
1965 		return -EINVAL;
1966 
1967 	/* group->container cannot change while a vfio device is open */
1968 	container = group->container;
1969 	driver = container->iommu_driver;
1970 	if (likely(driver && driver->ops->pin_pages))
1971 		ret = driver->ops->pin_pages(container->iommu_data,
1972 					     group->iommu_group, iova,
1973 					     npage, prot, pages);
1974 	else
1975 		ret = -ENOTTY;
1976 
1977 	return ret;
1978 }
1979 EXPORT_SYMBOL(vfio_pin_pages);
1980 
1981 /*
1982  * Unpin contiguous host pages for local domain only.
1983  * @device [in]  : device
1984  * @iova [in]    : starting address of user pages to be unpinned.
1985  * @npage [in]   : count of pages to be unpinned.  This count should not
1986  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1987  */
1988 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1989 {
1990 	struct vfio_container *container;
1991 	struct vfio_iommu_driver *driver;
1992 
1993 	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
1994 		return;
1995 
1996 	if (WARN_ON(!vfio_assert_device_open(device)))
1997 		return;
1998 
1999 	/* group->container cannot change while a vfio device is open */
2000 	container = device->group->container;
2001 	driver = container->iommu_driver;
2002 
2003 	driver->ops->unpin_pages(container->iommu_data, iova, npage);
2004 }
2005 EXPORT_SYMBOL(vfio_unpin_pages);
2006 
2007 /*
2008  * This interface allows the CPUs to perform some sort of virtual DMA on
2009  * behalf of the device.
2010  *
2011  * CPUs read/write from/into a range of IOVAs pointing to user space memory
2012  * into/from a kernel buffer.
2013  *
2014  * As the read/write of user space memory is conducted via the CPUs and is
2015  * not a real device DMA, it is not necessary to pin the user space memory.
2016  *
2017  * @device [in]		: VFIO device
2018  * @iova [in]		: base IOVA of a user space buffer
2019  * @data [in]		: pointer to kernel buffer
2020  * @len [in]		: kernel buffer length
2021  * @write		: indicate read or write
2022  * Return error code on failure or 0 on success.
2023  */
2024 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
2025 		size_t len, bool write)
2026 {
2027 	struct vfio_container *container;
2028 	struct vfio_iommu_driver *driver;
2029 	int ret = 0;
2030 
2031 	if (!data || len <= 0 || !vfio_assert_device_open(device))
2032 		return -EINVAL;
2033 
2034 	/* group->container cannot change while a vfio device is open */
2035 	container = device->group->container;
2036 	driver = container->iommu_driver;
2037 
2038 	if (likely(driver && driver->ops->dma_rw))
2039 		ret = driver->ops->dma_rw(container->iommu_data,
2040 					  iova, data, len, write);
2041 	else
2042 		ret = -ENOTTY;
2043 	return ret;
2044 }
2045 EXPORT_SYMBOL(vfio_dma_rw);
2046 
2047 /*
2048  * Module/class support
2049  */
2050 static char *vfio_devnode(struct device *dev, umode_t *mode)
2051 {
2052 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2053 }
2054 
2055 static struct miscdevice vfio_dev = {
2056 	.minor = VFIO_MINOR,
2057 	.name = "vfio",
2058 	.fops = &vfio_fops,
2059 	.nodename = "vfio/vfio",
2060 	.mode = S_IRUGO | S_IWUGO,
2061 };
2062 
2063 static int __init vfio_init(void)
2064 {
2065 	int ret;
2066 
2067 	ida_init(&vfio.group_ida);
2068 	mutex_init(&vfio.group_lock);
2069 	mutex_init(&vfio.iommu_drivers_lock);
2070 	INIT_LIST_HEAD(&vfio.group_list);
2071 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2072 
2073 	ret = misc_register(&vfio_dev);
2074 	if (ret) {
2075 		pr_err("vfio: misc device register failed\n");
2076 		return ret;
2077 	}
2078 
2079 	/* /dev/vfio/$GROUP */
2080 	vfio.class = class_create(THIS_MODULE, "vfio");
2081 	if (IS_ERR(vfio.class)) {
2082 		ret = PTR_ERR(vfio.class);
2083 		goto err_class;
2084 	}
2085 
2086 	vfio.class->devnode = vfio_devnode;
2087 
2088 	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2089 	if (ret)
2090 		goto err_alloc_chrdev;
2091 
2092 #ifdef CONFIG_VFIO_NOIOMMU
2093 	ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
2094 #endif
2095 	if (ret)
2096 		goto err_driver_register;
2097 
2098 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2099 	return 0;
2100 
2101 err_driver_register:
2102 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2103 err_alloc_chrdev:
2104 	class_destroy(vfio.class);
2105 	vfio.class = NULL;
2106 err_class:
2107 	misc_deregister(&vfio_dev);
2108 	return ret;
2109 }
2110 
2111 static void __exit vfio_cleanup(void)
2112 {
2113 	WARN_ON(!list_empty(&vfio.group_list));
2114 
2115 #ifdef CONFIG_VFIO_NOIOMMU
2116 	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2117 #endif
2118 	ida_destroy(&vfio.group_ida);
2119 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2120 	class_destroy(vfio.class);
2121 	vfio.class = NULL;
2122 	misc_deregister(&vfio_dev);
2123 	xa_destroy(&vfio_device_set_xa);
2124 }
2125 
2126 module_init(vfio_init);
2127 module_exit(vfio_cleanup);
2128 
2129 MODULE_VERSION(DRIVER_VERSION);
2130 MODULE_LICENSE("GPL v2");
2131 MODULE_AUTHOR(DRIVER_AUTHOR);
2132 MODULE_DESCRIPTION(DRIVER_DESC);
2133 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2134 MODULE_ALIAS("devname:vfio/vfio");
2135 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2136