xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision bdef2b78)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 #include <linux/pm_runtime.h>
36 #include <linux/interval_tree.h>
37 #include <linux/iova_bitmap.h>
38 #include "vfio.h"
39 
40 #define DRIVER_VERSION	"0.3"
41 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
42 #define DRIVER_DESC	"VFIO - User Level meta-driver"
43 
44 static struct vfio {
45 	struct class			*class;
46 	struct list_head		group_list;
47 	struct mutex			group_lock; /* locks group_list */
48 	struct ida			group_ida;
49 	dev_t				group_devt;
50 	struct class			*device_class;
51 	struct ida			device_ida;
52 } vfio;
53 
54 static DEFINE_XARRAY(vfio_device_set_xa);
55 static const struct file_operations vfio_group_fops;
56 
57 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
58 {
59 	unsigned long idx = (unsigned long)set_id;
60 	struct vfio_device_set *new_dev_set;
61 	struct vfio_device_set *dev_set;
62 
63 	if (WARN_ON(!set_id))
64 		return -EINVAL;
65 
66 	/*
67 	 * Atomically acquire a singleton object in the xarray for this set_id
68 	 */
69 	xa_lock(&vfio_device_set_xa);
70 	dev_set = xa_load(&vfio_device_set_xa, idx);
71 	if (dev_set)
72 		goto found_get_ref;
73 	xa_unlock(&vfio_device_set_xa);
74 
75 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
76 	if (!new_dev_set)
77 		return -ENOMEM;
78 	mutex_init(&new_dev_set->lock);
79 	INIT_LIST_HEAD(&new_dev_set->device_list);
80 	new_dev_set->set_id = set_id;
81 
82 	xa_lock(&vfio_device_set_xa);
83 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
84 			       GFP_KERNEL);
85 	if (!dev_set) {
86 		dev_set = new_dev_set;
87 		goto found_get_ref;
88 	}
89 
90 	kfree(new_dev_set);
91 	if (xa_is_err(dev_set)) {
92 		xa_unlock(&vfio_device_set_xa);
93 		return xa_err(dev_set);
94 	}
95 
96 found_get_ref:
97 	dev_set->device_count++;
98 	xa_unlock(&vfio_device_set_xa);
99 	mutex_lock(&dev_set->lock);
100 	device->dev_set = dev_set;
101 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
102 	mutex_unlock(&dev_set->lock);
103 	return 0;
104 }
105 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
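/*
 * Usage sketch (illustrative only, not part of this file): a driver that
 * wants every device sharing a reset domain to land in one vfio_device_set
 * passes a stable kernel pointer describing that domain as set_id, for
 * example keyed on the PCI bus.  The my_probe_one() name below is
 * hypothetical.
 *
 *	static int my_probe_one(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		// Devices on the same bus share a dev_set, so
 *		// dev_set->lock serializes their open/close/reset paths.
 *		return vfio_assign_device_set(vdev, pdev->bus);
 *	}
 */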
106 
107 static void vfio_release_device_set(struct vfio_device *device)
108 {
109 	struct vfio_device_set *dev_set = device->dev_set;
110 
111 	if (!dev_set)
112 		return;
113 
114 	mutex_lock(&dev_set->lock);
115 	list_del(&device->dev_set_list);
116 	mutex_unlock(&dev_set->lock);
117 
118 	xa_lock(&vfio_device_set_xa);
119 	if (!--dev_set->device_count) {
120 		__xa_erase(&vfio_device_set_xa,
121 			   (unsigned long)dev_set->set_id);
122 		mutex_destroy(&dev_set->lock);
123 		kfree(dev_set);
124 	}
125 	xa_unlock(&vfio_device_set_xa);
126 }
127 
128 /*
129  * Group objects - create, release, get, put, search
130  */
131 static struct vfio_group *
132 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
133 {
134 	struct vfio_group *group;
135 
136 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
137 		if (group->iommu_group == iommu_group) {
138 			refcount_inc(&group->drivers);
139 			return group;
140 		}
141 	}
142 	return NULL;
143 }
144 
145 static struct vfio_group *
146 vfio_group_get_from_iommu(struct iommu_group *iommu_group)
147 {
148 	struct vfio_group *group;
149 
150 	mutex_lock(&vfio.group_lock);
151 	group = __vfio_group_get_from_iommu(iommu_group);
152 	mutex_unlock(&vfio.group_lock);
153 	return group;
154 }
155 
156 static void vfio_group_release(struct device *dev)
157 {
158 	struct vfio_group *group = container_of(dev, struct vfio_group, dev);
159 
160 	mutex_destroy(&group->device_lock);
161 	iommu_group_put(group->iommu_group);
162 	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
163 	kfree(group);
164 }
165 
166 static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
167 					   enum vfio_group_type type)
168 {
169 	struct vfio_group *group;
170 	int minor;
171 
172 	group = kzalloc(sizeof(*group), GFP_KERNEL);
173 	if (!group)
174 		return ERR_PTR(-ENOMEM);
175 
176 	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
177 	if (minor < 0) {
178 		kfree(group);
179 		return ERR_PTR(minor);
180 	}
181 
182 	device_initialize(&group->dev);
183 	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
184 	group->dev.class = vfio.class;
185 	group->dev.release = vfio_group_release;
186 	cdev_init(&group->cdev, &vfio_group_fops);
187 	group->cdev.owner = THIS_MODULE;
188 
189 	refcount_set(&group->users, 1);
190 	refcount_set(&group->drivers, 1);
191 	init_completion(&group->users_comp);
192 	init_rwsem(&group->group_rwsem);
193 	INIT_LIST_HEAD(&group->device_list);
194 	mutex_init(&group->device_lock);
195 	group->iommu_group = iommu_group;
196 	/* put in vfio_group_release() */
197 	iommu_group_ref_get(iommu_group);
198 	group->type = type;
199 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
200 
201 	return group;
202 }
203 
204 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
205 		enum vfio_group_type type)
206 {
207 	struct vfio_group *group;
208 	struct vfio_group *ret;
209 	int err;
210 
211 	group = vfio_group_alloc(iommu_group, type);
212 	if (IS_ERR(group))
213 		return group;
214 
215 	err = dev_set_name(&group->dev, "%s%d",
216 			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
217 			   iommu_group_id(iommu_group));
218 	if (err) {
219 		ret = ERR_PTR(err);
220 		goto err_put;
221 	}
222 
223 	mutex_lock(&vfio.group_lock);
224 
225 	/* Did we race creating this group? */
226 	ret = __vfio_group_get_from_iommu(iommu_group);
227 	if (ret)
228 		goto err_unlock;
229 
230 	err = cdev_device_add(&group->cdev, &group->dev);
231 	if (err) {
232 		ret = ERR_PTR(err);
233 		goto err_unlock;
234 	}
235 
236 	list_add(&group->vfio_next, &vfio.group_list);
237 
238 	mutex_unlock(&vfio.group_lock);
239 	return group;
240 
241 err_unlock:
242 	mutex_unlock(&vfio.group_lock);
243 err_put:
244 	put_device(&group->dev);
245 	return ret;
246 }
247 
248 static void vfio_group_put(struct vfio_group *group)
249 {
250 	if (refcount_dec_and_test(&group->users))
251 		complete(&group->users_comp);
252 }
253 
254 static void vfio_device_remove_group(struct vfio_device *device)
255 {
256 	struct vfio_group *group = device->group;
257 
258 	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
259 		iommu_group_remove_device(device->dev);
260 
261 	/* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */
262 	if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock))
263 		return;
264 	list_del(&group->vfio_next);
265 
266 	/*
267 	 * We could concurrently probe another driver in the group that might
268 	 * race vfio_device_remove_group() with vfio_group_find_or_alloc(), so we have to
269 	 * ensure that the sysfs is all cleaned up under lock otherwise the
270 	 * cdev_device_add() will fail due to the name already existing.
271 	 */
272 	cdev_device_del(&group->cdev, &group->dev);
273 	mutex_unlock(&vfio.group_lock);
274 
275 	/* Matches the get from vfio_group_alloc() */
276 	vfio_group_put(group);
277 
278 	/*
279 	 * Before we allow the last driver in the group to be unplugged the
280 	 * group must be sanitized so nothing else is or can reference it. This
281 	 * is because the group->iommu_group pointer should only be used so long
282 	 * as a device driver is attached to a device in the group.
283 	 */
284 	wait_for_completion(&group->users_comp);
285 
286 	/*
287 	 * These data structures all have paired operations that can only be
288 	 * undone when the caller holds a live reference on the group. Since all
289 	 * pairs must be undone these WARN_ON's indicate some caller did not
290 	 * properly hold the group reference.
291 	 */
292 	WARN_ON(!list_empty(&group->device_list));
293 	WARN_ON(group->container || group->container_users);
294 	WARN_ON(group->notifier.head);
295 	group->iommu_group = NULL;
296 
297 	put_device(&group->dev);
298 }
299 
300 /*
301  * Device objects - create, release, get, put, search
302  */
303 /* Device reference always implies a group reference */
304 static void vfio_device_put_registration(struct vfio_device *device)
305 {
306 	if (refcount_dec_and_test(&device->refcount))
307 		complete(&device->comp);
308 }
309 
310 static bool vfio_device_try_get_registration(struct vfio_device *device)
311 {
312 	return refcount_inc_not_zero(&device->refcount);
313 }
314 
315 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
316 						 struct device *dev)
317 {
318 	struct vfio_device *device;
319 
320 	mutex_lock(&group->device_lock);
321 	list_for_each_entry(device, &group->device_list, group_next) {
322 		if (device->dev == dev &&
323 		    vfio_device_try_get_registration(device)) {
324 			mutex_unlock(&group->device_lock);
325 			return device;
326 		}
327 	}
328 	mutex_unlock(&group->device_lock);
329 	return NULL;
330 }
331 
332 /*
333  * VFIO driver API
334  */
335 /* Release helper called by vfio_put_device() */
336 static void vfio_device_release(struct device *dev)
337 {
338 	struct vfio_device *device =
339 			container_of(dev, struct vfio_device, device);
340 
341 	vfio_release_device_set(device);
342 	ida_free(&vfio.device_ida, device->index);
343 
344 	/*
345 	 * kvfree() cannot be done here due to a life cycle mess in
346 	 * vfio-ccw. Before the ccw part is fixed all drivers are
347 	 * required to support @release and call vfio_free_device()
348 	 * from there.
349 	 */
350 	device->ops->release(device);
351 }
352 
353 /*
354  * Allocate and initialize vfio_device so it can be registered to vfio
355  * core.
356  *
357  * Drivers should use the wrapper vfio_alloc_device() for allocation.
358  * @size is the size of the structure to be allocated, including any
359  * private data used by the driver.
360  *
361  * Drivers may provide an @init callback to initialize device private data.
362  *
363  * Use vfio_put_device() to release the structure after a successful return.
364  */
365 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
366 				       const struct vfio_device_ops *ops)
367 {
368 	struct vfio_device *device;
369 	int ret;
370 
371 	if (WARN_ON(size < sizeof(struct vfio_device)))
372 		return ERR_PTR(-EINVAL);
373 
374 	device = kvzalloc(size, GFP_KERNEL);
375 	if (!device)
376 		return ERR_PTR(-ENOMEM);
377 
378 	ret = vfio_init_device(device, dev, ops);
379 	if (ret)
380 		goto out_free;
381 	return device;
382 
383 out_free:
384 	kvfree(device);
385 	return ERR_PTR(ret);
386 }
387 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
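/*
 * Usage sketch (illustrative only, not part of this file): drivers normally
 * call the vfio_alloc_device() wrapper from <linux/vfio.h> rather than
 * _vfio_alloc_device() directly, embedding struct vfio_device in their own
 * state.  The my_device/my_ops names below are hypothetical.
 *
 *	struct my_device {
 *		struct vfio_device vdev;	// core state
 *		void __iomem *regs;		// driver private data
 *	};
 *
 *	struct my_device *my;
 *
 *	my = vfio_alloc_device(my_device, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	// On any later failure, drop the reference with
 *	// vfio_put_device(&my->vdev).
 */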
388 
389 /*
390  * Initialize a vfio_device so it can be registered to vfio core.
391  *
392  * Only the vfio-ccw driver should call this interface.
393  */
394 int vfio_init_device(struct vfio_device *device, struct device *dev,
395 		     const struct vfio_device_ops *ops)
396 {
397 	int ret;
398 
399 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
400 	if (ret < 0) {
401 		dev_dbg(dev, "Error allocating index\n");
402 		return ret;
403 	}
404 
405 	device->index = ret;
406 	init_completion(&device->comp);
407 	device->dev = dev;
408 	device->ops = ops;
409 
410 	if (ops->init) {
411 		ret = ops->init(device);
412 		if (ret)
413 			goto out_uninit;
414 	}
415 
416 	device_initialize(&device->device);
417 	device->device.release = vfio_device_release;
418 	device->device.class = vfio.device_class;
419 	device->device.parent = device->dev;
420 	return 0;
421 
422 out_uninit:
423 	vfio_release_device_set(device);
424 	ida_free(&vfio.device_ida, device->index);
425 	return ret;
426 }
427 EXPORT_SYMBOL_GPL(vfio_init_device);
428 
429 /*
430  * The helper called by a driver's @release callback to free the device
431  * structure. Drivers which don't have private data to clean up can
432  * simply use this helper as their @release.
433  */
434 void vfio_free_device(struct vfio_device *device)
435 {
436 	kvfree(device);
437 }
438 EXPORT_SYMBOL_GPL(vfio_free_device);
439 
440 static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
441 		enum vfio_group_type type)
442 {
443 	struct iommu_group *iommu_group;
444 	struct vfio_group *group;
445 	int ret;
446 
447 	iommu_group = iommu_group_alloc();
448 	if (IS_ERR(iommu_group))
449 		return ERR_CAST(iommu_group);
450 
451 	ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
452 	if (ret)
453 		goto out_put_group;
454 	ret = iommu_group_add_device(iommu_group, dev);
455 	if (ret)
456 		goto out_put_group;
457 
458 	group = vfio_create_group(iommu_group, type);
459 	if (IS_ERR(group)) {
460 		ret = PTR_ERR(group);
461 		goto out_remove_device;
462 	}
463 	iommu_group_put(iommu_group);
464 	return group;
465 
466 out_remove_device:
467 	iommu_group_remove_device(dev);
468 out_put_group:
469 	iommu_group_put(iommu_group);
470 	return ERR_PTR(ret);
471 }
472 
473 static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
474 {
475 	struct iommu_group *iommu_group;
476 	struct vfio_group *group;
477 
478 	iommu_group = iommu_group_get(dev);
479 	if (!iommu_group && vfio_noiommu) {
480 		/*
481 		 * With noiommu enabled, create an IOMMU group for devices that
482 		 * don't already have one, implying no IOMMU hardware/driver
483 		 * exists.  Taint the kernel because we're about to give a DMA
484 		 * capable device to a user without IOMMU protection.
485 		 */
486 		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
487 		if (!IS_ERR(group)) {
488 			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
489 			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
490 		}
491 		return group;
492 	}
493 
494 	if (!iommu_group)
495 		return ERR_PTR(-EINVAL);
496 
497 	/*
498 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
499 	 * restore cache coherency. It has to be checked here because it is only
500 	 * valid for cases where we are using iommu groups.
501 	 */
502 	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
503 		iommu_group_put(iommu_group);
504 		return ERR_PTR(-EINVAL);
505 	}
506 
507 	group = vfio_group_get_from_iommu(iommu_group);
508 	if (!group)
509 		group = vfio_create_group(iommu_group, VFIO_IOMMU);
510 
511 	/* The vfio_group holds a reference to the iommu_group */
512 	iommu_group_put(iommu_group);
513 	return group;
514 }
515 
516 static int __vfio_register_dev(struct vfio_device *device,
517 		struct vfio_group *group)
518 {
519 	struct vfio_device *existing_device;
520 	int ret;
521 
522 	/*
523 	 * In all cases group is the output of one of the group allocation
524 	 * functions and we have group->drivers incremented for us.
525 	 */
526 	if (IS_ERR(group))
527 		return PTR_ERR(group);
528 
529 	/*
530 	 * If the driver doesn't specify a set then the device is added to a
531 	 * singleton set just for itself.
532 	 */
533 	if (!device->dev_set)
534 		vfio_assign_device_set(device, device);
535 
536 	existing_device = vfio_group_get_device(group, device->dev);
537 	if (existing_device) {
538 		dev_WARN(device->dev, "Device already exists on group %d\n",
539 			 iommu_group_id(group->iommu_group));
540 		vfio_device_put_registration(existing_device);
541 		ret = -EBUSY;
542 		goto err_out;
543 	}
544 
545 	/* Our reference on group is moved to the device */
546 	device->group = group;
547 
548 	ret = dev_set_name(&device->device, "vfio%d", device->index);
549 	if (ret)
550 		goto err_out;
551 
552 	ret = device_add(&device->device);
553 	if (ret)
554 		goto err_out;
555 
556 	/* Refcounting can't start until the driver calls register */
557 	refcount_set(&device->refcount, 1);
558 
559 	mutex_lock(&group->device_lock);
560 	list_add(&device->group_next, &group->device_list);
561 	mutex_unlock(&group->device_lock);
562 
563 	return 0;
564 err_out:
565 	vfio_device_remove_group(device);
566 	return ret;
567 }
568 
569 int vfio_register_group_dev(struct vfio_device *device)
570 {
571 	return __vfio_register_dev(device,
572 		vfio_group_find_or_alloc(device->dev));
573 }
574 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
575 
576 /*
577  * Register a virtual device without IOMMU backing.  The user of this
578  * device must not be able to directly trigger unmediated DMA.
579  */
580 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
581 {
582 	return __vfio_register_dev(device,
583 		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
584 }
585 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
586 
587 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
588 						     char *buf)
589 {
590 	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
591 
592 	mutex_lock(&group->device_lock);
593 	list_for_each_entry(it, &group->device_list, group_next) {
594 		int ret;
595 
596 		if (it->ops->match) {
597 			ret = it->ops->match(it, buf);
598 			if (ret < 0) {
599 				device = ERR_PTR(ret);
600 				break;
601 			}
602 		} else {
603 			ret = !strcmp(dev_name(it->dev), buf);
604 		}
605 
606 		if (ret && vfio_device_try_get_registration(it)) {
607 			device = it;
608 			break;
609 		}
610 	}
611 	mutex_unlock(&group->device_lock);
612 
613 	return device;
614 }
615 
616 /*
617  * Decrement the device reference count and wait for the device to be
618  * removed.  Open file descriptors for the device keep it referenced. */
619 void vfio_unregister_group_dev(struct vfio_device *device)
620 {
621 	struct vfio_group *group = device->group;
622 	unsigned int i = 0;
623 	bool interrupted = false;
624 	long rc;
625 
626 	vfio_device_put_registration(device);
627 	rc = try_wait_for_completion(&device->comp);
628 	while (rc <= 0) {
629 		if (device->ops->request)
630 			device->ops->request(device, i++);
631 
632 		if (interrupted) {
633 			rc = wait_for_completion_timeout(&device->comp,
634 							 HZ * 10);
635 		} else {
636 			rc = wait_for_completion_interruptible_timeout(
637 				&device->comp, HZ * 10);
638 			if (rc < 0) {
639 				interrupted = true;
640 				dev_warn(device->dev,
641 					 "Device is currently in use, task"
642 					 " \"%s\" (%d) "
643 					 "blocked until device is released",
644 					 current->comm, task_pid_nr(current));
645 			}
646 		}
647 	}
648 
649 	mutex_lock(&group->device_lock);
650 	list_del(&device->group_next);
651 	mutex_unlock(&group->device_lock);
652 
653 	/* Balances device_add in register path */
654 	device_del(&device->device);
655 
656 	vfio_device_remove_group(device);
657 }
658 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
659 
660 /*
661  * VFIO Group fd, /dev/vfio/$GROUP
662  */
663 /*
664  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
665  * if there was no container to unset.  Since the ioctl is called on
666  * the group, we know that it still exists, therefore the only valid
667  * transition here is 1->0.
668  */
669 static int vfio_group_ioctl_unset_container(struct vfio_group *group)
670 {
671 	int ret = 0;
672 
673 	down_write(&group->group_rwsem);
674 	if (!group->container) {
675 		ret = -EINVAL;
676 		goto out_unlock;
677 	}
678 	if (group->container_users != 1) {
679 		ret = -EBUSY;
680 		goto out_unlock;
681 	}
682 	vfio_group_detach_container(group);
683 
684 out_unlock:
685 	up_write(&group->group_rwsem);
686 	return ret;
687 }
688 
689 static int vfio_group_ioctl_set_container(struct vfio_group *group,
690 					  int __user *arg)
691 {
692 	struct vfio_container *container;
693 	struct fd f;
694 	int ret;
695 	int fd;
696 
697 	if (get_user(fd, arg))
698 		return -EFAULT;
699 
700 	f = fdget(fd);
701 	if (!f.file)
702 		return -EBADF;
703 
704 	down_write(&group->group_rwsem);
705 	if (group->container || WARN_ON(group->container_users)) {
706 		ret = -EINVAL;
707 		goto out_unlock;
708 	}
709 	container = vfio_container_from_file(f.file);
710 	ret = -EINVAL;
711 	if (container) {
712 		ret = vfio_container_attach_group(container, group);
713 		goto out_unlock;
714 	}
715 
716 out_unlock:
717 	up_write(&group->group_rwsem);
718 	fdput(f);
719 	return ret;
720 }
721 
722 static const struct file_operations vfio_device_fops;
723 
724 /* true if the vfio_device has open_device() called but not close_device() */
725 bool vfio_assert_device_open(struct vfio_device *device)
726 {
727 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
728 }
729 
730 static struct file *vfio_device_open(struct vfio_device *device)
731 {
732 	struct file *filep;
733 	int ret;
734 
735 	down_write(&device->group->group_rwsem);
736 	ret = vfio_device_assign_container(device);
737 	up_write(&device->group->group_rwsem);
738 	if (ret)
739 		return ERR_PTR(ret);
740 
741 	if (!try_module_get(device->dev->driver->owner)) {
742 		ret = -ENODEV;
743 		goto err_unassign_container;
744 	}
745 
746 	mutex_lock(&device->dev_set->lock);
747 	device->open_count++;
748 	if (device->open_count == 1) {
749 		/*
750 		 * Here we pass the KVM pointer with the group under the read
751 		 * lock.  If the device driver will use it, it must obtain a
752 		 * reference and release it during close_device.
753 		 */
754 		down_read(&device->group->group_rwsem);
755 		device->kvm = device->group->kvm;
756 
757 		if (device->ops->open_device) {
758 			ret = device->ops->open_device(device);
759 			if (ret)
760 				goto err_undo_count;
761 		}
762 		vfio_device_container_register(device);
763 		up_read(&device->group->group_rwsem);
764 	}
765 	mutex_unlock(&device->dev_set->lock);
766 
767 	/*
768 	 * We can't use anon_inode_getfd() because we need to modify
769 	 * the f_mode flags directly to allow more than just ioctls
770 	 */
771 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
772 				   device, O_RDWR);
773 	if (IS_ERR(filep)) {
774 		ret = PTR_ERR(filep);
775 		goto err_close_device;
776 	}
777 
778 	/*
779 	 * TODO: add an anon_inode interface to do this.
780 	 * Appears to be missing by lack of need rather than
781 	 * explicitly prevented.  Now there's need.
782 	 */
783 	filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
784 
785 	if (device->group->type == VFIO_NO_IOMMU)
786 		dev_warn(device->dev, "vfio-noiommu device opened by user "
787 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
788 	/*
789 	 * On success the ref of device is moved to the file and
790 	 * put in vfio_device_fops_release()
791 	 */
792 	return filep;
793 
794 err_close_device:
795 	mutex_lock(&device->dev_set->lock);
796 	down_read(&device->group->group_rwsem);
797 	if (device->open_count == 1 && device->ops->close_device) {
798 		device->ops->close_device(device);
799 
800 		vfio_device_container_unregister(device);
801 	}
802 err_undo_count:
803 	up_read(&device->group->group_rwsem);
804 	device->open_count--;
805 	if (device->open_count == 0 && device->kvm)
806 		device->kvm = NULL;
807 	mutex_unlock(&device->dev_set->lock);
808 	module_put(device->dev->driver->owner);
809 err_unassign_container:
810 	vfio_device_unassign_container(device);
811 	return ERR_PTR(ret);
812 }
813 
814 static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
815 					  char __user *arg)
816 {
817 	struct vfio_device *device;
818 	struct file *filep;
819 	char *buf;
820 	int fdno;
821 	int ret;
822 
823 	buf = strndup_user(arg, PAGE_SIZE);
824 	if (IS_ERR(buf))
825 		return PTR_ERR(buf);
826 
827 	device = vfio_device_get_from_name(group, buf);
828 	kfree(buf);
829 	if (IS_ERR(device))
830 		return PTR_ERR(device);
831 
832 	fdno = get_unused_fd_flags(O_CLOEXEC);
833 	if (fdno < 0) {
834 		ret = fdno;
835 		goto err_put_device;
836 	}
837 
838 	filep = vfio_device_open(device);
839 	if (IS_ERR(filep)) {
840 		ret = PTR_ERR(filep);
841 		goto err_put_fdno;
842 	}
843 
844 	fd_install(fdno, filep);
845 	return fdno;
846 
847 err_put_fdno:
848 	put_unused_fd(fdno);
849 err_put_device:
850 	vfio_device_put_registration(device);
851 	return ret;
852 }
853 
854 static int vfio_group_ioctl_get_status(struct vfio_group *group,
855 				       struct vfio_group_status __user *arg)
856 {
857 	unsigned long minsz = offsetofend(struct vfio_group_status, flags);
858 	struct vfio_group_status status;
859 
860 	if (copy_from_user(&status, arg, minsz))
861 		return -EFAULT;
862 
863 	if (status.argsz < minsz)
864 		return -EINVAL;
865 
866 	status.flags = 0;
867 
868 	down_read(&group->group_rwsem);
869 	if (group->container)
870 		status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
871 				VFIO_GROUP_FLAGS_VIABLE;
872 	else if (!iommu_group_dma_owner_claimed(group->iommu_group))
873 		status.flags |= VFIO_GROUP_FLAGS_VIABLE;
874 	up_read(&group->group_rwsem);
875 
876 	if (copy_to_user(arg, &status, minsz))
877 		return -EFAULT;
878 	return 0;
879 }
880 
881 static long vfio_group_fops_unl_ioctl(struct file *filep,
882 				      unsigned int cmd, unsigned long arg)
883 {
884 	struct vfio_group *group = filep->private_data;
885 	void __user *uarg = (void __user *)arg;
886 
887 	switch (cmd) {
888 	case VFIO_GROUP_GET_DEVICE_FD:
889 		return vfio_group_ioctl_get_device_fd(group, uarg);
890 	case VFIO_GROUP_GET_STATUS:
891 		return vfio_group_ioctl_get_status(group, uarg);
892 	case VFIO_GROUP_SET_CONTAINER:
893 		return vfio_group_ioctl_set_container(group, uarg);
894 	case VFIO_GROUP_UNSET_CONTAINER:
895 		return vfio_group_ioctl_unset_container(group);
896 	default:
897 		return -ENOTTY;
898 	}
899 }
900 
901 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
902 {
903 	struct vfio_group *group =
904 		container_of(inode->i_cdev, struct vfio_group, cdev);
905 	int ret;
906 
907 	down_write(&group->group_rwsem);
908 
909 	/* users can be zero if this races with vfio_device_remove_group() */
910 	if (!refcount_inc_not_zero(&group->users)) {
911 		ret = -ENODEV;
912 		goto err_unlock;
913 	}
914 
915 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
916 		ret = -EPERM;
917 		goto err_put;
918 	}
919 
920 	/*
921 	 * Do we need multiple instances of the group open?  Seems not.
922 	 */
923 	if (group->opened_file) {
924 		ret = -EBUSY;
925 		goto err_put;
926 	}
927 	group->opened_file = filep;
928 	filep->private_data = group;
929 
930 	up_write(&group->group_rwsem);
931 	return 0;
932 err_put:
933 	vfio_group_put(group);
934 err_unlock:
935 	up_write(&group->group_rwsem);
936 	return ret;
937 }
938 
939 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
940 {
941 	struct vfio_group *group = filep->private_data;
942 
943 	filep->private_data = NULL;
944 
945 	down_write(&group->group_rwsem);
946 	/*
947 	 * Device FDs hold a group file reference, therefore the group release
948 	 * is only called when there are no open devices.
949 	 */
950 	WARN_ON(group->notifier.head);
951 	if (group->container)
952 		vfio_group_detach_container(group);
953 	group->opened_file = NULL;
954 	up_write(&group->group_rwsem);
955 
956 	vfio_group_put(group);
957 
958 	return 0;
959 }
960 
961 static const struct file_operations vfio_group_fops = {
962 	.owner		= THIS_MODULE,
963 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
964 	.compat_ioctl	= compat_ptr_ioctl,
965 	.open		= vfio_group_fops_open,
966 	.release	= vfio_group_fops_release,
967 };
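/*
 * Userspace flow sketch for the group fd (illustrative, mirroring the
 * documented VFIO uAPI; the literal group number and device name are
 * examples):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);	// expect VIABLE
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */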
968 
969 /*
970  * Wrapper around pm_runtime_resume_and_get().
971  * Return error code on failure or 0 on success.
972  */
973 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
974 {
975 	struct device *dev = device->dev;
976 
977 	if (dev->driver && dev->driver->pm) {
978 		int ret;
979 
980 		ret = pm_runtime_resume_and_get(dev);
981 		if (ret) {
982 			dev_info_ratelimited(dev,
983 				"vfio: runtime resume failed %d\n", ret);
984 			return -EIO;
985 		}
986 	}
987 
988 	return 0;
989 }
990 
991 /*
992  * Wrapper around pm_runtime_put().
993  */
994 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
995 {
996 	struct device *dev = device->dev;
997 
998 	if (dev->driver && dev->driver->pm)
999 		pm_runtime_put(dev);
1000 }
1001 
1002 /*
1003  * VFIO Device fd
1004  */
1005 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1006 {
1007 	struct vfio_device *device = filep->private_data;
1008 
1009 	mutex_lock(&device->dev_set->lock);
1010 	vfio_assert_device_open(device);
1011 	down_read(&device->group->group_rwsem);
1012 	if (device->open_count == 1 && device->ops->close_device)
1013 		device->ops->close_device(device);
1014 
1015 	vfio_device_container_unregister(device);
1016 	up_read(&device->group->group_rwsem);
1017 	device->open_count--;
1018 	if (device->open_count == 0)
1019 		device->kvm = NULL;
1020 	mutex_unlock(&device->dev_set->lock);
1021 
1022 	module_put(device->dev->driver->owner);
1023 
1024 	vfio_device_unassign_container(device);
1025 
1026 	vfio_device_put_registration(device);
1027 
1028 	return 0;
1029 }
1030 
1031 /*
1032  * vfio_mig_get_next_state - Compute the next step in the FSM
1033  * @cur_fsm - The current state the device is in
1034  * @new_fsm - The target state to reach
1035  * @next_fsm - Pointer to the next step to get to new_fsm
1036  *
1037  * Return 0 upon success, otherwise -errno
1038  * Upon success the next step in the state progression between cur_fsm and
1039  * new_fsm will be set in next_fsm.
1040  *
1041  * This breaks down requests for combination transitions into smaller steps and
1042  * returns the next step to get to new_fsm. The function may need to be called
1043  * multiple times before reaching new_fsm.
1044  *
1045  */
1046 int vfio_mig_get_next_state(struct vfio_device *device,
1047 			    enum vfio_device_mig_state cur_fsm,
1048 			    enum vfio_device_mig_state new_fsm,
1049 			    enum vfio_device_mig_state *next_fsm)
1050 {
1051 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
1052 	/*
1053 	 * The coding in this table requires the driver to implement the
1054 	 * following FSM arcs:
1055 	 *         RESUMING -> STOP
1056 	 *         STOP -> RESUMING
1057 	 *         STOP -> STOP_COPY
1058 	 *         STOP_COPY -> STOP
1059 	 *
1060 	 * If P2P is supported then the driver must also implement these FSM
1061 	 * arcs:
1062 	 *         RUNNING -> RUNNING_P2P
1063 	 *         RUNNING_P2P -> RUNNING
1064 	 *         RUNNING_P2P -> STOP
1065 	 *         STOP -> RUNNING_P2P
1066 	 * Without P2P the driver must implement:
1067 	 *         RUNNING -> STOP
1068 	 *         STOP -> RUNNING
1069 	 *
1070 	 * The coding will step through multiple states for some combination
1071 	 * transitions; if all optional features are supported, this means the
1072 	 * following ones:
1073 	 *         RESUMING -> STOP -> RUNNING_P2P
1074 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
1075 	 *         RESUMING -> STOP -> STOP_COPY
1076 	 *         RUNNING -> RUNNING_P2P -> STOP
1077 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1078 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1079 	 *         RUNNING_P2P -> STOP -> RESUMING
1080 	 *         RUNNING_P2P -> STOP -> STOP_COPY
1081 	 *         STOP -> RUNNING_P2P -> RUNNING
1082 	 *         STOP_COPY -> STOP -> RESUMING
1083 	 *         STOP_COPY -> STOP -> RUNNING_P2P
1084 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
1085 	 */
1086 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1087 		[VFIO_DEVICE_STATE_STOP] = {
1088 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1089 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1090 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1091 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1092 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1093 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1094 		},
1095 		[VFIO_DEVICE_STATE_RUNNING] = {
1096 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
1097 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1098 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1099 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1100 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1101 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1102 		},
1103 		[VFIO_DEVICE_STATE_STOP_COPY] = {
1104 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1105 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1106 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1107 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1108 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1109 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1110 		},
1111 		[VFIO_DEVICE_STATE_RESUMING] = {
1112 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1113 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1114 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1115 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
1116 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1117 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1118 		},
1119 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
1120 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1121 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1122 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1123 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1124 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
1125 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1126 		},
1127 		[VFIO_DEVICE_STATE_ERROR] = {
1128 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1129 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1130 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1131 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
1132 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
1133 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1134 		},
1135 	};
1136 
1137 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1138 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1139 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1140 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1141 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1142 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
1143 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1144 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
1145 	};
1146 
1147 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1148 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
1149 			state_flags_table[cur_fsm]))
1150 		return -EINVAL;
1151 
1152 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1153 	   (state_flags_table[new_fsm] & device->migration_flags) !=
1154 			state_flags_table[new_fsm])
1155 		return -EINVAL;
1156 
1157 	/*
1158 	 * Arcs touching optional and unsupported states are skipped over. The
1159 	 * driver will instead see an arc from the original state to the next
1160 	 * logical state, as per the above comment.
1161 	 */
1162 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
1163 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1164 			state_flags_table[*next_fsm])
1165 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1166 
1167 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1168 }
1169 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
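/*
 * Driver-side sketch (illustrative only, not part of this file): a driver's
 * migration_set_state() op typically walks the FSM one arc at a time until
 * the requested state is reached.  my_do_one_arc() below is hypothetical; it
 * performs a single arc and returns a struct file * only for arcs that
 * produce or consume migration data, NULL otherwise.
 *
 *	enum vfio_device_mig_state next;
 *	struct file *filp = NULL;
 *	int ret;
 *
 *	while (cur != new_state) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		filp = my_do_one_arc(vdev, next);
 *		if (IS_ERR(filp))
 *			return filp;
 *		cur = next;
 *	}
 *	return filp;
 */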
1170 
1171 /*
1172  * Convert the driver's struct file into an FD number and return it to userspace
1173  */
1174 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1175 				   struct vfio_device_feature_mig_state *mig)
1176 {
1177 	int ret;
1178 	int fd;
1179 
1180 	fd = get_unused_fd_flags(O_CLOEXEC);
1181 	if (fd < 0) {
1182 		ret = fd;
1183 		goto out_fput;
1184 	}
1185 
1186 	mig->data_fd = fd;
1187 	if (copy_to_user(arg, mig, sizeof(*mig))) {
1188 		ret = -EFAULT;
1189 		goto out_put_unused;
1190 	}
1191 	fd_install(fd, filp);
1192 	return 0;
1193 
1194 out_put_unused:
1195 	put_unused_fd(fd);
1196 out_fput:
1197 	fput(filp);
1198 	return ret;
1199 }
1200 
1201 static int
1202 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1203 					   u32 flags, void __user *arg,
1204 					   size_t argsz)
1205 {
1206 	size_t minsz =
1207 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
1208 	struct vfio_device_feature_mig_state mig;
1209 	struct file *filp = NULL;
1210 	int ret;
1211 
1212 	if (!device->mig_ops)
1213 		return -ENOTTY;
1214 
1215 	ret = vfio_check_feature(flags, argsz,
1216 				 VFIO_DEVICE_FEATURE_SET |
1217 				 VFIO_DEVICE_FEATURE_GET,
1218 				 sizeof(mig));
1219 	if (ret != 1)
1220 		return ret;
1221 
1222 	if (copy_from_user(&mig, arg, minsz))
1223 		return -EFAULT;
1224 
1225 	if (flags & VFIO_DEVICE_FEATURE_GET) {
1226 		enum vfio_device_mig_state curr_state;
1227 
1228 		ret = device->mig_ops->migration_get_state(device,
1229 							   &curr_state);
1230 		if (ret)
1231 			return ret;
1232 		mig.device_state = curr_state;
1233 		goto out_copy;
1234 	}
1235 
1236 	/* Handle the VFIO_DEVICE_FEATURE_SET */
1237 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
1238 	if (IS_ERR(filp) || !filp)
1239 		goto out_copy;
1240 
1241 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
1242 out_copy:
1243 	mig.data_fd = -1;
1244 	if (copy_to_user(arg, &mig, sizeof(mig)))
1245 		return -EFAULT;
1246 	if (IS_ERR(filp))
1247 		return PTR_ERR(filp);
1248 	return 0;
1249 }
1250 
1251 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1252 					       u32 flags, void __user *arg,
1253 					       size_t argsz)
1254 {
1255 	struct vfio_device_feature_migration mig = {
1256 		.flags = device->migration_flags,
1257 	};
1258 	int ret;
1259 
1260 	if (!device->mig_ops)
1261 		return -ENOTTY;
1262 
1263 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1264 				 sizeof(mig));
1265 	if (ret != 1)
1266 		return ret;
1267 	if (copy_to_user(arg, &mig, sizeof(mig)))
1268 		return -EFAULT;
1269 	return 0;
1270 }
1271 
1272 /* Ranges should fit into a single kernel page */
1273 #define LOG_MAX_RANGES \
1274 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
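/*
 * For example, with 4 KiB pages and the two __aligned_u64 fields of the
 * uAPI range struct (16 bytes per entry) this allows up to 256 ranges per
 * call.
 */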
1275 
1276 static int
1277 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1278 					u32 flags, void __user *arg,
1279 					size_t argsz)
1280 {
1281 	size_t minsz =
1282 		offsetofend(struct vfio_device_feature_dma_logging_control,
1283 			    ranges);
1284 	struct vfio_device_feature_dma_logging_range __user *ranges;
1285 	struct vfio_device_feature_dma_logging_control control;
1286 	struct vfio_device_feature_dma_logging_range range;
1287 	struct rb_root_cached root = RB_ROOT_CACHED;
1288 	struct interval_tree_node *nodes;
1289 	u64 iova_end;
1290 	u32 nnodes;
1291 	int i, ret;
1292 
1293 	if (!device->log_ops)
1294 		return -ENOTTY;
1295 
1296 	ret = vfio_check_feature(flags, argsz,
1297 				 VFIO_DEVICE_FEATURE_SET,
1298 				 sizeof(control));
1299 	if (ret != 1)
1300 		return ret;
1301 
1302 	if (copy_from_user(&control, arg, minsz))
1303 		return -EFAULT;
1304 
1305 	nnodes = control.num_ranges;
1306 	if (!nnodes)
1307 		return -EINVAL;
1308 
1309 	if (nnodes > LOG_MAX_RANGES)
1310 		return -E2BIG;
1311 
1312 	ranges = u64_to_user_ptr(control.ranges);
1313 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1314 			      GFP_KERNEL);
1315 	if (!nodes)
1316 		return -ENOMEM;
1317 
1318 	for (i = 0; i < nnodes; i++) {
1319 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1320 			ret = -EFAULT;
1321 			goto end;
1322 		}
1323 		if (!IS_ALIGNED(range.iova, control.page_size) ||
1324 		    !IS_ALIGNED(range.length, control.page_size)) {
1325 			ret = -EINVAL;
1326 			goto end;
1327 		}
1328 
1329 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1330 		    iova_end > ULONG_MAX) {
1331 			ret = -EOVERFLOW;
1332 			goto end;
1333 		}
1334 
1335 		nodes[i].start = range.iova;
1336 		nodes[i].last = range.iova + range.length - 1;
1337 		if (interval_tree_iter_first(&root, nodes[i].start,
1338 					     nodes[i].last)) {
1339 			/* Range overlapping */
1340 			ret = -EINVAL;
1341 			goto end;
1342 		}
1343 		interval_tree_insert(nodes + i, &root);
1344 	}
1345 
1346 	ret = device->log_ops->log_start(device, &root, nnodes,
1347 					 &control.page_size);
1348 	if (ret)
1349 		goto end;
1350 
1351 	if (copy_to_user(arg, &control, sizeof(control))) {
1352 		ret = -EFAULT;
1353 		device->log_ops->log_stop(device);
1354 	}
1355 
1356 end:
1357 	kfree(nodes);
1358 	return ret;
1359 }
1360 
1361 static int
1362 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1363 				       u32 flags, void __user *arg,
1364 				       size_t argsz)
1365 {
1366 	int ret;
1367 
1368 	if (!device->log_ops)
1369 		return -ENOTTY;
1370 
1371 	ret = vfio_check_feature(flags, argsz,
1372 				 VFIO_DEVICE_FEATURE_SET, 0);
1373 	if (ret != 1)
1374 		return ret;
1375 
1376 	return device->log_ops->log_stop(device);
1377 }
1378 
1379 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1380 					  unsigned long iova, size_t length,
1381 					  void *opaque)
1382 {
1383 	struct vfio_device *device = opaque;
1384 
1385 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1386 }
1387 
1388 static int
1389 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1390 					 u32 flags, void __user *arg,
1391 					 size_t argsz)
1392 {
1393 	size_t minsz =
1394 		offsetofend(struct vfio_device_feature_dma_logging_report,
1395 			    bitmap);
1396 	struct vfio_device_feature_dma_logging_report report;
1397 	struct iova_bitmap *iter;
1398 	u64 iova_end;
1399 	int ret;
1400 
1401 	if (!device->log_ops)
1402 		return -ENOTTY;
1403 
1404 	ret = vfio_check_feature(flags, argsz,
1405 				 VFIO_DEVICE_FEATURE_GET,
1406 				 sizeof(report));
1407 	if (ret != 1)
1408 		return ret;
1409 
1410 	if (copy_from_user(&report, arg, minsz))
1411 		return -EFAULT;
1412 
1413 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1414 		return -EINVAL;
1415 
1416 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1417 	    iova_end > ULONG_MAX)
1418 		return -EOVERFLOW;
1419 
1420 	iter = iova_bitmap_alloc(report.iova, report.length,
1421 				 report.page_size,
1422 				 u64_to_user_ptr(report.bitmap));
1423 	if (IS_ERR(iter))
1424 		return PTR_ERR(iter);
1425 
1426 	ret = iova_bitmap_for_each(iter, device,
1427 				   vfio_device_log_read_and_clear);
1428 
1429 	iova_bitmap_free(iter);
1430 	return ret;
1431 }
1432 
1433 static int vfio_ioctl_device_feature(struct vfio_device *device,
1434 				     struct vfio_device_feature __user *arg)
1435 {
1436 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1437 	struct vfio_device_feature feature;
1438 
1439 	if (copy_from_user(&feature, arg, minsz))
1440 		return -EFAULT;
1441 
1442 	if (feature.argsz < minsz)
1443 		return -EINVAL;
1444 
1445 	/* Check unknown flags */
1446 	if (feature.flags &
1447 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1448 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1449 		return -EINVAL;
1450 
1451 	/* GET & SET are mutually exclusive except with PROBE */
1452 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1453 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1454 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1455 		return -EINVAL;
1456 
1457 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1458 	case VFIO_DEVICE_FEATURE_MIGRATION:
1459 		return vfio_ioctl_device_feature_migration(
1460 			device, feature.flags, arg->data,
1461 			feature.argsz - minsz);
1462 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1463 		return vfio_ioctl_device_feature_mig_device_state(
1464 			device, feature.flags, arg->data,
1465 			feature.argsz - minsz);
1466 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1467 		return vfio_ioctl_device_feature_logging_start(
1468 			device, feature.flags, arg->data,
1469 			feature.argsz - minsz);
1470 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1471 		return vfio_ioctl_device_feature_logging_stop(
1472 			device, feature.flags, arg->data,
1473 			feature.argsz - minsz);
1474 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1475 		return vfio_ioctl_device_feature_logging_report(
1476 			device, feature.flags, arg->data,
1477 			feature.argsz - minsz);
1478 	default:
1479 		if (unlikely(!device->ops->device_feature))
1480 			return -EINVAL;
1481 		return device->ops->device_feature(device, feature.flags,
1482 						   arg->data,
1483 						   feature.argsz - minsz);
1484 	}
1485 }
1486 
1487 static long vfio_device_fops_unl_ioctl(struct file *filep,
1488 				       unsigned int cmd, unsigned long arg)
1489 {
1490 	struct vfio_device *device = filep->private_data;
1491 	int ret;
1492 
1493 	ret = vfio_device_pm_runtime_get(device);
1494 	if (ret)
1495 		return ret;
1496 
1497 	switch (cmd) {
1498 	case VFIO_DEVICE_FEATURE:
1499 		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1500 		break;
1501 
1502 	default:
1503 		if (unlikely(!device->ops->ioctl))
1504 			ret = -EINVAL;
1505 		else
1506 			ret = device->ops->ioctl(device, cmd, arg);
1507 		break;
1508 	}
1509 
1510 	vfio_device_pm_runtime_put(device);
1511 	return ret;
1512 }
1513 
1514 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1515 				     size_t count, loff_t *ppos)
1516 {
1517 	struct vfio_device *device = filep->private_data;
1518 
1519 	if (unlikely(!device->ops->read))
1520 		return -EINVAL;
1521 
1522 	return device->ops->read(device, buf, count, ppos);
1523 }
1524 
1525 static ssize_t vfio_device_fops_write(struct file *filep,
1526 				      const char __user *buf,
1527 				      size_t count, loff_t *ppos)
1528 {
1529 	struct vfio_device *device = filep->private_data;
1530 
1531 	if (unlikely(!device->ops->write))
1532 		return -EINVAL;
1533 
1534 	return device->ops->write(device, buf, count, ppos);
1535 }
1536 
1537 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1538 {
1539 	struct vfio_device *device = filep->private_data;
1540 
1541 	if (unlikely(!device->ops->mmap))
1542 		return -EINVAL;
1543 
1544 	return device->ops->mmap(device, vma);
1545 }
1546 
1547 static const struct file_operations vfio_device_fops = {
1548 	.owner		= THIS_MODULE,
1549 	.release	= vfio_device_fops_release,
1550 	.read		= vfio_device_fops_read,
1551 	.write		= vfio_device_fops_write,
1552 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1553 	.compat_ioctl	= compat_ptr_ioctl,
1554 	.mmap		= vfio_device_fops_mmap,
1555 };
1556 
1557 /**
1558  * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
1559  * @file: VFIO group file
1560  *
1561  * The returned iommu_group is valid as long as a ref is held on the file.
1562  */
1563 struct iommu_group *vfio_file_iommu_group(struct file *file)
1564 {
1565 	struct vfio_group *group = file->private_data;
1566 
1567 	if (file->f_op != &vfio_group_fops)
1568 		return NULL;
1569 	return group->iommu_group;
1570 }
1571 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
1572 
1573 /**
1574  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1575  *        is always CPU cache coherent
1576  * @file: VFIO group file
1577  *
1578  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1579  * bit in DMA transactions. A return of false indicates that the user has
1580  * rights to access additional instructions such as wbinvd on x86.
1581  */
1582 bool vfio_file_enforced_coherent(struct file *file)
1583 {
1584 	struct vfio_group *group = file->private_data;
1585 	bool ret;
1586 
1587 	if (file->f_op != &vfio_group_fops)
1588 		return true;
1589 
1590 	down_read(&group->group_rwsem);
1591 	if (group->container) {
1592 		ret = vfio_container_ioctl_check_extension(group->container,
1593 							   VFIO_DMA_CC_IOMMU);
1594 	} else {
1595 		/*
1596 		 * Since the coherency state is determined only once a container
1597 		 * is attached the user must do so before they can prove they
1598 		 * have permission.
1599 		 */
1600 		ret = true;
1601 	}
1602 	up_read(&group->group_rwsem);
1603 	return ret;
1604 }
1605 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1606 
1607 /**
1608  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1609  * @file: VFIO group file
1610  * @kvm: KVM to link
1611  *
1612  * When a VFIO device is first opened the KVM will be available in
1613  * device->kvm if one was associated with the group.
1614  */
1615 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1616 {
1617 	struct vfio_group *group = file->private_data;
1618 
1619 	if (file->f_op != &vfio_group_fops)
1620 		return;
1621 
1622 	down_write(&group->group_rwsem);
1623 	group->kvm = kvm;
1624 	up_write(&group->group_rwsem);
1625 }
1626 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1627 
1628 /**
1629  * vfio_file_has_dev - True if the VFIO file is a handle for device
1630  * @file: VFIO file to check
1631  * @device: Device that must be part of the file
1632  *
1633  * Returns true if given file has permission to manipulate the given device.
1634  */
1635 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
1636 {
1637 	struct vfio_group *group = file->private_data;
1638 
1639 	if (file->f_op != &vfio_group_fops)
1640 		return false;
1641 
1642 	return group == device->group;
1643 }
1644 EXPORT_SYMBOL_GPL(vfio_file_has_dev);
1645 
1646 /*
1647  * Sub-module support
1648  */
1649 /*
1650  * Helper for managing a buffer of info chain capabilities, allocate or
1651  * reallocate a buffer with additional @size, filling in @id and @version
1652  * of the capability.  A pointer to the new capability is returned.
1653  *
1654  * NB. The chain is based at the head of the buffer, so new entries are
1655  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1656  * next offsets prior to copying to the user buffer.
1657  */
1658 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1659 					       size_t size, u16 id, u16 version)
1660 {
1661 	void *buf;
1662 	struct vfio_info_cap_header *header, *tmp;
1663 
1664 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1665 	if (!buf) {
1666 		kfree(caps->buf);
1667 		caps->buf = NULL;
1668 		caps->size = 0;
1669 		return ERR_PTR(-ENOMEM);
1670 	}
1671 
1672 	caps->buf = buf;
1673 	header = buf + caps->size;
1674 
1675 	/* Eventually copied to user buffer, zero */
1676 	memset(header, 0, size);
1677 
1678 	header->id = id;
1679 	header->version = version;
1680 
1681 	/* Add to the end of the capability chain */
1682 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1683 		; /* nothing */
1684 
1685 	tmp->next = caps->size;
1686 	caps->size += size;
1687 
1688 	return header;
1689 }
1690 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1691 
1692 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1693 {
1694 	struct vfio_info_cap_header *tmp;
1695 	void *buf = (void *)caps->buf;
1696 
1697 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1698 		tmp->next += offset;
1699 }
1700 EXPORT_SYMBOL(vfio_info_cap_shift);
1701 
1702 int vfio_info_add_capability(struct vfio_info_cap *caps,
1703 			     struct vfio_info_cap_header *cap, size_t size)
1704 {
1705 	struct vfio_info_cap_header *header;
1706 
1707 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1708 	if (IS_ERR(header))
1709 		return PTR_ERR(header);
1710 
1711 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1712 
1713 	return 0;
1714 }
1715 EXPORT_SYMBOL(vfio_info_add_capability);
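/*
 * Usage sketch (illustrative only, not part of this file): an ioctl handler
 * builds the chain into a local vfio_info_cap, then shifts the next offsets
 * past its fixed-size info struct before copying both out.  The info/arg/cap
 * names below follow the usual *_INFO ioctl pattern and are assumptions.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;	// ask user to retry
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user(arg + sizeof(info), caps.buf,
 *					 caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */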
1716 
1717 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1718 				       int max_irq_type, size_t *data_size)
1719 {
1720 	unsigned long minsz;
1721 	size_t size;
1722 
1723 	minsz = offsetofend(struct vfio_irq_set, count);
1724 
1725 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1726 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1727 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1728 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1729 		return -EINVAL;
1730 
1731 	if (data_size)
1732 		*data_size = 0;
1733 
1734 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1735 		return -EINVAL;
1736 
1737 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1738 	case VFIO_IRQ_SET_DATA_NONE:
1739 		size = 0;
1740 		break;
1741 	case VFIO_IRQ_SET_DATA_BOOL:
1742 		size = sizeof(uint8_t);
1743 		break;
1744 	case VFIO_IRQ_SET_DATA_EVENTFD:
1745 		size = sizeof(int32_t);
1746 		break;
1747 	default:
1748 		return -EINVAL;
1749 	}
1750 
1751 	if (size) {
1752 		if (hdr->argsz - minsz < hdr->count * size)
1753 			return -EINVAL;
1754 
1755 		if (!data_size)
1756 			return -EINVAL;
1757 
1758 		*data_size = hdr->count * size;
1759 	}
1760 
1761 	return 0;
1762 }
1763 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1764 
1765 /*
1766  * Module/class support
1767  */
1768 static char *vfio_devnode(struct device *dev, umode_t *mode)
1769 {
1770 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
1771 }
1772 
1773 static int __init vfio_init(void)
1774 {
1775 	int ret;
1776 
1777 	ida_init(&vfio.group_ida);
1778 	ida_init(&vfio.device_ida);
1779 	mutex_init(&vfio.group_lock);
1780 	INIT_LIST_HEAD(&vfio.group_list);
1781 
1782 	ret = vfio_container_init();
1783 	if (ret)
1784 		return ret;
1785 
1786 	/* /dev/vfio/$GROUP */
1787 	vfio.class = class_create(THIS_MODULE, "vfio");
1788 	if (IS_ERR(vfio.class)) {
1789 		ret = PTR_ERR(vfio.class);
1790 		goto err_group_class;
1791 	}
1792 
1793 	vfio.class->devnode = vfio_devnode;
1794 
1795 	/* /sys/class/vfio-dev/vfioX */
1796 	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1797 	if (IS_ERR(vfio.device_class)) {
1798 		ret = PTR_ERR(vfio.device_class);
1799 		goto err_dev_class;
1800 	}
1801 
1802 	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
1803 	if (ret)
1804 		goto err_alloc_chrdev;
1805 
1806 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1807 	return 0;
1808 
1809 err_alloc_chrdev:
1810 	class_destroy(vfio.device_class);
1811 	vfio.device_class = NULL;
1812 err_dev_class:
1813 	class_destroy(vfio.class);
1814 	vfio.class = NULL;
1815 err_group_class:
1816 	vfio_container_cleanup();
1817 	return ret;
1818 }
1819 
1820 static void __exit vfio_cleanup(void)
1821 {
1822 	WARN_ON(!list_empty(&vfio.group_list));
1823 
1824 	ida_destroy(&vfio.device_ida);
1825 	ida_destroy(&vfio.group_ida);
1826 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
1827 	class_destroy(vfio.device_class);
1828 	vfio.device_class = NULL;
1829 	class_destroy(vfio.class);
1830 	vfio_container_cleanup();
1831 	vfio.class = NULL;
1832 	xa_destroy(&vfio_device_set_xa);
1833 }
1834 
1835 module_init(vfio_init);
1836 module_exit(vfio_cleanup);
1837 
1838 MODULE_VERSION(DRIVER_VERSION);
1839 MODULE_LICENSE("GPL v2");
1840 MODULE_AUTHOR(DRIVER_AUTHOR);
1841 MODULE_DESCRIPTION(DRIVER_DESC);
1842 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
1843 MODULE_ALIAS("devname:vfio/vfio");
1844 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1845