// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
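
/*
 * Illustrative sketch (not part of this file): a driver whose devices
 * share reset scope would pick a common set_id before registration, for
 * instance the PCI slot or bus pointer, so all affected functions land
 * in one vfio_device_set.  "my" and "my_probe" are made-up names; the
 * bus/slot choice mirrors what PCI-based drivers do.
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct my_device *my = ...;
 *		int ret;
 *
 *		// Functions sharing a bus reset share one device set
 *		ret = vfio_assign_device_set(&my->vdev, pdev->bus);
 *		if (ret)
 *			return ret;
 *		...
 *	}
 */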

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
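
/*
 * Illustrative sketch (not from this file): a driver typically calls the
 * helper above while holding dev_set->lock, e.g. from close_device(),
 * where a count of 1 means the closing device is the last one open and a
 * set-wide reset is safe.  The reset helper name is hypothetical.
 *
 *	lockdep_assert_held(&vdev->dev_set->lock);
 *	if (vfio_device_set_open_count(vdev->dev_set) == 1)
 *		my_try_reset_whole_set(vdev->dev_set);
 */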

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
			   struct device *dev)
{
	struct vfio_device *cur;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == dev)
			return cur;
	return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize a vfio_device so it can be registered to the
 * vfio core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * The driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after a successful return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
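
/*
 * Illustrative sketch (not from this file): drivers embed struct
 * vfio_device in their own state and allocate through the
 * vfio_alloc_device() wrapper from <linux/vfio.h>; "my_device", "regs"
 * and "my_ops" are placeholder names.
 *
 *	struct my_device {
 *		struct vfio_device vdev;	// embedded core object
 *		void __iomem *regs;		// driver private data
 *	};
 *
 *	struct my_device *my;
 *
 *	my = vfio_alloc_device(my_device, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);	// on a later error path
 */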

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate device index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
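
/*
 * Illustrative sketch (not from this file): the registration calls above
 * pair up in a driver's probe/remove paths; names are placeholders.
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		...
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret) {
 *			vfio_put_device(&my->vdev);
 *			return ret;
 *		}
 *		return 0;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		vfio_unregister_group_dev(&my->vdev);	// waits for users
 *		vfio_put_device(&my->vdev);		// final reference
 *	}
 */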

#ifdef CONFIG_HAVE_KVM
void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Resolve the kvm symbols at runtime via symbol_get() so that vfio
	 * does not carry a hard module dependency on kvm.
	 */
	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times; the device cdev path has no secure way to support that.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	vfio_df_group_close(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 *  The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	   (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
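
/*
 * Illustrative sketch (not from this file): a migration driver's
 * migration_set_state() op walks the FSM one arc at a time, asking the
 * core for each intermediate step; "my_step_device_state" is a
 * hypothetical per-arc helper.
 *
 *	while (cur != new_state) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			break;
 *		filp = my_step_device_state(mydev, next);	// one arc
 *		if (IS_ERR(filp))
 *			break;
 *		cur = next;
 *	}
 */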

/*
 * Convert the driver's struct file into an FD number and return it to
 * userspace.
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
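
/*
 * Illustrative userspace sketch (not from this file) of driving the
 * feature above through the VFIO_DEVICE_FEATURE ioctl; the structures
 * are the UAPI ones from <linux/vfio.h>, everything else is assumed
 * context (an open device fd in "device_fd").
 *
 *	u8 buf[sizeof(struct vfio_device_feature) +
 *	       sizeof(struct vfio_device_feature_mig_state)] = {};
 *	struct vfio_device_feature *feat = (void *)buf;
 *	struct vfio_device_feature_mig_state *mig = (void *)feat->data;
 *
 *	feat->argsz = sizeof(buf);
 *	feat->flags = VFIO_DEVICE_FEATURE_SET |
 *		      VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
 *	mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
 *		// read(mig->data_fd, ...) streams the device state
 */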

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	int ret;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is a valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
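
/*
 * Illustrative sketch (not from this file) of the pattern the comments
 * above describe, approximating how region-info style ioctls use these
 * helpers: build the chain in a scratch buffer, shift the next offsets
 * relative to the user buffer, then copy out.  "my_cap" and "info" are
 * assumed caller context.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *	...
 *	if (caps.size && info.argsz >= sizeof(info) + caps.size) {
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			ret = -EFAULT;
 *	}
 *	kfree(caps.buf);
 */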

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
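
/*
 * Illustrative sketch (not from this file): a driver's SET_IRQS ioctl
 * path validates the user header and sizes its data copy with the helper
 * above; "max_irqs" and "minsz" are assumed caller context.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max_irqs,
 *						 VFIO_PCI_NUM_IRQS,
 *						 &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */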

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
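
/*
 * Illustrative sketch (not from this file): an emulated-IOMMU driver
 * pins one page and recovers the sub-page offset as the comment above
 * describes; "vdev" and "iova" are assumed caller context.
 *
 *	struct page *page;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(page) + offset_in_page(iova);
 *	// ... access the guest data through va ...
 *	kunmap_local(va);
 *	vfio_unpin_pages(vdev, iova, 1);
 */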

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
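
/*
 * Illustrative sketch (not from this file): a driver reads a guest
 * descriptor at @iova into a local buffer without pinning, then writes a
 * status field back; "struct my_desc" and its layout are hypothetical.
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *	// ... process the descriptor, then write back its status ...
 *	ret = vfio_dma_rw(vdev, iova + offsetof(struct my_desc, status),
 *			  &desc.status, sizeof(desc.status), true);
 */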

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");