xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision d47a97bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41 
42 #define DRIVER_VERSION	"0.3"
43 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC	"VFIO - User Level meta-driver"
45 
46 static struct vfio {
47 	struct class			*device_class;
48 	struct ida			device_ida;
49 } vfio;
50 
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54 		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
56 #endif
57 
58 static DEFINE_XARRAY(vfio_device_set_xa);
59 
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62 	unsigned long idx = (unsigned long)set_id;
63 	struct vfio_device_set *new_dev_set;
64 	struct vfio_device_set *dev_set;
65 
66 	if (WARN_ON(!set_id))
67 		return -EINVAL;
68 
69 	/*
70 	 * Atomically acquire a singleton object in the xarray for this set_id
71 	 */
72 	xa_lock(&vfio_device_set_xa);
73 	dev_set = xa_load(&vfio_device_set_xa, idx);
74 	if (dev_set)
75 		goto found_get_ref;
76 	xa_unlock(&vfio_device_set_xa);
77 
78 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79 	if (!new_dev_set)
80 		return -ENOMEM;
81 	mutex_init(&new_dev_set->lock);
82 	INIT_LIST_HEAD(&new_dev_set->device_list);
83 	new_dev_set->set_id = set_id;
84 
85 	xa_lock(&vfio_device_set_xa);
86 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87 			       GFP_KERNEL);
88 	if (!dev_set) {
89 		dev_set = new_dev_set;
90 		goto found_get_ref;
91 	}
92 
93 	kfree(new_dev_set);
94 	if (xa_is_err(dev_set)) {
95 		xa_unlock(&vfio_device_set_xa);
96 		return xa_err(dev_set);
97 	}
98 
99 found_get_ref:
100 	dev_set->device_count++;
101 	xa_unlock(&vfio_device_set_xa);
102 	mutex_lock(&dev_set->lock);
103 	device->dev_set = dev_set;
104 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
105 	mutex_unlock(&dev_set->lock);
106 	return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
109 
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112 	struct vfio_device_set *dev_set = device->dev_set;
113 
114 	if (!dev_set)
115 		return;
116 
117 	mutex_lock(&dev_set->lock);
118 	list_del(&device->dev_set_list);
119 	mutex_unlock(&dev_set->lock);
120 
121 	xa_lock(&vfio_device_set_xa);
122 	if (!--dev_set->device_count) {
123 		__xa_erase(&vfio_device_set_xa,
124 			   (unsigned long)dev_set->set_id);
125 		mutex_destroy(&dev_set->lock);
126 		kfree(dev_set);
127 	}
128 	xa_unlock(&vfio_device_set_xa);
129 }
130 
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133 	struct vfio_device *cur;
134 	unsigned int open_count = 0;
135 
136 	lockdep_assert_held(&dev_set->lock);
137 
138 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139 		open_count += cur->open_count;
140 	return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
143 
144 /*
145  * Device objects - create, release, get, put, search
146  */
147 /* Device reference always implies a group reference */
148 void vfio_device_put_registration(struct vfio_device *device)
149 {
150 	if (refcount_dec_and_test(&device->refcount))
151 		complete(&device->comp);
152 }
153 
154 bool vfio_device_try_get_registration(struct vfio_device *device)
155 {
156 	return refcount_inc_not_zero(&device->refcount);
157 }
158 
159 /*
160  * VFIO driver API
161  */
162 /* Release helper called by vfio_put_device() */
163 static void vfio_device_release(struct device *dev)
164 {
165 	struct vfio_device *device =
166 			container_of(dev, struct vfio_device, device);
167 
168 	vfio_release_device_set(device);
169 	ida_free(&vfio.device_ida, device->index);
170 
171 	if (device->ops->release)
172 		device->ops->release(device);
173 
174 	kvfree(device);
175 }
176 
177 static int vfio_init_device(struct vfio_device *device, struct device *dev,
178 			    const struct vfio_device_ops *ops);
179 
180 /*
181  * Allocate and initialize vfio_device so it can be registered to vfio
182  * core.
183  *
184  * Drivers should use the wrapper vfio_alloc_device() for allocation.
185  * @size is the size of the structure to be allocated, including any
186  * private data used by the driver.
187  *
188  * Driver may provide an @init callback to cover device private data.
189  *
190  * Use vfio_put_device() to release the structure after success return.
191  */
192 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
193 				       const struct vfio_device_ops *ops)
194 {
195 	struct vfio_device *device;
196 	int ret;
197 
198 	if (WARN_ON(size < sizeof(struct vfio_device)))
199 		return ERR_PTR(-EINVAL);
200 
201 	device = kvzalloc(size, GFP_KERNEL);
202 	if (!device)
203 		return ERR_PTR(-ENOMEM);
204 
205 	ret = vfio_init_device(device, dev, ops);
206 	if (ret)
207 		goto out_free;
208 	return device;
209 
210 out_free:
211 	kvfree(device);
212 	return ERR_PTR(ret);
213 }
214 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
215 
216 /*
217  * Initialize a vfio_device so it can be registered to vfio core.
218  */
219 static int vfio_init_device(struct vfio_device *device, struct device *dev,
220 			    const struct vfio_device_ops *ops)
221 {
222 	int ret;
223 
224 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
225 	if (ret < 0) {
226 		dev_dbg(dev, "Error to alloc index\n");
227 		return ret;
228 	}
229 
230 	device->index = ret;
231 	init_completion(&device->comp);
232 	device->dev = dev;
233 	device->ops = ops;
234 
235 	if (ops->init) {
236 		ret = ops->init(device);
237 		if (ret)
238 			goto out_uninit;
239 	}
240 
241 	device_initialize(&device->device);
242 	device->device.release = vfio_device_release;
243 	device->device.class = vfio.device_class;
244 	device->device.parent = device->dev;
245 	return 0;
246 
247 out_uninit:
248 	vfio_release_device_set(device);
249 	ida_free(&vfio.device_ida, device->index);
250 	return ret;
251 }
252 
253 static int __vfio_register_dev(struct vfio_device *device,
254 			       enum vfio_group_type type)
255 {
256 	int ret;
257 
258 	if (WARN_ON(device->ops->bind_iommufd &&
259 		    (!device->ops->unbind_iommufd ||
260 		     !device->ops->attach_ioas)))
261 		return -EINVAL;
262 
263 	/*
264 	 * If the driver doesn't specify a set then the device is added to a
265 	 * singleton set just for itself.
266 	 */
267 	if (!device->dev_set)
268 		vfio_assign_device_set(device, device);
269 
270 	ret = dev_set_name(&device->device, "vfio%d", device->index);
271 	if (ret)
272 		return ret;
273 
274 	ret = vfio_device_set_group(device, type);
275 	if (ret)
276 		return ret;
277 
278 	ret = device_add(&device->device);
279 	if (ret)
280 		goto err_out;
281 
282 	/* Refcounting can't start until the driver calls register */
283 	refcount_set(&device->refcount, 1);
284 
285 	vfio_device_group_register(device);
286 
287 	return 0;
288 err_out:
289 	vfio_device_remove_group(device);
290 	return ret;
291 }
292 
293 int vfio_register_group_dev(struct vfio_device *device)
294 {
295 	return __vfio_register_dev(device, VFIO_IOMMU);
296 }
297 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
298 
299 /*
300  * Register a virtual device without IOMMU backing.  The user of this
301  * device must not be able to directly trigger unmediated DMA.
302  */
303 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
304 {
305 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
306 }
307 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
308 
309 /*
310  * Decrement the device reference count and wait for the device to be
311  * removed.  Open file descriptors for the device... */
312 void vfio_unregister_group_dev(struct vfio_device *device)
313 {
314 	unsigned int i = 0;
315 	bool interrupted = false;
316 	long rc;
317 
318 	vfio_device_put_registration(device);
319 	rc = try_wait_for_completion(&device->comp);
320 	while (rc <= 0) {
321 		if (device->ops->request)
322 			device->ops->request(device, i++);
323 
324 		if (interrupted) {
325 			rc = wait_for_completion_timeout(&device->comp,
326 							 HZ * 10);
327 		} else {
328 			rc = wait_for_completion_interruptible_timeout(
329 				&device->comp, HZ * 10);
330 			if (rc < 0) {
331 				interrupted = true;
332 				dev_warn(device->dev,
333 					 "Device is currently in use, task"
334 					 " \"%s\" (%d) "
335 					 "blocked until device is released",
336 					 current->comm, task_pid_nr(current));
337 			}
338 		}
339 	}
340 
341 	vfio_device_group_unregister(device);
342 
343 	/* Balances device_add in register path */
344 	device_del(&device->device);
345 
346 	/* Balances vfio_device_set_group in register path */
347 	vfio_device_remove_group(device);
348 }
349 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
350 
351 #ifdef CONFIG_HAVE_KVM
352 void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
353 {
354 	void (*pfn)(struct kvm *kvm);
355 	bool (*fn)(struct kvm *kvm);
356 	bool ret;
357 
358 	lockdep_assert_held(&device->dev_set->lock);
359 
360 	pfn = symbol_get(kvm_put_kvm);
361 	if (WARN_ON(!pfn))
362 		return;
363 
364 	fn = symbol_get(kvm_get_kvm_safe);
365 	if (WARN_ON(!fn)) {
366 		symbol_put(kvm_put_kvm);
367 		return;
368 	}
369 
370 	ret = fn(kvm);
371 	symbol_put(kvm_get_kvm_safe);
372 	if (!ret) {
373 		symbol_put(kvm_put_kvm);
374 		return;
375 	}
376 
377 	device->put_kvm = pfn;
378 	device->kvm = kvm;
379 }
380 
381 void vfio_device_put_kvm(struct vfio_device *device)
382 {
383 	lockdep_assert_held(&device->dev_set->lock);
384 
385 	if (!device->kvm)
386 		return;
387 
388 	if (WARN_ON(!device->put_kvm))
389 		goto clear;
390 
391 	device->put_kvm(device->kvm);
392 	device->put_kvm = NULL;
393 	symbol_put(kvm_put_kvm);
394 
395 clear:
396 	device->kvm = NULL;
397 }
398 #endif
399 
400 /* true if the vfio_device has open_device() called but not close_device() */
401 static bool vfio_assert_device_open(struct vfio_device *device)
402 {
403 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
404 }
405 
406 static int vfio_device_first_open(struct vfio_device *device,
407 				  struct iommufd_ctx *iommufd)
408 {
409 	int ret;
410 
411 	lockdep_assert_held(&device->dev_set->lock);
412 
413 	if (!try_module_get(device->dev->driver->owner))
414 		return -ENODEV;
415 
416 	if (iommufd)
417 		ret = vfio_iommufd_bind(device, iommufd);
418 	else
419 		ret = vfio_device_group_use_iommu(device);
420 	if (ret)
421 		goto err_module_put;
422 
423 	if (device->ops->open_device) {
424 		ret = device->ops->open_device(device);
425 		if (ret)
426 			goto err_unuse_iommu;
427 	}
428 	return 0;
429 
430 err_unuse_iommu:
431 	if (iommufd)
432 		vfio_iommufd_unbind(device);
433 	else
434 		vfio_device_group_unuse_iommu(device);
435 err_module_put:
436 	module_put(device->dev->driver->owner);
437 	return ret;
438 }
439 
440 static void vfio_device_last_close(struct vfio_device *device,
441 				   struct iommufd_ctx *iommufd)
442 {
443 	lockdep_assert_held(&device->dev_set->lock);
444 
445 	if (device->ops->close_device)
446 		device->ops->close_device(device);
447 	if (iommufd)
448 		vfio_iommufd_unbind(device);
449 	else
450 		vfio_device_group_unuse_iommu(device);
451 	module_put(device->dev->driver->owner);
452 }
453 
454 int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
455 {
456 	int ret = 0;
457 
458 	lockdep_assert_held(&device->dev_set->lock);
459 
460 	device->open_count++;
461 	if (device->open_count == 1) {
462 		ret = vfio_device_first_open(device, iommufd);
463 		if (ret)
464 			device->open_count--;
465 	}
466 
467 	return ret;
468 }
469 
470 void vfio_device_close(struct vfio_device *device,
471 		       struct iommufd_ctx *iommufd)
472 {
473 	lockdep_assert_held(&device->dev_set->lock);
474 
475 	vfio_assert_device_open(device);
476 	if (device->open_count == 1)
477 		vfio_device_last_close(device, iommufd);
478 	device->open_count--;
479 }
480 
481 /*
482  * Wrapper around pm_runtime_resume_and_get().
483  * Return error code on failure or 0 on success.
484  */
485 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
486 {
487 	struct device *dev = device->dev;
488 
489 	if (dev->driver && dev->driver->pm) {
490 		int ret;
491 
492 		ret = pm_runtime_resume_and_get(dev);
493 		if (ret) {
494 			dev_info_ratelimited(dev,
495 				"vfio: runtime resume failed %d\n", ret);
496 			return -EIO;
497 		}
498 	}
499 
500 	return 0;
501 }
502 
503 /*
504  * Wrapper around pm_runtime_put().
505  */
506 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
507 {
508 	struct device *dev = device->dev;
509 
510 	if (dev->driver && dev->driver->pm)
511 		pm_runtime_put(dev);
512 }
513 
514 /*
515  * VFIO Device fd
516  */
517 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
518 {
519 	struct vfio_device *device = filep->private_data;
520 
521 	vfio_device_group_close(device);
522 
523 	vfio_device_put_registration(device);
524 
525 	return 0;
526 }
527 
528 /*
529  * vfio_mig_get_next_state - Compute the next step in the FSM
530  * @cur_fsm - The current state the device is in
531  * @new_fsm - The target state to reach
532  * @next_fsm - Pointer to the next step to get to new_fsm
533  *
534  * Return 0 upon success, otherwise -errno
535  * Upon success the next step in the state progression between cur_fsm and
536  * new_fsm will be set in next_fsm.
537  *
538  * This breaks down requests for combination transitions into smaller steps and
539  * returns the next step to get to new_fsm. The function may need to be called
540  * multiple times before reaching new_fsm.
541  *
542  */
543 int vfio_mig_get_next_state(struct vfio_device *device,
544 			    enum vfio_device_mig_state cur_fsm,
545 			    enum vfio_device_mig_state new_fsm,
546 			    enum vfio_device_mig_state *next_fsm)
547 {
548 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
549 	/*
550 	 * The coding in this table requires the driver to implement the
551 	 * following FSM arcs:
552 	 *         RESUMING -> STOP
553 	 *         STOP -> RESUMING
554 	 *         STOP -> STOP_COPY
555 	 *         STOP_COPY -> STOP
556 	 *
557 	 * If P2P is supported then the driver must also implement these FSM
558 	 * arcs:
559 	 *         RUNNING -> RUNNING_P2P
560 	 *         RUNNING_P2P -> RUNNING
561 	 *         RUNNING_P2P -> STOP
562 	 *         STOP -> RUNNING_P2P
563 	 *
564 	 * If precopy is supported then the driver must support these additional
565 	 * FSM arcs:
566 	 *         RUNNING -> PRE_COPY
567 	 *         PRE_COPY -> RUNNING
568 	 *         PRE_COPY -> STOP_COPY
569 	 * However, if precopy and P2P are supported together then the driver
570 	 * must support these additional arcs beyond the P2P arcs above:
571 	 *         PRE_COPY -> RUNNING
572 	 *         PRE_COPY -> PRE_COPY_P2P
573 	 *         PRE_COPY_P2P -> PRE_COPY
574 	 *         PRE_COPY_P2P -> RUNNING_P2P
575 	 *         PRE_COPY_P2P -> STOP_COPY
576 	 *         RUNNING -> PRE_COPY
577 	 *         RUNNING_P2P -> PRE_COPY_P2P
578 	 *
579 	 * Without P2P and precopy the driver must implement:
580 	 *         RUNNING -> STOP
581 	 *         STOP -> RUNNING
582 	 *
583 	 * The coding will step through multiple states for some combination
584 	 * transitions; if all optional features are supported, this means the
585 	 * following ones:
586 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
587 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
588 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
589 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
590 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
591 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
592 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
593 	 *         RESUMING -> STOP -> RUNNING_P2P
594 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
595 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
596 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
597 	 *         RESUMING -> STOP -> STOP_COPY
598 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
599 	 *         RUNNING -> RUNNING_P2P -> STOP
600 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
601 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
602 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
603 	 *         RUNNING_P2P -> STOP -> RESUMING
604 	 *         RUNNING_P2P -> STOP -> STOP_COPY
605 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
606 	 *         STOP -> RUNNING_P2P -> RUNNING
607 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
608 	 *         STOP_COPY -> STOP -> RESUMING
609 	 *         STOP_COPY -> STOP -> RUNNING_P2P
610 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
611 	 *
612 	 *  The following transitions are blocked:
613 	 *         STOP_COPY -> PRE_COPY
614 	 *         STOP_COPY -> PRE_COPY_P2P
615 	 */
616 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
617 		[VFIO_DEVICE_STATE_STOP] = {
618 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
619 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
620 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
621 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
622 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
623 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
624 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
625 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
626 		},
627 		[VFIO_DEVICE_STATE_RUNNING] = {
628 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
629 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
630 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
631 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
632 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
633 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
634 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
635 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
636 		},
637 		[VFIO_DEVICE_STATE_PRE_COPY] = {
638 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
639 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
640 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
641 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
642 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
643 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
644 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
645 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
646 		},
647 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
648 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
649 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
650 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
651 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
652 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
653 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
654 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
655 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
656 		},
657 		[VFIO_DEVICE_STATE_STOP_COPY] = {
658 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
659 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
660 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
661 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
662 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
663 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
664 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
665 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
666 		},
667 		[VFIO_DEVICE_STATE_RESUMING] = {
668 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
669 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
670 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
671 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
672 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
673 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
674 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
675 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
676 		},
677 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
678 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
679 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
680 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
681 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
682 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
683 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
684 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
685 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
686 		},
687 		[VFIO_DEVICE_STATE_ERROR] = {
688 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
689 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
690 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
691 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
692 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
693 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
694 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
695 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
696 		},
697 	};
698 
699 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
700 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
701 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
702 		[VFIO_DEVICE_STATE_PRE_COPY] =
703 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
704 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
705 						   VFIO_MIGRATION_P2P |
706 						   VFIO_MIGRATION_PRE_COPY,
707 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
708 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
709 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
710 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
711 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
712 	};
713 
714 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
715 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
716 			state_flags_table[cur_fsm]))
717 		return -EINVAL;
718 
719 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
720 	   (state_flags_table[new_fsm] & device->migration_flags) !=
721 			state_flags_table[new_fsm])
722 		return -EINVAL;
723 
724 	/*
725 	 * Arcs touching optional and unsupported states are skipped over. The
726 	 * driver will instead see an arc from the original state to the next
727 	 * logical state, as per the above comment.
728 	 */
729 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
730 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
731 			state_flags_table[*next_fsm])
732 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
733 
734 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
735 }
736 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
737 
738 /*
739  * Convert the drivers's struct file into a FD number and return it to userspace
740  */
741 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
742 				   struct vfio_device_feature_mig_state *mig)
743 {
744 	int ret;
745 	int fd;
746 
747 	fd = get_unused_fd_flags(O_CLOEXEC);
748 	if (fd < 0) {
749 		ret = fd;
750 		goto out_fput;
751 	}
752 
753 	mig->data_fd = fd;
754 	if (copy_to_user(arg, mig, sizeof(*mig))) {
755 		ret = -EFAULT;
756 		goto out_put_unused;
757 	}
758 	fd_install(fd, filp);
759 	return 0;
760 
761 out_put_unused:
762 	put_unused_fd(fd);
763 out_fput:
764 	fput(filp);
765 	return ret;
766 }
767 
768 static int
769 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
770 					   u32 flags, void __user *arg,
771 					   size_t argsz)
772 {
773 	size_t minsz =
774 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
775 	struct vfio_device_feature_mig_state mig;
776 	struct file *filp = NULL;
777 	int ret;
778 
779 	if (!device->mig_ops)
780 		return -ENOTTY;
781 
782 	ret = vfio_check_feature(flags, argsz,
783 				 VFIO_DEVICE_FEATURE_SET |
784 				 VFIO_DEVICE_FEATURE_GET,
785 				 sizeof(mig));
786 	if (ret != 1)
787 		return ret;
788 
789 	if (copy_from_user(&mig, arg, minsz))
790 		return -EFAULT;
791 
792 	if (flags & VFIO_DEVICE_FEATURE_GET) {
793 		enum vfio_device_mig_state curr_state;
794 
795 		ret = device->mig_ops->migration_get_state(device,
796 							   &curr_state);
797 		if (ret)
798 			return ret;
799 		mig.device_state = curr_state;
800 		goto out_copy;
801 	}
802 
803 	/* Handle the VFIO_DEVICE_FEATURE_SET */
804 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
805 	if (IS_ERR(filp) || !filp)
806 		goto out_copy;
807 
808 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
809 out_copy:
810 	mig.data_fd = -1;
811 	if (copy_to_user(arg, &mig, sizeof(mig)))
812 		return -EFAULT;
813 	if (IS_ERR(filp))
814 		return PTR_ERR(filp);
815 	return 0;
816 }
817 
818 static int
819 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
820 					      u32 flags, void __user *arg,
821 					      size_t argsz)
822 {
823 	struct vfio_device_feature_mig_data_size data_size = {};
824 	unsigned long stop_copy_length;
825 	int ret;
826 
827 	if (!device->mig_ops)
828 		return -ENOTTY;
829 
830 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
831 				 sizeof(data_size));
832 	if (ret != 1)
833 		return ret;
834 
835 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
836 	if (ret)
837 		return ret;
838 
839 	data_size.stop_copy_length = stop_copy_length;
840 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
841 		return -EFAULT;
842 
843 	return 0;
844 }
845 
846 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
847 					       u32 flags, void __user *arg,
848 					       size_t argsz)
849 {
850 	struct vfio_device_feature_migration mig = {
851 		.flags = device->migration_flags,
852 	};
853 	int ret;
854 
855 	if (!device->mig_ops)
856 		return -ENOTTY;
857 
858 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
859 				 sizeof(mig));
860 	if (ret != 1)
861 		return ret;
862 	if (copy_to_user(arg, &mig, sizeof(mig)))
863 		return -EFAULT;
864 	return 0;
865 }
866 
867 /* Ranges should fit into a single kernel page */
868 #define LOG_MAX_RANGES \
869 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
870 
871 static int
872 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
873 					u32 flags, void __user *arg,
874 					size_t argsz)
875 {
876 	size_t minsz =
877 		offsetofend(struct vfio_device_feature_dma_logging_control,
878 			    ranges);
879 	struct vfio_device_feature_dma_logging_range __user *ranges;
880 	struct vfio_device_feature_dma_logging_control control;
881 	struct vfio_device_feature_dma_logging_range range;
882 	struct rb_root_cached root = RB_ROOT_CACHED;
883 	struct interval_tree_node *nodes;
884 	u64 iova_end;
885 	u32 nnodes;
886 	int i, ret;
887 
888 	if (!device->log_ops)
889 		return -ENOTTY;
890 
891 	ret = vfio_check_feature(flags, argsz,
892 				 VFIO_DEVICE_FEATURE_SET,
893 				 sizeof(control));
894 	if (ret != 1)
895 		return ret;
896 
897 	if (copy_from_user(&control, arg, minsz))
898 		return -EFAULT;
899 
900 	nnodes = control.num_ranges;
901 	if (!nnodes)
902 		return -EINVAL;
903 
904 	if (nnodes > LOG_MAX_RANGES)
905 		return -E2BIG;
906 
907 	ranges = u64_to_user_ptr(control.ranges);
908 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
909 			      GFP_KERNEL);
910 	if (!nodes)
911 		return -ENOMEM;
912 
913 	for (i = 0; i < nnodes; i++) {
914 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
915 			ret = -EFAULT;
916 			goto end;
917 		}
918 		if (!IS_ALIGNED(range.iova, control.page_size) ||
919 		    !IS_ALIGNED(range.length, control.page_size)) {
920 			ret = -EINVAL;
921 			goto end;
922 		}
923 
924 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
925 		    iova_end > ULONG_MAX) {
926 			ret = -EOVERFLOW;
927 			goto end;
928 		}
929 
930 		nodes[i].start = range.iova;
931 		nodes[i].last = range.iova + range.length - 1;
932 		if (interval_tree_iter_first(&root, nodes[i].start,
933 					     nodes[i].last)) {
934 			/* Range overlapping */
935 			ret = -EINVAL;
936 			goto end;
937 		}
938 		interval_tree_insert(nodes + i, &root);
939 	}
940 
941 	ret = device->log_ops->log_start(device, &root, nnodes,
942 					 &control.page_size);
943 	if (ret)
944 		goto end;
945 
946 	if (copy_to_user(arg, &control, sizeof(control))) {
947 		ret = -EFAULT;
948 		device->log_ops->log_stop(device);
949 	}
950 
951 end:
952 	kfree(nodes);
953 	return ret;
954 }
955 
956 static int
957 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
958 				       u32 flags, void __user *arg,
959 				       size_t argsz)
960 {
961 	int ret;
962 
963 	if (!device->log_ops)
964 		return -ENOTTY;
965 
966 	ret = vfio_check_feature(flags, argsz,
967 				 VFIO_DEVICE_FEATURE_SET, 0);
968 	if (ret != 1)
969 		return ret;
970 
971 	return device->log_ops->log_stop(device);
972 }
973 
974 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
975 					  unsigned long iova, size_t length,
976 					  void *opaque)
977 {
978 	struct vfio_device *device = opaque;
979 
980 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
981 }
982 
983 static int
984 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
985 					 u32 flags, void __user *arg,
986 					 size_t argsz)
987 {
988 	size_t minsz =
989 		offsetofend(struct vfio_device_feature_dma_logging_report,
990 			    bitmap);
991 	struct vfio_device_feature_dma_logging_report report;
992 	struct iova_bitmap *iter;
993 	u64 iova_end;
994 	int ret;
995 
996 	if (!device->log_ops)
997 		return -ENOTTY;
998 
999 	ret = vfio_check_feature(flags, argsz,
1000 				 VFIO_DEVICE_FEATURE_GET,
1001 				 sizeof(report));
1002 	if (ret != 1)
1003 		return ret;
1004 
1005 	if (copy_from_user(&report, arg, minsz))
1006 		return -EFAULT;
1007 
1008 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1009 		return -EINVAL;
1010 
1011 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1012 	    iova_end > ULONG_MAX)
1013 		return -EOVERFLOW;
1014 
1015 	iter = iova_bitmap_alloc(report.iova, report.length,
1016 				 report.page_size,
1017 				 u64_to_user_ptr(report.bitmap));
1018 	if (IS_ERR(iter))
1019 		return PTR_ERR(iter);
1020 
1021 	ret = iova_bitmap_for_each(iter, device,
1022 				   vfio_device_log_read_and_clear);
1023 
1024 	iova_bitmap_free(iter);
1025 	return ret;
1026 }
1027 
1028 static int vfio_ioctl_device_feature(struct vfio_device *device,
1029 				     struct vfio_device_feature __user *arg)
1030 {
1031 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1032 	struct vfio_device_feature feature;
1033 
1034 	if (copy_from_user(&feature, arg, minsz))
1035 		return -EFAULT;
1036 
1037 	if (feature.argsz < minsz)
1038 		return -EINVAL;
1039 
1040 	/* Check unknown flags */
1041 	if (feature.flags &
1042 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1043 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1044 		return -EINVAL;
1045 
1046 	/* GET & SET are mutually exclusive except with PROBE */
1047 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1048 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1049 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1050 		return -EINVAL;
1051 
1052 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1053 	case VFIO_DEVICE_FEATURE_MIGRATION:
1054 		return vfio_ioctl_device_feature_migration(
1055 			device, feature.flags, arg->data,
1056 			feature.argsz - minsz);
1057 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1058 		return vfio_ioctl_device_feature_mig_device_state(
1059 			device, feature.flags, arg->data,
1060 			feature.argsz - minsz);
1061 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1062 		return vfio_ioctl_device_feature_logging_start(
1063 			device, feature.flags, arg->data,
1064 			feature.argsz - minsz);
1065 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1066 		return vfio_ioctl_device_feature_logging_stop(
1067 			device, feature.flags, arg->data,
1068 			feature.argsz - minsz);
1069 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1070 		return vfio_ioctl_device_feature_logging_report(
1071 			device, feature.flags, arg->data,
1072 			feature.argsz - minsz);
1073 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1074 		return vfio_ioctl_device_feature_migration_data_size(
1075 			device, feature.flags, arg->data,
1076 			feature.argsz - minsz);
1077 	default:
1078 		if (unlikely(!device->ops->device_feature))
1079 			return -EINVAL;
1080 		return device->ops->device_feature(device, feature.flags,
1081 						   arg->data,
1082 						   feature.argsz - minsz);
1083 	}
1084 }
1085 
1086 static long vfio_device_fops_unl_ioctl(struct file *filep,
1087 				       unsigned int cmd, unsigned long arg)
1088 {
1089 	struct vfio_device *device = filep->private_data;
1090 	int ret;
1091 
1092 	ret = vfio_device_pm_runtime_get(device);
1093 	if (ret)
1094 		return ret;
1095 
1096 	switch (cmd) {
1097 	case VFIO_DEVICE_FEATURE:
1098 		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1099 		break;
1100 
1101 	default:
1102 		if (unlikely(!device->ops->ioctl))
1103 			ret = -EINVAL;
1104 		else
1105 			ret = device->ops->ioctl(device, cmd, arg);
1106 		break;
1107 	}
1108 
1109 	vfio_device_pm_runtime_put(device);
1110 	return ret;
1111 }
1112 
1113 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1114 				     size_t count, loff_t *ppos)
1115 {
1116 	struct vfio_device *device = filep->private_data;
1117 
1118 	if (unlikely(!device->ops->read))
1119 		return -EINVAL;
1120 
1121 	return device->ops->read(device, buf, count, ppos);
1122 }
1123 
1124 static ssize_t vfio_device_fops_write(struct file *filep,
1125 				      const char __user *buf,
1126 				      size_t count, loff_t *ppos)
1127 {
1128 	struct vfio_device *device = filep->private_data;
1129 
1130 	if (unlikely(!device->ops->write))
1131 		return -EINVAL;
1132 
1133 	return device->ops->write(device, buf, count, ppos);
1134 }
1135 
1136 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1137 {
1138 	struct vfio_device *device = filep->private_data;
1139 
1140 	if (unlikely(!device->ops->mmap))
1141 		return -EINVAL;
1142 
1143 	return device->ops->mmap(device, vma);
1144 }
1145 
1146 const struct file_operations vfio_device_fops = {
1147 	.owner		= THIS_MODULE,
1148 	.release	= vfio_device_fops_release,
1149 	.read		= vfio_device_fops_read,
1150 	.write		= vfio_device_fops_write,
1151 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1152 	.compat_ioctl	= compat_ptr_ioctl,
1153 	.mmap		= vfio_device_fops_mmap,
1154 };
1155 
1156 /*
1157  * Sub-module support
1158  */
1159 /*
1160  * Helper for managing a buffer of info chain capabilities, allocate or
1161  * reallocate a buffer with additional @size, filling in @id and @version
1162  * of the capability.  A pointer to the new capability is returned.
1163  *
1164  * NB. The chain is based at the head of the buffer, so new entries are
1165  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1166  * next offsets prior to copying to the user buffer.
1167  */
1168 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1169 					       size_t size, u16 id, u16 version)
1170 {
1171 	void *buf;
1172 	struct vfio_info_cap_header *header, *tmp;
1173 
1174 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1175 	if (!buf) {
1176 		kfree(caps->buf);
1177 		caps->buf = NULL;
1178 		caps->size = 0;
1179 		return ERR_PTR(-ENOMEM);
1180 	}
1181 
1182 	caps->buf = buf;
1183 	header = buf + caps->size;
1184 
1185 	/* Eventually copied to user buffer, zero */
1186 	memset(header, 0, size);
1187 
1188 	header->id = id;
1189 	header->version = version;
1190 
1191 	/* Add to the end of the capability chain */
1192 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1193 		; /* nothing */
1194 
1195 	tmp->next = caps->size;
1196 	caps->size += size;
1197 
1198 	return header;
1199 }
1200 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1201 
1202 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1203 {
1204 	struct vfio_info_cap_header *tmp;
1205 	void *buf = (void *)caps->buf;
1206 
1207 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1208 		tmp->next += offset;
1209 }
1210 EXPORT_SYMBOL(vfio_info_cap_shift);
1211 
1212 int vfio_info_add_capability(struct vfio_info_cap *caps,
1213 			     struct vfio_info_cap_header *cap, size_t size)
1214 {
1215 	struct vfio_info_cap_header *header;
1216 
1217 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1218 	if (IS_ERR(header))
1219 		return PTR_ERR(header);
1220 
1221 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1222 
1223 	return 0;
1224 }
1225 EXPORT_SYMBOL(vfio_info_add_capability);
1226 
1227 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1228 				       int max_irq_type, size_t *data_size)
1229 {
1230 	unsigned long minsz;
1231 	size_t size;
1232 
1233 	minsz = offsetofend(struct vfio_irq_set, count);
1234 
1235 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1236 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1237 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1238 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1239 		return -EINVAL;
1240 
1241 	if (data_size)
1242 		*data_size = 0;
1243 
1244 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1245 		return -EINVAL;
1246 
1247 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1248 	case VFIO_IRQ_SET_DATA_NONE:
1249 		size = 0;
1250 		break;
1251 	case VFIO_IRQ_SET_DATA_BOOL:
1252 		size = sizeof(uint8_t);
1253 		break;
1254 	case VFIO_IRQ_SET_DATA_EVENTFD:
1255 		size = sizeof(int32_t);
1256 		break;
1257 	default:
1258 		return -EINVAL;
1259 	}
1260 
1261 	if (size) {
1262 		if (hdr->argsz - minsz < hdr->count * size)
1263 			return -EINVAL;
1264 
1265 		if (!data_size)
1266 			return -EINVAL;
1267 
1268 		*data_size = hdr->count * size;
1269 	}
1270 
1271 	return 0;
1272 }
1273 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1274 
1275 /*
1276  * Pin contiguous user pages and return their associated host pages for local
1277  * domain only.
1278  * @device [in]  : device
1279  * @iova [in]    : starting IOVA of user pages to be pinned.
1280  * @npage [in]   : count of pages to be pinned.  This count should not
1281  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1282  * @prot [in]    : protection flags
1283  * @pages[out]   : array of host pages
1284  * Return error or number of pages pinned.
1285  *
1286  * A driver may only call this function if the vfio_device was created
1287  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1288  */
1289 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1290 		   int npage, int prot, struct page **pages)
1291 {
1292 	/* group->container cannot change while a vfio device is open */
1293 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1294 		return -EINVAL;
1295 	if (vfio_device_has_container(device))
1296 		return vfio_device_container_pin_pages(device, iova,
1297 						       npage, prot, pages);
1298 	if (device->iommufd_access) {
1299 		int ret;
1300 
1301 		if (iova > ULONG_MAX)
1302 			return -EINVAL;
1303 		/*
1304 		 * VFIO ignores the sub page offset, npages is from the start of
1305 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1306 		 * the sub page offset by doing:
1307 		 *     pages[0] + (iova % PAGE_SIZE)
1308 		 */
1309 		ret = iommufd_access_pin_pages(
1310 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1311 			npage * PAGE_SIZE, pages,
1312 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1313 		if (ret)
1314 			return ret;
1315 		return npage;
1316 	}
1317 	return -EINVAL;
1318 }
1319 EXPORT_SYMBOL(vfio_pin_pages);
1320 
1321 /*
1322  * Unpin contiguous host pages for local domain only.
1323  * @device [in]  : device
1324  * @iova [in]    : starting address of user pages to be unpinned.
1325  * @npage [in]   : count of pages to be unpinned.  This count should not
1326  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1327  */
1328 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1329 {
1330 	if (WARN_ON(!vfio_assert_device_open(device)))
1331 		return;
1332 
1333 	if (vfio_device_has_container(device)) {
1334 		vfio_device_container_unpin_pages(device, iova, npage);
1335 		return;
1336 	}
1337 	if (device->iommufd_access) {
1338 		if (WARN_ON(iova > ULONG_MAX))
1339 			return;
1340 		iommufd_access_unpin_pages(device->iommufd_access,
1341 					   ALIGN_DOWN(iova, PAGE_SIZE),
1342 					   npage * PAGE_SIZE);
1343 		return;
1344 	}
1345 }
1346 EXPORT_SYMBOL(vfio_unpin_pages);
1347 
1348 /*
1349  * This interface allows the CPUs to perform some sort of virtual DMA on
1350  * behalf of the device.
1351  *
1352  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1353  * into/from a kernel buffer.
1354  *
1355  * As the read/write of user space memory is conducted via the CPUs and is
1356  * not a real device DMA, it is not necessary to pin the user space memory.
1357  *
1358  * @device [in]		: VFIO device
1359  * @iova [in]		: base IOVA of a user space buffer
1360  * @data [in]		: pointer to kernel buffer
1361  * @len [in]		: kernel buffer length
1362  * @write		: indicate read or write
1363  * Return error code on failure or 0 on success.
1364  */
1365 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1366 		size_t len, bool write)
1367 {
1368 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1369 		return -EINVAL;
1370 
1371 	if (vfio_device_has_container(device))
1372 		return vfio_device_container_dma_rw(device, iova,
1373 						    data, len, write);
1374 
1375 	if (device->iommufd_access) {
1376 		unsigned int flags = 0;
1377 
1378 		if (iova > ULONG_MAX)
1379 			return -EINVAL;
1380 
1381 		/* VFIO historically tries to auto-detect a kthread */
1382 		if (!current->mm)
1383 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1384 		if (write)
1385 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1386 		return iommufd_access_rw(device->iommufd_access, iova, data,
1387 					 len, flags);
1388 	}
1389 	return -EINVAL;
1390 }
1391 EXPORT_SYMBOL(vfio_dma_rw);
1392 
1393 /*
1394  * Module/class support
1395  */
1396 static int __init vfio_init(void)
1397 {
1398 	int ret;
1399 
1400 	ida_init(&vfio.device_ida);
1401 
1402 	ret = vfio_group_init();
1403 	if (ret)
1404 		return ret;
1405 
1406 	ret = vfio_virqfd_init();
1407 	if (ret)
1408 		goto err_virqfd;
1409 
1410 	/* /sys/class/vfio-dev/vfioX */
1411 	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1412 	if (IS_ERR(vfio.device_class)) {
1413 		ret = PTR_ERR(vfio.device_class);
1414 		goto err_dev_class;
1415 	}
1416 
1417 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1418 	return 0;
1419 
1420 err_dev_class:
1421 	vfio_virqfd_exit();
1422 err_virqfd:
1423 	vfio_group_cleanup();
1424 	return ret;
1425 }
1426 
1427 static void __exit vfio_cleanup(void)
1428 {
1429 	ida_destroy(&vfio.device_ida);
1430 	class_destroy(vfio.device_class);
1431 	vfio.device_class = NULL;
1432 	vfio_virqfd_exit();
1433 	vfio_group_cleanup();
1434 	xa_destroy(&vfio_device_set_xa);
1435 }
1436 
1437 module_init(vfio_init);
1438 module_exit(vfio_cleanup);
1439 
1440 MODULE_VERSION(DRIVER_VERSION);
1441 MODULE_LICENSE("GPL v2");
1442 MODULE_AUTHOR(DRIVER_AUTHOR);
1443 MODULE_DESCRIPTION(DRIVER_DESC);
1444 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1445