xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision 726ccdba)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41 
42 #define DRIVER_VERSION	"0.3"
43 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC	"VFIO - User Level meta-driver"
45 
46 static struct vfio {
47 	struct class			*device_class;
48 	struct ida			device_ida;
49 } vfio;
50 
#ifdef CONFIG_VFIO_NOIOMMU
/* Backing storage for the "enable_unsafe_noiommu_mode" module parameter. */
bool vfio_noiommu __read_mostly;
/* 0644 == S_IRUGO | S_IWUSR: world readable, owner writable */
module_param_named(enable_unsafe_noiommu_mode, vfio_noiommu, bool, 0644);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
57 
58 static DEFINE_XARRAY(vfio_device_set_xa);
59 
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62 	unsigned long idx = (unsigned long)set_id;
63 	struct vfio_device_set *new_dev_set;
64 	struct vfio_device_set *dev_set;
65 
66 	if (WARN_ON(!set_id))
67 		return -EINVAL;
68 
69 	/*
70 	 * Atomically acquire a singleton object in the xarray for this set_id
71 	 */
72 	xa_lock(&vfio_device_set_xa);
73 	dev_set = xa_load(&vfio_device_set_xa, idx);
74 	if (dev_set)
75 		goto found_get_ref;
76 	xa_unlock(&vfio_device_set_xa);
77 
78 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79 	if (!new_dev_set)
80 		return -ENOMEM;
81 	mutex_init(&new_dev_set->lock);
82 	INIT_LIST_HEAD(&new_dev_set->device_list);
83 	new_dev_set->set_id = set_id;
84 
85 	xa_lock(&vfio_device_set_xa);
86 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87 			       GFP_KERNEL);
88 	if (!dev_set) {
89 		dev_set = new_dev_set;
90 		goto found_get_ref;
91 	}
92 
93 	kfree(new_dev_set);
94 	if (xa_is_err(dev_set)) {
95 		xa_unlock(&vfio_device_set_xa);
96 		return xa_err(dev_set);
97 	}
98 
99 found_get_ref:
100 	dev_set->device_count++;
101 	xa_unlock(&vfio_device_set_xa);
102 	mutex_lock(&dev_set->lock);
103 	device->dev_set = dev_set;
104 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
105 	mutex_unlock(&dev_set->lock);
106 	return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
109 
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112 	struct vfio_device_set *dev_set = device->dev_set;
113 
114 	if (!dev_set)
115 		return;
116 
117 	mutex_lock(&dev_set->lock);
118 	list_del(&device->dev_set_list);
119 	mutex_unlock(&dev_set->lock);
120 
121 	xa_lock(&vfio_device_set_xa);
122 	if (!--dev_set->device_count) {
123 		__xa_erase(&vfio_device_set_xa,
124 			   (unsigned long)dev_set->set_id);
125 		mutex_destroy(&dev_set->lock);
126 		kfree(dev_set);
127 	}
128 	xa_unlock(&vfio_device_set_xa);
129 }
130 
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133 	struct vfio_device *cur;
134 	unsigned int open_count = 0;
135 
136 	lockdep_assert_held(&dev_set->lock);
137 
138 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139 		open_count += cur->open_count;
140 	return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
143 
144 /*
145  * Device objects - create, release, get, put, search
146  */
147 /* Device reference always implies a group reference */
148 void vfio_device_put_registration(struct vfio_device *device)
149 {
150 	if (refcount_dec_and_test(&device->refcount))
151 		complete(&device->comp);
152 }
153 
154 bool vfio_device_try_get_registration(struct vfio_device *device)
155 {
156 	return refcount_inc_not_zero(&device->refcount);
157 }
158 
159 /*
160  * VFIO driver API
161  */
162 /* Release helper called by vfio_put_device() */
163 static void vfio_device_release(struct device *dev)
164 {
165 	struct vfio_device *device =
166 			container_of(dev, struct vfio_device, device);
167 
168 	vfio_release_device_set(device);
169 	ida_free(&vfio.device_ida, device->index);
170 
171 	if (device->ops->release)
172 		device->ops->release(device);
173 
174 	kvfree(device);
175 }
176 
177 static int vfio_init_device(struct vfio_device *device, struct device *dev,
178 			    const struct vfio_device_ops *ops);
179 
180 /*
181  * Allocate and initialize vfio_device so it can be registered to vfio
182  * core.
183  *
184  * Drivers should use the wrapper vfio_alloc_device() for allocation.
185  * @size is the size of the structure to be allocated, including any
186  * private data used by the driver.
187  *
188  * Driver may provide an @init callback to cover device private data.
189  *
190  * Use vfio_put_device() to release the structure after success return.
191  */
192 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
193 				       const struct vfio_device_ops *ops)
194 {
195 	struct vfio_device *device;
196 	int ret;
197 
198 	if (WARN_ON(size < sizeof(struct vfio_device)))
199 		return ERR_PTR(-EINVAL);
200 
201 	device = kvzalloc(size, GFP_KERNEL);
202 	if (!device)
203 		return ERR_PTR(-ENOMEM);
204 
205 	ret = vfio_init_device(device, dev, ops);
206 	if (ret)
207 		goto out_free;
208 	return device;
209 
210 out_free:
211 	kvfree(device);
212 	return ERR_PTR(ret);
213 }
214 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
215 
216 /*
217  * Initialize a vfio_device so it can be registered to vfio core.
218  */
219 static int vfio_init_device(struct vfio_device *device, struct device *dev,
220 			    const struct vfio_device_ops *ops)
221 {
222 	int ret;
223 
224 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
225 	if (ret < 0) {
226 		dev_dbg(dev, "Error to alloc index\n");
227 		return ret;
228 	}
229 
230 	device->index = ret;
231 	init_completion(&device->comp);
232 	device->dev = dev;
233 	device->ops = ops;
234 
235 	if (ops->init) {
236 		ret = ops->init(device);
237 		if (ret)
238 			goto out_uninit;
239 	}
240 
241 	device_initialize(&device->device);
242 	device->device.release = vfio_device_release;
243 	device->device.class = vfio.device_class;
244 	device->device.parent = device->dev;
245 	return 0;
246 
247 out_uninit:
248 	vfio_release_device_set(device);
249 	ida_free(&vfio.device_ida, device->index);
250 	return ret;
251 }
252 
253 static int __vfio_register_dev(struct vfio_device *device,
254 			       enum vfio_group_type type)
255 {
256 	int ret;
257 
258 	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
259 		    (!device->ops->bind_iommufd ||
260 		     !device->ops->unbind_iommufd ||
261 		     !device->ops->attach_ioas)))
262 		return -EINVAL;
263 
264 	/*
265 	 * If the driver doesn't specify a set then the device is added to a
266 	 * singleton set just for itself.
267 	 */
268 	if (!device->dev_set)
269 		vfio_assign_device_set(device, device);
270 
271 	ret = dev_set_name(&device->device, "vfio%d", device->index);
272 	if (ret)
273 		return ret;
274 
275 	ret = vfio_device_set_group(device, type);
276 	if (ret)
277 		return ret;
278 
279 	ret = device_add(&device->device);
280 	if (ret)
281 		goto err_out;
282 
283 	/* Refcounting can't start until the driver calls register */
284 	refcount_set(&device->refcount, 1);
285 
286 	vfio_device_group_register(device);
287 
288 	return 0;
289 err_out:
290 	vfio_device_remove_group(device);
291 	return ret;
292 }
293 
294 int vfio_register_group_dev(struct vfio_device *device)
295 {
296 	return __vfio_register_dev(device, VFIO_IOMMU);
297 }
298 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
299 
300 /*
301  * Register a virtual device without IOMMU backing.  The user of this
302  * device must not be able to directly trigger unmediated DMA.
303  */
304 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
305 {
306 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
307 }
308 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
309 
310 /*
311  * Decrement the device reference count and wait for the device to be
312  * removed.  Open file descriptors for the device... */
313 void vfio_unregister_group_dev(struct vfio_device *device)
314 {
315 	unsigned int i = 0;
316 	bool interrupted = false;
317 	long rc;
318 
319 	vfio_device_put_registration(device);
320 	rc = try_wait_for_completion(&device->comp);
321 	while (rc <= 0) {
322 		if (device->ops->request)
323 			device->ops->request(device, i++);
324 
325 		if (interrupted) {
326 			rc = wait_for_completion_timeout(&device->comp,
327 							 HZ * 10);
328 		} else {
329 			rc = wait_for_completion_interruptible_timeout(
330 				&device->comp, HZ * 10);
331 			if (rc < 0) {
332 				interrupted = true;
333 				dev_warn(device->dev,
334 					 "Device is currently in use, task"
335 					 " \"%s\" (%d) "
336 					 "blocked until device is released",
337 					 current->comm, task_pid_nr(current));
338 			}
339 		}
340 	}
341 
342 	vfio_device_group_unregister(device);
343 
344 	/* Balances device_add in register path */
345 	device_del(&device->device);
346 
347 	/* Balances vfio_device_set_group in register path */
348 	vfio_device_remove_group(device);
349 }
350 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
351 
352 #ifdef CONFIG_HAVE_KVM
353 void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
354 {
355 	void (*pfn)(struct kvm *kvm);
356 	bool (*fn)(struct kvm *kvm);
357 	bool ret;
358 
359 	lockdep_assert_held(&device->dev_set->lock);
360 
361 	pfn = symbol_get(kvm_put_kvm);
362 	if (WARN_ON(!pfn))
363 		return;
364 
365 	fn = symbol_get(kvm_get_kvm_safe);
366 	if (WARN_ON(!fn)) {
367 		symbol_put(kvm_put_kvm);
368 		return;
369 	}
370 
371 	ret = fn(kvm);
372 	symbol_put(kvm_get_kvm_safe);
373 	if (!ret) {
374 		symbol_put(kvm_put_kvm);
375 		return;
376 	}
377 
378 	device->put_kvm = pfn;
379 	device->kvm = kvm;
380 }
381 
382 void vfio_device_put_kvm(struct vfio_device *device)
383 {
384 	lockdep_assert_held(&device->dev_set->lock);
385 
386 	if (!device->kvm)
387 		return;
388 
389 	if (WARN_ON(!device->put_kvm))
390 		goto clear;
391 
392 	device->put_kvm(device->kvm);
393 	device->put_kvm = NULL;
394 	symbol_put(kvm_put_kvm);
395 
396 clear:
397 	device->kvm = NULL;
398 }
399 #endif
400 
401 /* true if the vfio_device has open_device() called but not close_device() */
402 static bool vfio_assert_device_open(struct vfio_device *device)
403 {
404 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
405 }
406 
407 static int vfio_device_first_open(struct vfio_device *device,
408 				  struct iommufd_ctx *iommufd)
409 {
410 	int ret;
411 
412 	lockdep_assert_held(&device->dev_set->lock);
413 
414 	if (!try_module_get(device->dev->driver->owner))
415 		return -ENODEV;
416 
417 	if (iommufd)
418 		ret = vfio_iommufd_bind(device, iommufd);
419 	else
420 		ret = vfio_device_group_use_iommu(device);
421 	if (ret)
422 		goto err_module_put;
423 
424 	if (device->ops->open_device) {
425 		ret = device->ops->open_device(device);
426 		if (ret)
427 			goto err_unuse_iommu;
428 	}
429 	return 0;
430 
431 err_unuse_iommu:
432 	if (iommufd)
433 		vfio_iommufd_unbind(device);
434 	else
435 		vfio_device_group_unuse_iommu(device);
436 err_module_put:
437 	module_put(device->dev->driver->owner);
438 	return ret;
439 }
440 
441 static void vfio_device_last_close(struct vfio_device *device,
442 				   struct iommufd_ctx *iommufd)
443 {
444 	lockdep_assert_held(&device->dev_set->lock);
445 
446 	if (device->ops->close_device)
447 		device->ops->close_device(device);
448 	if (iommufd)
449 		vfio_iommufd_unbind(device);
450 	else
451 		vfio_device_group_unuse_iommu(device);
452 	module_put(device->dev->driver->owner);
453 }
454 
455 int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
456 {
457 	int ret = 0;
458 
459 	lockdep_assert_held(&device->dev_set->lock);
460 
461 	device->open_count++;
462 	if (device->open_count == 1) {
463 		ret = vfio_device_first_open(device, iommufd);
464 		if (ret)
465 			device->open_count--;
466 	}
467 
468 	return ret;
469 }
470 
471 void vfio_device_close(struct vfio_device *device,
472 		       struct iommufd_ctx *iommufd)
473 {
474 	lockdep_assert_held(&device->dev_set->lock);
475 
476 	vfio_assert_device_open(device);
477 	if (device->open_count == 1)
478 		vfio_device_last_close(device, iommufd);
479 	device->open_count--;
480 }
481 
482 /*
483  * Wrapper around pm_runtime_resume_and_get().
484  * Return error code on failure or 0 on success.
485  */
486 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
487 {
488 	struct device *dev = device->dev;
489 
490 	if (dev->driver && dev->driver->pm) {
491 		int ret;
492 
493 		ret = pm_runtime_resume_and_get(dev);
494 		if (ret) {
495 			dev_info_ratelimited(dev,
496 				"vfio: runtime resume failed %d\n", ret);
497 			return -EIO;
498 		}
499 	}
500 
501 	return 0;
502 }
503 
504 /*
505  * Wrapper around pm_runtime_put().
506  */
507 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
508 {
509 	struct device *dev = device->dev;
510 
511 	if (dev->driver && dev->driver->pm)
512 		pm_runtime_put(dev);
513 }
514 
515 /*
516  * VFIO Device fd
517  */
518 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
519 {
520 	struct vfio_device *device = filep->private_data;
521 
522 	vfio_device_group_close(device);
523 
524 	vfio_device_put_registration(device);
525 
526 	return 0;
527 }
528 
529 /*
530  * vfio_mig_get_next_state - Compute the next step in the FSM
531  * @cur_fsm - The current state the device is in
532  * @new_fsm - The target state to reach
533  * @next_fsm - Pointer to the next step to get to new_fsm
534  *
535  * Return 0 upon success, otherwise -errno
536  * Upon success the next step in the state progression between cur_fsm and
537  * new_fsm will be set in next_fsm.
538  *
539  * This breaks down requests for combination transitions into smaller steps and
540  * returns the next step to get to new_fsm. The function may need to be called
541  * multiple times before reaching new_fsm.
542  *
543  */
544 int vfio_mig_get_next_state(struct vfio_device *device,
545 			    enum vfio_device_mig_state cur_fsm,
546 			    enum vfio_device_mig_state new_fsm,
547 			    enum vfio_device_mig_state *next_fsm)
548 {
549 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
550 	/*
551 	 * The coding in this table requires the driver to implement the
552 	 * following FSM arcs:
553 	 *         RESUMING -> STOP
554 	 *         STOP -> RESUMING
555 	 *         STOP -> STOP_COPY
556 	 *         STOP_COPY -> STOP
557 	 *
558 	 * If P2P is supported then the driver must also implement these FSM
559 	 * arcs:
560 	 *         RUNNING -> RUNNING_P2P
561 	 *         RUNNING_P2P -> RUNNING
562 	 *         RUNNING_P2P -> STOP
563 	 *         STOP -> RUNNING_P2P
564 	 *
565 	 * If precopy is supported then the driver must support these additional
566 	 * FSM arcs:
567 	 *         RUNNING -> PRE_COPY
568 	 *         PRE_COPY -> RUNNING
569 	 *         PRE_COPY -> STOP_COPY
570 	 * However, if precopy and P2P are supported together then the driver
571 	 * must support these additional arcs beyond the P2P arcs above:
572 	 *         PRE_COPY -> RUNNING
573 	 *         PRE_COPY -> PRE_COPY_P2P
574 	 *         PRE_COPY_P2P -> PRE_COPY
575 	 *         PRE_COPY_P2P -> RUNNING_P2P
576 	 *         PRE_COPY_P2P -> STOP_COPY
577 	 *         RUNNING -> PRE_COPY
578 	 *         RUNNING_P2P -> PRE_COPY_P2P
579 	 *
580 	 * Without P2P and precopy the driver must implement:
581 	 *         RUNNING -> STOP
582 	 *         STOP -> RUNNING
583 	 *
584 	 * The coding will step through multiple states for some combination
585 	 * transitions; if all optional features are supported, this means the
586 	 * following ones:
587 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
588 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
589 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
590 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
591 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
592 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
593 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
594 	 *         RESUMING -> STOP -> RUNNING_P2P
595 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
596 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
597 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
598 	 *         RESUMING -> STOP -> STOP_COPY
599 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
600 	 *         RUNNING -> RUNNING_P2P -> STOP
601 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
602 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
603 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
604 	 *         RUNNING_P2P -> STOP -> RESUMING
605 	 *         RUNNING_P2P -> STOP -> STOP_COPY
606 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
607 	 *         STOP -> RUNNING_P2P -> RUNNING
608 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
609 	 *         STOP_COPY -> STOP -> RESUMING
610 	 *         STOP_COPY -> STOP -> RUNNING_P2P
611 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
612 	 *
613 	 *  The following transitions are blocked:
614 	 *         STOP_COPY -> PRE_COPY
615 	 *         STOP_COPY -> PRE_COPY_P2P
616 	 */
617 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
618 		[VFIO_DEVICE_STATE_STOP] = {
619 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
620 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
621 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
622 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
623 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
624 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
625 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
626 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
627 		},
628 		[VFIO_DEVICE_STATE_RUNNING] = {
629 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
630 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
631 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
632 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
633 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
634 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
635 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
636 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
637 		},
638 		[VFIO_DEVICE_STATE_PRE_COPY] = {
639 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
640 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
641 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
642 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
643 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
644 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
645 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
646 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
647 		},
648 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
649 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
650 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
651 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
652 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
653 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
654 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
655 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
656 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
657 		},
658 		[VFIO_DEVICE_STATE_STOP_COPY] = {
659 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
660 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
661 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
662 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
663 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
664 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
665 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
666 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
667 		},
668 		[VFIO_DEVICE_STATE_RESUMING] = {
669 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
670 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
671 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
672 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
673 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
674 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
675 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
676 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
677 		},
678 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
679 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
680 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
681 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
682 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
683 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
684 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
685 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
686 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
687 		},
688 		[VFIO_DEVICE_STATE_ERROR] = {
689 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
690 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
691 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
692 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
693 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
694 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
695 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
696 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
697 		},
698 	};
699 
700 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
701 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
702 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
703 		[VFIO_DEVICE_STATE_PRE_COPY] =
704 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
705 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
706 						   VFIO_MIGRATION_P2P |
707 						   VFIO_MIGRATION_PRE_COPY,
708 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
709 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
710 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
711 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
712 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
713 	};
714 
715 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
716 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
717 			state_flags_table[cur_fsm]))
718 		return -EINVAL;
719 
720 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
721 	   (state_flags_table[new_fsm] & device->migration_flags) !=
722 			state_flags_table[new_fsm])
723 		return -EINVAL;
724 
725 	/*
726 	 * Arcs touching optional and unsupported states are skipped over. The
727 	 * driver will instead see an arc from the original state to the next
728 	 * logical state, as per the above comment.
729 	 */
730 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
731 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
732 			state_flags_table[*next_fsm])
733 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
734 
735 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
736 }
737 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
738 
739 /*
740  * Convert the drivers's struct file into a FD number and return it to userspace
741  */
742 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
743 				   struct vfio_device_feature_mig_state *mig)
744 {
745 	int ret;
746 	int fd;
747 
748 	fd = get_unused_fd_flags(O_CLOEXEC);
749 	if (fd < 0) {
750 		ret = fd;
751 		goto out_fput;
752 	}
753 
754 	mig->data_fd = fd;
755 	if (copy_to_user(arg, mig, sizeof(*mig))) {
756 		ret = -EFAULT;
757 		goto out_put_unused;
758 	}
759 	fd_install(fd, filp);
760 	return 0;
761 
762 out_put_unused:
763 	put_unused_fd(fd);
764 out_fput:
765 	fput(filp);
766 	return ret;
767 }
768 
769 static int
770 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
771 					   u32 flags, void __user *arg,
772 					   size_t argsz)
773 {
774 	size_t minsz =
775 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
776 	struct vfio_device_feature_mig_state mig;
777 	struct file *filp = NULL;
778 	int ret;
779 
780 	if (!device->mig_ops)
781 		return -ENOTTY;
782 
783 	ret = vfio_check_feature(flags, argsz,
784 				 VFIO_DEVICE_FEATURE_SET |
785 				 VFIO_DEVICE_FEATURE_GET,
786 				 sizeof(mig));
787 	if (ret != 1)
788 		return ret;
789 
790 	if (copy_from_user(&mig, arg, minsz))
791 		return -EFAULT;
792 
793 	if (flags & VFIO_DEVICE_FEATURE_GET) {
794 		enum vfio_device_mig_state curr_state;
795 
796 		ret = device->mig_ops->migration_get_state(device,
797 							   &curr_state);
798 		if (ret)
799 			return ret;
800 		mig.device_state = curr_state;
801 		goto out_copy;
802 	}
803 
804 	/* Handle the VFIO_DEVICE_FEATURE_SET */
805 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
806 	if (IS_ERR(filp) || !filp)
807 		goto out_copy;
808 
809 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
810 out_copy:
811 	mig.data_fd = -1;
812 	if (copy_to_user(arg, &mig, sizeof(mig)))
813 		return -EFAULT;
814 	if (IS_ERR(filp))
815 		return PTR_ERR(filp);
816 	return 0;
817 }
818 
819 static int
820 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
821 					      u32 flags, void __user *arg,
822 					      size_t argsz)
823 {
824 	struct vfio_device_feature_mig_data_size data_size = {};
825 	unsigned long stop_copy_length;
826 	int ret;
827 
828 	if (!device->mig_ops)
829 		return -ENOTTY;
830 
831 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
832 				 sizeof(data_size));
833 	if (ret != 1)
834 		return ret;
835 
836 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
837 	if (ret)
838 		return ret;
839 
840 	data_size.stop_copy_length = stop_copy_length;
841 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
842 		return -EFAULT;
843 
844 	return 0;
845 }
846 
847 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
848 					       u32 flags, void __user *arg,
849 					       size_t argsz)
850 {
851 	struct vfio_device_feature_migration mig = {
852 		.flags = device->migration_flags,
853 	};
854 	int ret;
855 
856 	if (!device->mig_ops)
857 		return -ENOTTY;
858 
859 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
860 				 sizeof(mig));
861 	if (ret != 1)
862 		return ret;
863 	if (copy_to_user(arg, &mig, sizeof(mig)))
864 		return -EFAULT;
865 	return 0;
866 }
867 
868 /* Ranges should fit into a single kernel page */
869 #define LOG_MAX_RANGES \
870 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
871 
872 static int
873 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
874 					u32 flags, void __user *arg,
875 					size_t argsz)
876 {
877 	size_t minsz =
878 		offsetofend(struct vfio_device_feature_dma_logging_control,
879 			    ranges);
880 	struct vfio_device_feature_dma_logging_range __user *ranges;
881 	struct vfio_device_feature_dma_logging_control control;
882 	struct vfio_device_feature_dma_logging_range range;
883 	struct rb_root_cached root = RB_ROOT_CACHED;
884 	struct interval_tree_node *nodes;
885 	u64 iova_end;
886 	u32 nnodes;
887 	int i, ret;
888 
889 	if (!device->log_ops)
890 		return -ENOTTY;
891 
892 	ret = vfio_check_feature(flags, argsz,
893 				 VFIO_DEVICE_FEATURE_SET,
894 				 sizeof(control));
895 	if (ret != 1)
896 		return ret;
897 
898 	if (copy_from_user(&control, arg, minsz))
899 		return -EFAULT;
900 
901 	nnodes = control.num_ranges;
902 	if (!nnodes)
903 		return -EINVAL;
904 
905 	if (nnodes > LOG_MAX_RANGES)
906 		return -E2BIG;
907 
908 	ranges = u64_to_user_ptr(control.ranges);
909 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
910 			      GFP_KERNEL);
911 	if (!nodes)
912 		return -ENOMEM;
913 
914 	for (i = 0; i < nnodes; i++) {
915 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
916 			ret = -EFAULT;
917 			goto end;
918 		}
919 		if (!IS_ALIGNED(range.iova, control.page_size) ||
920 		    !IS_ALIGNED(range.length, control.page_size)) {
921 			ret = -EINVAL;
922 			goto end;
923 		}
924 
925 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
926 		    iova_end > ULONG_MAX) {
927 			ret = -EOVERFLOW;
928 			goto end;
929 		}
930 
931 		nodes[i].start = range.iova;
932 		nodes[i].last = range.iova + range.length - 1;
933 		if (interval_tree_iter_first(&root, nodes[i].start,
934 					     nodes[i].last)) {
935 			/* Range overlapping */
936 			ret = -EINVAL;
937 			goto end;
938 		}
939 		interval_tree_insert(nodes + i, &root);
940 	}
941 
942 	ret = device->log_ops->log_start(device, &root, nnodes,
943 					 &control.page_size);
944 	if (ret)
945 		goto end;
946 
947 	if (copy_to_user(arg, &control, sizeof(control))) {
948 		ret = -EFAULT;
949 		device->log_ops->log_stop(device);
950 	}
951 
952 end:
953 	kfree(nodes);
954 	return ret;
955 }
956 
957 static int
958 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
959 				       u32 flags, void __user *arg,
960 				       size_t argsz)
961 {
962 	int ret;
963 
964 	if (!device->log_ops)
965 		return -ENOTTY;
966 
967 	ret = vfio_check_feature(flags, argsz,
968 				 VFIO_DEVICE_FEATURE_SET, 0);
969 	if (ret != 1)
970 		return ret;
971 
972 	return device->log_ops->log_stop(device);
973 }
974 
975 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
976 					  unsigned long iova, size_t length,
977 					  void *opaque)
978 {
979 	struct vfio_device *device = opaque;
980 
981 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
982 }
983 
984 static int
985 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
986 					 u32 flags, void __user *arg,
987 					 size_t argsz)
988 {
989 	size_t minsz =
990 		offsetofend(struct vfio_device_feature_dma_logging_report,
991 			    bitmap);
992 	struct vfio_device_feature_dma_logging_report report;
993 	struct iova_bitmap *iter;
994 	u64 iova_end;
995 	int ret;
996 
997 	if (!device->log_ops)
998 		return -ENOTTY;
999 
1000 	ret = vfio_check_feature(flags, argsz,
1001 				 VFIO_DEVICE_FEATURE_GET,
1002 				 sizeof(report));
1003 	if (ret != 1)
1004 		return ret;
1005 
1006 	if (copy_from_user(&report, arg, minsz))
1007 		return -EFAULT;
1008 
1009 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1010 		return -EINVAL;
1011 
1012 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1013 	    iova_end > ULONG_MAX)
1014 		return -EOVERFLOW;
1015 
1016 	iter = iova_bitmap_alloc(report.iova, report.length,
1017 				 report.page_size,
1018 				 u64_to_user_ptr(report.bitmap));
1019 	if (IS_ERR(iter))
1020 		return PTR_ERR(iter);
1021 
1022 	ret = iova_bitmap_for_each(iter, device,
1023 				   vfio_device_log_read_and_clear);
1024 
1025 	iova_bitmap_free(iter);
1026 	return ret;
1027 }
1028 
1029 static int vfio_ioctl_device_feature(struct vfio_device *device,
1030 				     struct vfio_device_feature __user *arg)
1031 {
1032 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1033 	struct vfio_device_feature feature;
1034 
1035 	if (copy_from_user(&feature, arg, minsz))
1036 		return -EFAULT;
1037 
1038 	if (feature.argsz < minsz)
1039 		return -EINVAL;
1040 
1041 	/* Check unknown flags */
1042 	if (feature.flags &
1043 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1044 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1045 		return -EINVAL;
1046 
1047 	/* GET & SET are mutually exclusive except with PROBE */
1048 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1049 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1050 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1051 		return -EINVAL;
1052 
1053 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1054 	case VFIO_DEVICE_FEATURE_MIGRATION:
1055 		return vfio_ioctl_device_feature_migration(
1056 			device, feature.flags, arg->data,
1057 			feature.argsz - minsz);
1058 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1059 		return vfio_ioctl_device_feature_mig_device_state(
1060 			device, feature.flags, arg->data,
1061 			feature.argsz - minsz);
1062 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1063 		return vfio_ioctl_device_feature_logging_start(
1064 			device, feature.flags, arg->data,
1065 			feature.argsz - minsz);
1066 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1067 		return vfio_ioctl_device_feature_logging_stop(
1068 			device, feature.flags, arg->data,
1069 			feature.argsz - minsz);
1070 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1071 		return vfio_ioctl_device_feature_logging_report(
1072 			device, feature.flags, arg->data,
1073 			feature.argsz - minsz);
1074 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1075 		return vfio_ioctl_device_feature_migration_data_size(
1076 			device, feature.flags, arg->data,
1077 			feature.argsz - minsz);
1078 	default:
1079 		if (unlikely(!device->ops->device_feature))
1080 			return -EINVAL;
1081 		return device->ops->device_feature(device, feature.flags,
1082 						   arg->data,
1083 						   feature.argsz - minsz);
1084 	}
1085 }
1086 
1087 static long vfio_device_fops_unl_ioctl(struct file *filep,
1088 				       unsigned int cmd, unsigned long arg)
1089 {
1090 	struct vfio_device *device = filep->private_data;
1091 	int ret;
1092 
1093 	ret = vfio_device_pm_runtime_get(device);
1094 	if (ret)
1095 		return ret;
1096 
1097 	switch (cmd) {
1098 	case VFIO_DEVICE_FEATURE:
1099 		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1100 		break;
1101 
1102 	default:
1103 		if (unlikely(!device->ops->ioctl))
1104 			ret = -EINVAL;
1105 		else
1106 			ret = device->ops->ioctl(device, cmd, arg);
1107 		break;
1108 	}
1109 
1110 	vfio_device_pm_runtime_put(device);
1111 	return ret;
1112 }
1113 
1114 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1115 				     size_t count, loff_t *ppos)
1116 {
1117 	struct vfio_device *device = filep->private_data;
1118 
1119 	if (unlikely(!device->ops->read))
1120 		return -EINVAL;
1121 
1122 	return device->ops->read(device, buf, count, ppos);
1123 }
1124 
1125 static ssize_t vfio_device_fops_write(struct file *filep,
1126 				      const char __user *buf,
1127 				      size_t count, loff_t *ppos)
1128 {
1129 	struct vfio_device *device = filep->private_data;
1130 
1131 	if (unlikely(!device->ops->write))
1132 		return -EINVAL;
1133 
1134 	return device->ops->write(device, buf, count, ppos);
1135 }
1136 
1137 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1138 {
1139 	struct vfio_device *device = filep->private_data;
1140 
1141 	if (unlikely(!device->ops->mmap))
1142 		return -EINVAL;
1143 
1144 	return device->ops->mmap(device, vma);
1145 }
1146 
1147 const struct file_operations vfio_device_fops = {
1148 	.owner		= THIS_MODULE,
1149 	.release	= vfio_device_fops_release,
1150 	.read		= vfio_device_fops_read,
1151 	.write		= vfio_device_fops_write,
1152 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1153 	.compat_ioctl	= compat_ptr_ioctl,
1154 	.mmap		= vfio_device_fops_mmap,
1155 };
1156 
1157 /*
1158  * Sub-module support
1159  */
1160 /*
1161  * Helper for managing a buffer of info chain capabilities, allocate or
1162  * reallocate a buffer with additional @size, filling in @id and @version
1163  * of the capability.  A pointer to the new capability is returned.
1164  *
1165  * NB. The chain is based at the head of the buffer, so new entries are
1166  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1167  * next offsets prior to copying to the user buffer.
1168  */
1169 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1170 					       size_t size, u16 id, u16 version)
1171 {
1172 	void *buf;
1173 	struct vfio_info_cap_header *header, *tmp;
1174 
1175 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1176 	if (!buf) {
1177 		kfree(caps->buf);
1178 		caps->buf = NULL;
1179 		caps->size = 0;
1180 		return ERR_PTR(-ENOMEM);
1181 	}
1182 
1183 	caps->buf = buf;
1184 	header = buf + caps->size;
1185 
1186 	/* Eventually copied to user buffer, zero */
1187 	memset(header, 0, size);
1188 
1189 	header->id = id;
1190 	header->version = version;
1191 
1192 	/* Add to the end of the capability chain */
1193 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1194 		; /* nothing */
1195 
1196 	tmp->next = caps->size;
1197 	caps->size += size;
1198 
1199 	return header;
1200 }
1201 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1202 
1203 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1204 {
1205 	struct vfio_info_cap_header *tmp;
1206 	void *buf = (void *)caps->buf;
1207 
1208 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1209 		tmp->next += offset;
1210 }
1211 EXPORT_SYMBOL(vfio_info_cap_shift);
1212 
1213 int vfio_info_add_capability(struct vfio_info_cap *caps,
1214 			     struct vfio_info_cap_header *cap, size_t size)
1215 {
1216 	struct vfio_info_cap_header *header;
1217 
1218 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1219 	if (IS_ERR(header))
1220 		return PTR_ERR(header);
1221 
1222 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1223 
1224 	return 0;
1225 }
1226 EXPORT_SYMBOL(vfio_info_add_capability);
1227 
1228 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1229 				       int max_irq_type, size_t *data_size)
1230 {
1231 	unsigned long minsz;
1232 	size_t size;
1233 
1234 	minsz = offsetofend(struct vfio_irq_set, count);
1235 
1236 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1237 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1238 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1239 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1240 		return -EINVAL;
1241 
1242 	if (data_size)
1243 		*data_size = 0;
1244 
1245 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1246 		return -EINVAL;
1247 
1248 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1249 	case VFIO_IRQ_SET_DATA_NONE:
1250 		size = 0;
1251 		break;
1252 	case VFIO_IRQ_SET_DATA_BOOL:
1253 		size = sizeof(uint8_t);
1254 		break;
1255 	case VFIO_IRQ_SET_DATA_EVENTFD:
1256 		size = sizeof(int32_t);
1257 		break;
1258 	default:
1259 		return -EINVAL;
1260 	}
1261 
1262 	if (size) {
1263 		if (hdr->argsz - minsz < hdr->count * size)
1264 			return -EINVAL;
1265 
1266 		if (!data_size)
1267 			return -EINVAL;
1268 
1269 		*data_size = hdr->count * size;
1270 	}
1271 
1272 	return 0;
1273 }
1274 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1275 
1276 /*
1277  * Pin contiguous user pages and return their associated host pages for local
1278  * domain only.
1279  * @device [in]  : device
1280  * @iova [in]    : starting IOVA of user pages to be pinned.
1281  * @npage [in]   : count of pages to be pinned.  This count should not
1282  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1283  * @prot [in]    : protection flags
1284  * @pages[out]   : array of host pages
1285  * Return error or number of pages pinned.
1286  *
1287  * A driver may only call this function if the vfio_device was created
1288  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1289  */
1290 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1291 		   int npage, int prot, struct page **pages)
1292 {
1293 	/* group->container cannot change while a vfio device is open */
1294 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1295 		return -EINVAL;
1296 	if (vfio_device_has_container(device))
1297 		return vfio_device_container_pin_pages(device, iova,
1298 						       npage, prot, pages);
1299 	if (device->iommufd_access) {
1300 		int ret;
1301 
1302 		if (iova > ULONG_MAX)
1303 			return -EINVAL;
1304 		/*
1305 		 * VFIO ignores the sub page offset, npages is from the start of
1306 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1307 		 * the sub page offset by doing:
1308 		 *     pages[0] + (iova % PAGE_SIZE)
1309 		 */
1310 		ret = iommufd_access_pin_pages(
1311 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1312 			npage * PAGE_SIZE, pages,
1313 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1314 		if (ret)
1315 			return ret;
1316 		return npage;
1317 	}
1318 	return -EINVAL;
1319 }
1320 EXPORT_SYMBOL(vfio_pin_pages);
1321 
1322 /*
1323  * Unpin contiguous host pages for local domain only.
1324  * @device [in]  : device
1325  * @iova [in]    : starting address of user pages to be unpinned.
1326  * @npage [in]   : count of pages to be unpinned.  This count should not
1327  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1328  */
1329 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1330 {
1331 	if (WARN_ON(!vfio_assert_device_open(device)))
1332 		return;
1333 
1334 	if (vfio_device_has_container(device)) {
1335 		vfio_device_container_unpin_pages(device, iova, npage);
1336 		return;
1337 	}
1338 	if (device->iommufd_access) {
1339 		if (WARN_ON(iova > ULONG_MAX))
1340 			return;
1341 		iommufd_access_unpin_pages(device->iommufd_access,
1342 					   ALIGN_DOWN(iova, PAGE_SIZE),
1343 					   npage * PAGE_SIZE);
1344 		return;
1345 	}
1346 }
1347 EXPORT_SYMBOL(vfio_unpin_pages);
1348 
1349 /*
1350  * This interface allows the CPUs to perform some sort of virtual DMA on
1351  * behalf of the device.
1352  *
1353  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1354  * into/from a kernel buffer.
1355  *
1356  * As the read/write of user space memory is conducted via the CPUs and is
1357  * not a real device DMA, it is not necessary to pin the user space memory.
1358  *
1359  * @device [in]		: VFIO device
1360  * @iova [in]		: base IOVA of a user space buffer
1361  * @data [in]		: pointer to kernel buffer
1362  * @len [in]		: kernel buffer length
1363  * @write		: indicate read or write
1364  * Return error code on failure or 0 on success.
1365  */
1366 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1367 		size_t len, bool write)
1368 {
1369 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1370 		return -EINVAL;
1371 
1372 	if (vfio_device_has_container(device))
1373 		return vfio_device_container_dma_rw(device, iova,
1374 						    data, len, write);
1375 
1376 	if (device->iommufd_access) {
1377 		unsigned int flags = 0;
1378 
1379 		if (iova > ULONG_MAX)
1380 			return -EINVAL;
1381 
1382 		/* VFIO historically tries to auto-detect a kthread */
1383 		if (!current->mm)
1384 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1385 		if (write)
1386 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1387 		return iommufd_access_rw(device->iommufd_access, iova, data,
1388 					 len, flags);
1389 	}
1390 	return -EINVAL;
1391 }
1392 EXPORT_SYMBOL(vfio_dma_rw);
1393 
1394 /*
1395  * Module/class support
1396  */
1397 static int __init vfio_init(void)
1398 {
1399 	int ret;
1400 
1401 	ida_init(&vfio.device_ida);
1402 
1403 	ret = vfio_group_init();
1404 	if (ret)
1405 		return ret;
1406 
1407 	ret = vfio_virqfd_init();
1408 	if (ret)
1409 		goto err_virqfd;
1410 
1411 	/* /sys/class/vfio-dev/vfioX */
1412 	vfio.device_class = class_create("vfio-dev");
1413 	if (IS_ERR(vfio.device_class)) {
1414 		ret = PTR_ERR(vfio.device_class);
1415 		goto err_dev_class;
1416 	}
1417 
1418 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1419 	return 0;
1420 
1421 err_dev_class:
1422 	vfio_virqfd_exit();
1423 err_virqfd:
1424 	vfio_group_cleanup();
1425 	return ret;
1426 }
1427 
1428 static void __exit vfio_cleanup(void)
1429 {
1430 	ida_destroy(&vfio.device_ida);
1431 	class_destroy(vfio.device_class);
1432 	vfio.device_class = NULL;
1433 	vfio_virqfd_exit();
1434 	vfio_group_cleanup();
1435 	xa_destroy(&vfio_device_set_xa);
1436 }
1437 
1438 module_init(vfio_init);
1439 module_exit(vfio_cleanup);
1440 
1441 MODULE_VERSION(DRIVER_VERSION);
1442 MODULE_LICENSE("GPL v2");
1443 MODULE_AUTHOR(DRIVER_AUTHOR);
1444 MODULE_DESCRIPTION(DRIVER_DESC);
1445 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1446