xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision 9e6bfd42b14b45737cae8bc84c759f1874949b8b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #include <linux/list.h>
20 #include <linux/miscdevice.h>
21 #include <linux/module.h>
22 #include <linux/mutex.h>
23 #include <linux/pci.h>
24 #include <linux/rwsem.h>
25 #include <linux/sched.h>
26 #include <linux/slab.h>
27 #include <linux/stat.h>
28 #include <linux/string.h>
29 #include <linux/uaccess.h>
30 #include <linux/vfio.h>
31 #include <linux/wait.h>
32 #include <linux/sched/signal.h>
33 #include <linux/pm_runtime.h>
34 #include <linux/interval_tree.h>
35 #include <linux/iova_bitmap.h>
36 #include <linux/iommufd.h>
37 #include "vfio.h"
38 
39 #define DRIVER_VERSION	"0.3"
40 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
41 #define DRIVER_DESC	"VFIO - User Level meta-driver"
42 
43 static struct vfio {
44 	struct class			*device_class;
45 	struct ida			device_ida;
46 } vfio;
47 
48 #ifdef CONFIG_VFIO_NOIOMMU
49 bool vfio_noiommu __read_mostly;
50 module_param_named(enable_unsafe_noiommu_mode,
51 		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
52 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
53 #endif
54 
55 static DEFINE_XARRAY(vfio_device_set_xa);
56 
57 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
58 {
59 	unsigned long idx = (unsigned long)set_id;
60 	struct vfio_device_set *new_dev_set;
61 	struct vfio_device_set *dev_set;
62 
63 	if (WARN_ON(!set_id))
64 		return -EINVAL;
65 
66 	/*
67 	 * Atomically acquire a singleton object in the xarray for this set_id
68 	 */
69 	xa_lock(&vfio_device_set_xa);
70 	dev_set = xa_load(&vfio_device_set_xa, idx);
71 	if (dev_set)
72 		goto found_get_ref;
73 	xa_unlock(&vfio_device_set_xa);
74 
75 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
76 	if (!new_dev_set)
77 		return -ENOMEM;
78 	mutex_init(&new_dev_set->lock);
79 	INIT_LIST_HEAD(&new_dev_set->device_list);
80 	new_dev_set->set_id = set_id;
81 
82 	xa_lock(&vfio_device_set_xa);
83 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
84 			       GFP_KERNEL);
85 	if (!dev_set) {
86 		dev_set = new_dev_set;
87 		goto found_get_ref;
88 	}
89 
90 	kfree(new_dev_set);
91 	if (xa_is_err(dev_set)) {
92 		xa_unlock(&vfio_device_set_xa);
93 		return xa_err(dev_set);
94 	}
95 
96 found_get_ref:
97 	dev_set->device_count++;
98 	xa_unlock(&vfio_device_set_xa);
99 	mutex_lock(&dev_set->lock);
100 	device->dev_set = dev_set;
101 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
102 	mutex_unlock(&dev_set->lock);
103 	return 0;
104 }
105 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
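/*
 * Illustrative sketch, not part of this file: how a hypothetical PCI driver
 * might pick a set_id during probe so that devices which can only be reset
 * together share one dev_set.  The names my_device/my and the helper below
 * are assumptions for illustration only.
 *
 *	static int my_assign_set(struct my_device *my, struct pci_dev *pdev)
 *	{
 *		// Devices behind the same resettable slot or bus share a set;
 *		// any unique pointer works as the set_id key.
 *		if (pdev->slot)
 *			return vfio_assign_device_set(&my->vdev, pdev->slot);
 *		return vfio_assign_device_set(&my->vdev, pdev->bus);
 *	}
 */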
106 
107 static void vfio_release_device_set(struct vfio_device *device)
108 {
109 	struct vfio_device_set *dev_set = device->dev_set;
110 
111 	if (!dev_set)
112 		return;
113 
114 	mutex_lock(&dev_set->lock);
115 	list_del(&device->dev_set_list);
116 	mutex_unlock(&dev_set->lock);
117 
118 	xa_lock(&vfio_device_set_xa);
119 	if (!--dev_set->device_count) {
120 		__xa_erase(&vfio_device_set_xa,
121 			   (unsigned long)dev_set->set_id);
122 		mutex_destroy(&dev_set->lock);
123 		kfree(dev_set);
124 	}
125 	xa_unlock(&vfio_device_set_xa);
126 }
127 
128 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
129 {
130 	struct vfio_device *cur;
131 	unsigned int open_count = 0;
132 
133 	lockdep_assert_held(&dev_set->lock);
134 
135 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
136 		open_count += cur->open_count;
137 	return open_count;
138 }
139 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
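/*
 * Illustrative sketch, assumed usage only: a driver coordinating a set-wide
 * operation can check, while holding dev_set->lock, whether any device FD
 * in the set is currently open.
 *
 *	mutex_lock(&dev_set->lock);
 *	if (vfio_device_set_open_count(dev_set)) {
 *		// at least one device in the set is open; defer the operation
 *	}
 *	mutex_unlock(&dev_set->lock);
 */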
140 
141 /*
142  * Device objects - create, release, get, put, search
143  */
144 /* Device reference always implies a group reference */
145 void vfio_device_put_registration(struct vfio_device *device)
146 {
147 	if (refcount_dec_and_test(&device->refcount))
148 		complete(&device->comp);
149 }
150 
151 bool vfio_device_try_get_registration(struct vfio_device *device)
152 {
153 	return refcount_inc_not_zero(&device->refcount);
154 }
155 
156 /*
157  * VFIO driver API
158  */
159 /* Release helper called by vfio_put_device() */
160 static void vfio_device_release(struct device *dev)
161 {
162 	struct vfio_device *device =
163 			container_of(dev, struct vfio_device, device);
164 
165 	vfio_release_device_set(device);
166 	ida_free(&vfio.device_ida, device->index);
167 
168 	if (device->ops->release)
169 		device->ops->release(device);
170 
171 	kvfree(device);
172 }
173 
174 static int vfio_init_device(struct vfio_device *device, struct device *dev,
175 			    const struct vfio_device_ops *ops);
176 
177 /*
178  * Allocate and initialize vfio_device so it can be registered to vfio
179  * core.
180  *
181  * Drivers should use the wrapper vfio_alloc_device() for allocation.
182  * @size is the size of the structure to be allocated, including any
183  * private data used by the driver.
184  *
 185  * Driver may provide an @init callback to initialize device private data.
186  *
 187  * Use vfio_put_device() to release the structure after a successful return.
188  */
189 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
190 				       const struct vfio_device_ops *ops)
191 {
192 	struct vfio_device *device;
193 	int ret;
194 
195 	if (WARN_ON(size < sizeof(struct vfio_device)))
196 		return ERR_PTR(-EINVAL);
197 
198 	device = kvzalloc(size, GFP_KERNEL);
199 	if (!device)
200 		return ERR_PTR(-ENOMEM);
201 
202 	ret = vfio_init_device(device, dev, ops);
203 	if (ret)
204 		goto out_free;
205 	return device;
206 
207 out_free:
208 	kvfree(device);
209 	return ERR_PTR(ret);
210 }
211 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
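/*
 * Illustrative sketch, assumed driver-side usage: a driver embeds struct
 * vfio_device as the first member of its own state and allocates it through
 * the vfio_alloc_device() wrapper.  struct my_device, my_ops and the error
 * path below are hypothetical.
 *
 *	struct my_device {
 *		struct vfio_device vdev;	// must be the first member
 *		void *priv;
 *	};
 *
 *	struct my_device *my;
 *
 *	my = vfio_alloc_device(my_device, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);	// balances the allocation on teardown
 */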
212 
213 /*
214  * Initialize a vfio_device so it can be registered to vfio core.
215  */
216 static int vfio_init_device(struct vfio_device *device, struct device *dev,
217 			    const struct vfio_device_ops *ops)
218 {
219 	int ret;
220 
221 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
222 	if (ret < 0) {
 223 		dev_dbg(dev, "Failed to allocate device index\n");
224 		return ret;
225 	}
226 
227 	device->index = ret;
228 	init_completion(&device->comp);
229 	device->dev = dev;
230 	device->ops = ops;
231 
232 	if (ops->init) {
233 		ret = ops->init(device);
234 		if (ret)
235 			goto out_uninit;
236 	}
237 
238 	device_initialize(&device->device);
239 	device->device.release = vfio_device_release;
240 	device->device.class = vfio.device_class;
241 	device->device.parent = device->dev;
242 	return 0;
243 
244 out_uninit:
245 	vfio_release_device_set(device);
246 	ida_free(&vfio.device_ida, device->index);
247 	return ret;
248 }
249 
250 static int __vfio_register_dev(struct vfio_device *device,
251 			       enum vfio_group_type type)
252 {
253 	int ret;
254 
255 	if (WARN_ON(device->ops->bind_iommufd &&
256 		    (!device->ops->unbind_iommufd ||
257 		     !device->ops->attach_ioas)))
258 		return -EINVAL;
259 
260 	/*
261 	 * If the driver doesn't specify a set then the device is added to a
262 	 * singleton set just for itself.
263 	 */
264 	if (!device->dev_set)
265 		vfio_assign_device_set(device, device);
266 
267 	ret = dev_set_name(&device->device, "vfio%d", device->index);
268 	if (ret)
269 		return ret;
270 
271 	ret = vfio_device_set_group(device, type);
272 	if (ret)
273 		return ret;
274 
275 	ret = device_add(&device->device);
276 	if (ret)
277 		goto err_out;
278 
279 	/* Refcounting can't start until the driver calls register */
280 	refcount_set(&device->refcount, 1);
281 
282 	vfio_device_group_register(device);
283 
284 	return 0;
285 err_out:
286 	vfio_device_remove_group(device);
287 	return ret;
288 }
289 
290 int vfio_register_group_dev(struct vfio_device *device)
291 {
292 	return __vfio_register_dev(device, VFIO_IOMMU);
293 }
294 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
295 
296 /*
297  * Register a virtual device without IOMMU backing.  The user of this
298  * device must not be able to directly trigger unmediated DMA.
299  */
300 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
301 {
302 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
303 }
304 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
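/*
 * Illustrative sketch, assumed usage: after allocation and driver setup, a
 * physical device is exposed with vfio_register_group_dev() and torn down
 * with vfio_unregister_group_dev(); drivers that fully mediate DMA register
 * with vfio_register_emulated_iommu_dev() instead.  "my" is hypothetical.
 *
 *	ret = vfio_register_group_dev(&my->vdev);
 *	if (ret)
 *		goto err_put;
 *	...
 *	vfio_unregister_group_dev(&my->vdev);
 *	vfio_put_device(&my->vdev);
 */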
305 
306 /*
307  * Decrement the device reference count and wait for the device to be
308  * removed.  Open file descriptors for the device... */
309 void vfio_unregister_group_dev(struct vfio_device *device)
310 {
311 	unsigned int i = 0;
312 	bool interrupted = false;
313 	long rc;
314 
315 	vfio_device_put_registration(device);
316 	rc = try_wait_for_completion(&device->comp);
317 	while (rc <= 0) {
318 		if (device->ops->request)
319 			device->ops->request(device, i++);
320 
321 		if (interrupted) {
322 			rc = wait_for_completion_timeout(&device->comp,
323 							 HZ * 10);
324 		} else {
325 			rc = wait_for_completion_interruptible_timeout(
326 				&device->comp, HZ * 10);
327 			if (rc < 0) {
328 				interrupted = true;
329 				dev_warn(device->dev,
330 					 "Device is currently in use, task"
331 					 " \"%s\" (%d) "
332 					 "blocked until device is released",
333 					 current->comm, task_pid_nr(current));
334 			}
335 		}
336 	}
337 
338 	vfio_device_group_unregister(device);
339 
340 	/* Balances device_add in register path */
341 	device_del(&device->device);
342 
343 	/* Balances vfio_device_set_group in register path */
344 	vfio_device_remove_group(device);
345 }
346 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
347 
348 /* true if the vfio_device has open_device() called but not close_device() */
349 static bool vfio_assert_device_open(struct vfio_device *device)
350 {
351 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
352 }
353 
354 static int vfio_device_first_open(struct vfio_device *device,
355 				  struct iommufd_ctx *iommufd, struct kvm *kvm)
356 {
357 	int ret;
358 
359 	lockdep_assert_held(&device->dev_set->lock);
360 
361 	if (!try_module_get(device->dev->driver->owner))
362 		return -ENODEV;
363 
364 	if (iommufd)
365 		ret = vfio_iommufd_bind(device, iommufd);
366 	else
367 		ret = vfio_device_group_use_iommu(device);
368 	if (ret)
369 		goto err_module_put;
370 
371 	device->kvm = kvm;
372 	if (device->ops->open_device) {
373 		ret = device->ops->open_device(device);
374 		if (ret)
375 			goto err_unuse_iommu;
376 	}
377 	return 0;
378 
379 err_unuse_iommu:
380 	device->kvm = NULL;
381 	if (iommufd)
382 		vfio_iommufd_unbind(device);
383 	else
384 		vfio_device_group_unuse_iommu(device);
385 err_module_put:
386 	module_put(device->dev->driver->owner);
387 	return ret;
388 }
389 
390 static void vfio_device_last_close(struct vfio_device *device,
391 				   struct iommufd_ctx *iommufd)
392 {
393 	lockdep_assert_held(&device->dev_set->lock);
394 
395 	if (device->ops->close_device)
396 		device->ops->close_device(device);
397 	device->kvm = NULL;
398 	if (iommufd)
399 		vfio_iommufd_unbind(device);
400 	else
401 		vfio_device_group_unuse_iommu(device);
402 	module_put(device->dev->driver->owner);
403 }
404 
405 int vfio_device_open(struct vfio_device *device,
406 		     struct iommufd_ctx *iommufd, struct kvm *kvm)
407 {
408 	int ret = 0;
409 
410 	mutex_lock(&device->dev_set->lock);
411 	device->open_count++;
412 	if (device->open_count == 1) {
413 		ret = vfio_device_first_open(device, iommufd, kvm);
414 		if (ret)
415 			device->open_count--;
416 	}
417 	mutex_unlock(&device->dev_set->lock);
418 
419 	return ret;
420 }
421 
422 void vfio_device_close(struct vfio_device *device,
423 		       struct iommufd_ctx *iommufd)
424 {
425 	mutex_lock(&device->dev_set->lock);
426 	vfio_assert_device_open(device);
427 	if (device->open_count == 1)
428 		vfio_device_last_close(device, iommufd);
429 	device->open_count--;
430 	mutex_unlock(&device->dev_set->lock);
431 }
432 
433 /*
434  * Wrapper around pm_runtime_resume_and_get().
435  * Return error code on failure or 0 on success.
436  */
437 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
438 {
439 	struct device *dev = device->dev;
440 
441 	if (dev->driver && dev->driver->pm) {
442 		int ret;
443 
444 		ret = pm_runtime_resume_and_get(dev);
445 		if (ret) {
446 			dev_info_ratelimited(dev,
447 				"vfio: runtime resume failed %d\n", ret);
448 			return -EIO;
449 		}
450 	}
451 
452 	return 0;
453 }
454 
455 /*
456  * Wrapper around pm_runtime_put().
457  */
458 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
459 {
460 	struct device *dev = device->dev;
461 
462 	if (dev->driver && dev->driver->pm)
463 		pm_runtime_put(dev);
464 }
465 
466 /*
467  * VFIO Device fd
468  */
469 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
470 {
471 	struct vfio_device *device = filep->private_data;
472 
473 	vfio_device_group_close(device);
474 
475 	vfio_device_put_registration(device);
476 
477 	return 0;
478 }
479 
480 /*
481  * vfio_mig_get_next_state - Compute the next step in the FSM
482  * @cur_fsm - The current state the device is in
483  * @new_fsm - The target state to reach
484  * @next_fsm - Pointer to the next step to get to new_fsm
485  *
486  * Return 0 upon success, otherwise -errno
487  * Upon success the next step in the state progression between cur_fsm and
488  * new_fsm will be set in next_fsm.
489  *
490  * This breaks down requests for combination transitions into smaller steps and
491  * returns the next step to get to new_fsm. The function may need to be called
492  * multiple times before reaching new_fsm.
493  *
494  */
495 int vfio_mig_get_next_state(struct vfio_device *device,
496 			    enum vfio_device_mig_state cur_fsm,
497 			    enum vfio_device_mig_state new_fsm,
498 			    enum vfio_device_mig_state *next_fsm)
499 {
500 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
501 	/*
502 	 * The coding in this table requires the driver to implement the
503 	 * following FSM arcs:
504 	 *         RESUMING -> STOP
505 	 *         STOP -> RESUMING
506 	 *         STOP -> STOP_COPY
507 	 *         STOP_COPY -> STOP
508 	 *
509 	 * If P2P is supported then the driver must also implement these FSM
510 	 * arcs:
511 	 *         RUNNING -> RUNNING_P2P
512 	 *         RUNNING_P2P -> RUNNING
513 	 *         RUNNING_P2P -> STOP
514 	 *         STOP -> RUNNING_P2P
515 	 *
516 	 * If precopy is supported then the driver must support these additional
517 	 * FSM arcs:
518 	 *         RUNNING -> PRE_COPY
519 	 *         PRE_COPY -> RUNNING
520 	 *         PRE_COPY -> STOP_COPY
521 	 * However, if precopy and P2P are supported together then the driver
522 	 * must support these additional arcs beyond the P2P arcs above:
523 	 *         PRE_COPY -> RUNNING
524 	 *         PRE_COPY -> PRE_COPY_P2P
525 	 *         PRE_COPY_P2P -> PRE_COPY
526 	 *         PRE_COPY_P2P -> RUNNING_P2P
527 	 *         PRE_COPY_P2P -> STOP_COPY
528 	 *         RUNNING -> PRE_COPY
529 	 *         RUNNING_P2P -> PRE_COPY_P2P
530 	 *
531 	 * Without P2P and precopy the driver must implement:
532 	 *         RUNNING -> STOP
533 	 *         STOP -> RUNNING
534 	 *
535 	 * The coding will step through multiple states for some combination
536 	 * transitions; if all optional features are supported, this means the
537 	 * following ones:
538 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
539 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
540 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
541 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
542 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
543 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
544 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
545 	 *         RESUMING -> STOP -> RUNNING_P2P
546 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
547 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
548 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
549 	 *         RESUMING -> STOP -> STOP_COPY
550 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
551 	 *         RUNNING -> RUNNING_P2P -> STOP
552 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
553 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
554 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
555 	 *         RUNNING_P2P -> STOP -> RESUMING
556 	 *         RUNNING_P2P -> STOP -> STOP_COPY
557 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
558 	 *         STOP -> RUNNING_P2P -> RUNNING
559 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
560 	 *         STOP_COPY -> STOP -> RESUMING
561 	 *         STOP_COPY -> STOP -> RUNNING_P2P
562 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
563 	 *
564 	 *  The following transitions are blocked:
565 	 *         STOP_COPY -> PRE_COPY
566 	 *         STOP_COPY -> PRE_COPY_P2P
567 	 */
568 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
569 		[VFIO_DEVICE_STATE_STOP] = {
570 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
571 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
572 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
573 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
574 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
575 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
576 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
577 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
578 		},
579 		[VFIO_DEVICE_STATE_RUNNING] = {
580 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
581 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
582 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
583 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
584 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
585 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
586 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
587 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
588 		},
589 		[VFIO_DEVICE_STATE_PRE_COPY] = {
590 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
591 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
592 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
593 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
594 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
595 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
596 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
597 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
598 		},
599 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
600 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
601 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
602 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
603 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
604 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
605 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
606 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
607 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
608 		},
609 		[VFIO_DEVICE_STATE_STOP_COPY] = {
610 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
611 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
612 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
613 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
614 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
615 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
616 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
617 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
618 		},
619 		[VFIO_DEVICE_STATE_RESUMING] = {
620 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
621 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
622 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
623 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
624 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
625 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
626 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
627 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
628 		},
629 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
630 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
631 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
632 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
633 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
634 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
635 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
636 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
637 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
638 		},
639 		[VFIO_DEVICE_STATE_ERROR] = {
640 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
641 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
642 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
643 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
644 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
645 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
646 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
647 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
648 		},
649 	};
650 
651 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
652 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
653 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
654 		[VFIO_DEVICE_STATE_PRE_COPY] =
655 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
656 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
657 						   VFIO_MIGRATION_P2P |
658 						   VFIO_MIGRATION_PRE_COPY,
659 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
660 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
661 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
662 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
663 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
664 	};
665 
666 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
667 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
668 			state_flags_table[cur_fsm]))
669 		return -EINVAL;
670 
671 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
672 	   (state_flags_table[new_fsm] & device->migration_flags) !=
673 			state_flags_table[new_fsm])
674 		return -EINVAL;
675 
676 	/*
677 	 * Arcs touching optional and unsupported states are skipped over. The
678 	 * driver will instead see an arc from the original state to the next
679 	 * logical state, as per the above comment.
680 	 */
681 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
682 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
683 			state_flags_table[*next_fsm])
684 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
685 
686 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
687 }
688 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
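/*
 * Illustrative sketch, assumed driver-side loop with a hypothetical
 * my_step_state() helper: a driver's .migration_set_state op typically walks
 * the FSM one reachable arc at a time until the requested state is reached.
 *
 *	while (cur != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		ret = my_step_state(vdev, next);	// perform one FSM arc
 *		if (ret)
 *			return ERR_PTR(ret);
 *		cur = next;
 *	}
 */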
689 
690 /*
 691  * Convert the driver's struct file into a FD number and return it to userspace
692  */
693 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
694 				   struct vfio_device_feature_mig_state *mig)
695 {
696 	int ret;
697 	int fd;
698 
699 	fd = get_unused_fd_flags(O_CLOEXEC);
700 	if (fd < 0) {
701 		ret = fd;
702 		goto out_fput;
703 	}
704 
705 	mig->data_fd = fd;
706 	if (copy_to_user(arg, mig, sizeof(*mig))) {
707 		ret = -EFAULT;
708 		goto out_put_unused;
709 	}
710 	fd_install(fd, filp);
711 	return 0;
712 
713 out_put_unused:
714 	put_unused_fd(fd);
715 out_fput:
716 	fput(filp);
717 	return ret;
718 }
719 
720 static int
721 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
722 					   u32 flags, void __user *arg,
723 					   size_t argsz)
724 {
725 	size_t minsz =
726 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
727 	struct vfio_device_feature_mig_state mig;
728 	struct file *filp = NULL;
729 	int ret;
730 
731 	if (!device->mig_ops)
732 		return -ENOTTY;
733 
734 	ret = vfio_check_feature(flags, argsz,
735 				 VFIO_DEVICE_FEATURE_SET |
736 				 VFIO_DEVICE_FEATURE_GET,
737 				 sizeof(mig));
738 	if (ret != 1)
739 		return ret;
740 
741 	if (copy_from_user(&mig, arg, minsz))
742 		return -EFAULT;
743 
744 	if (flags & VFIO_DEVICE_FEATURE_GET) {
745 		enum vfio_device_mig_state curr_state;
746 
747 		ret = device->mig_ops->migration_get_state(device,
748 							   &curr_state);
749 		if (ret)
750 			return ret;
751 		mig.device_state = curr_state;
752 		goto out_copy;
753 	}
754 
755 	/* Handle the VFIO_DEVICE_FEATURE_SET */
756 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
757 	if (IS_ERR(filp) || !filp)
758 		goto out_copy;
759 
760 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
761 out_copy:
762 	mig.data_fd = -1;
763 	if (copy_to_user(arg, &mig, sizeof(mig)))
764 		return -EFAULT;
765 	if (IS_ERR(filp))
766 		return PTR_ERR(filp);
767 	return 0;
768 }
769 
770 static int
771 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
772 					      u32 flags, void __user *arg,
773 					      size_t argsz)
774 {
775 	struct vfio_device_feature_mig_data_size data_size = {};
776 	unsigned long stop_copy_length;
777 	int ret;
778 
779 	if (!device->mig_ops)
780 		return -ENOTTY;
781 
782 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
783 				 sizeof(data_size));
784 	if (ret != 1)
785 		return ret;
786 
787 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
788 	if (ret)
789 		return ret;
790 
791 	data_size.stop_copy_length = stop_copy_length;
792 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
793 		return -EFAULT;
794 
795 	return 0;
796 }
797 
798 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
799 					       u32 flags, void __user *arg,
800 					       size_t argsz)
801 {
802 	struct vfio_device_feature_migration mig = {
803 		.flags = device->migration_flags,
804 	};
805 	int ret;
806 
807 	if (!device->mig_ops)
808 		return -ENOTTY;
809 
810 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
811 				 sizeof(mig));
812 	if (ret != 1)
813 		return ret;
814 	if (copy_to_user(arg, &mig, sizeof(mig)))
815 		return -EFAULT;
816 	return 0;
817 }
818 
819 /* Ranges should fit into a single kernel page */
820 #define LOG_MAX_RANGES \
821 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
822 
823 static int
824 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
825 					u32 flags, void __user *arg,
826 					size_t argsz)
827 {
828 	size_t minsz =
829 		offsetofend(struct vfio_device_feature_dma_logging_control,
830 			    ranges);
831 	struct vfio_device_feature_dma_logging_range __user *ranges;
832 	struct vfio_device_feature_dma_logging_control control;
833 	struct vfio_device_feature_dma_logging_range range;
834 	struct rb_root_cached root = RB_ROOT_CACHED;
835 	struct interval_tree_node *nodes;
836 	u64 iova_end;
837 	u32 nnodes;
838 	int i, ret;
839 
840 	if (!device->log_ops)
841 		return -ENOTTY;
842 
843 	ret = vfio_check_feature(flags, argsz,
844 				 VFIO_DEVICE_FEATURE_SET,
845 				 sizeof(control));
846 	if (ret != 1)
847 		return ret;
848 
849 	if (copy_from_user(&control, arg, minsz))
850 		return -EFAULT;
851 
852 	nnodes = control.num_ranges;
853 	if (!nnodes)
854 		return -EINVAL;
855 
856 	if (nnodes > LOG_MAX_RANGES)
857 		return -E2BIG;
858 
859 	ranges = u64_to_user_ptr(control.ranges);
860 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
861 			      GFP_KERNEL);
862 	if (!nodes)
863 		return -ENOMEM;
864 
865 	for (i = 0; i < nnodes; i++) {
866 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
867 			ret = -EFAULT;
868 			goto end;
869 		}
870 		if (!IS_ALIGNED(range.iova, control.page_size) ||
871 		    !IS_ALIGNED(range.length, control.page_size)) {
872 			ret = -EINVAL;
873 			goto end;
874 		}
875 
876 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
877 		    iova_end > ULONG_MAX) {
878 			ret = -EOVERFLOW;
879 			goto end;
880 		}
881 
882 		nodes[i].start = range.iova;
883 		nodes[i].last = range.iova + range.length - 1;
884 		if (interval_tree_iter_first(&root, nodes[i].start,
885 					     nodes[i].last)) {
886 			/* Range overlapping */
887 			ret = -EINVAL;
888 			goto end;
889 		}
890 		interval_tree_insert(nodes + i, &root);
891 	}
892 
893 	ret = device->log_ops->log_start(device, &root, nnodes,
894 					 &control.page_size);
895 	if (ret)
896 		goto end;
897 
898 	if (copy_to_user(arg, &control, sizeof(control))) {
899 		ret = -EFAULT;
900 		device->log_ops->log_stop(device);
901 	}
902 
903 end:
904 	kfree(nodes);
905 	return ret;
906 }
907 
908 static int
909 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
910 				       u32 flags, void __user *arg,
911 				       size_t argsz)
912 {
913 	int ret;
914 
915 	if (!device->log_ops)
916 		return -ENOTTY;
917 
918 	ret = vfio_check_feature(flags, argsz,
919 				 VFIO_DEVICE_FEATURE_SET, 0);
920 	if (ret != 1)
921 		return ret;
922 
923 	return device->log_ops->log_stop(device);
924 }
925 
926 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
927 					  unsigned long iova, size_t length,
928 					  void *opaque)
929 {
930 	struct vfio_device *device = opaque;
931 
932 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
933 }
934 
935 static int
936 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
937 					 u32 flags, void __user *arg,
938 					 size_t argsz)
939 {
940 	size_t minsz =
941 		offsetofend(struct vfio_device_feature_dma_logging_report,
942 			    bitmap);
943 	struct vfio_device_feature_dma_logging_report report;
944 	struct iova_bitmap *iter;
945 	u64 iova_end;
946 	int ret;
947 
948 	if (!device->log_ops)
949 		return -ENOTTY;
950 
951 	ret = vfio_check_feature(flags, argsz,
952 				 VFIO_DEVICE_FEATURE_GET,
953 				 sizeof(report));
954 	if (ret != 1)
955 		return ret;
956 
957 	if (copy_from_user(&report, arg, minsz))
958 		return -EFAULT;
959 
960 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
961 		return -EINVAL;
962 
963 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
964 	    iova_end > ULONG_MAX)
965 		return -EOVERFLOW;
966 
967 	iter = iova_bitmap_alloc(report.iova, report.length,
968 				 report.page_size,
969 				 u64_to_user_ptr(report.bitmap));
970 	if (IS_ERR(iter))
971 		return PTR_ERR(iter);
972 
973 	ret = iova_bitmap_for_each(iter, device,
974 				   vfio_device_log_read_and_clear);
975 
976 	iova_bitmap_free(iter);
977 	return ret;
978 }
979 
980 static int vfio_ioctl_device_feature(struct vfio_device *device,
981 				     struct vfio_device_feature __user *arg)
982 {
983 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
984 	struct vfio_device_feature feature;
985 
986 	if (copy_from_user(&feature, arg, minsz))
987 		return -EFAULT;
988 
989 	if (feature.argsz < minsz)
990 		return -EINVAL;
991 
992 	/* Check unknown flags */
993 	if (feature.flags &
994 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
995 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
996 		return -EINVAL;
997 
998 	/* GET & SET are mutually exclusive except with PROBE */
999 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1000 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1001 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1002 		return -EINVAL;
1003 
1004 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1005 	case VFIO_DEVICE_FEATURE_MIGRATION:
1006 		return vfio_ioctl_device_feature_migration(
1007 			device, feature.flags, arg->data,
1008 			feature.argsz - minsz);
1009 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1010 		return vfio_ioctl_device_feature_mig_device_state(
1011 			device, feature.flags, arg->data,
1012 			feature.argsz - minsz);
1013 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1014 		return vfio_ioctl_device_feature_logging_start(
1015 			device, feature.flags, arg->data,
1016 			feature.argsz - minsz);
1017 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1018 		return vfio_ioctl_device_feature_logging_stop(
1019 			device, feature.flags, arg->data,
1020 			feature.argsz - minsz);
1021 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1022 		return vfio_ioctl_device_feature_logging_report(
1023 			device, feature.flags, arg->data,
1024 			feature.argsz - minsz);
1025 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1026 		return vfio_ioctl_device_feature_migration_data_size(
1027 			device, feature.flags, arg->data,
1028 			feature.argsz - minsz);
1029 	default:
1030 		if (unlikely(!device->ops->device_feature))
1031 			return -EINVAL;
1032 		return device->ops->device_feature(device, feature.flags,
1033 						   arg->data,
1034 						   feature.argsz - minsz);
1035 	}
1036 }
1037 
1038 static long vfio_device_fops_unl_ioctl(struct file *filep,
1039 				       unsigned int cmd, unsigned long arg)
1040 {
1041 	struct vfio_device *device = filep->private_data;
1042 	int ret;
1043 
1044 	ret = vfio_device_pm_runtime_get(device);
1045 	if (ret)
1046 		return ret;
1047 
1048 	switch (cmd) {
1049 	case VFIO_DEVICE_FEATURE:
1050 		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1051 		break;
1052 
1053 	default:
1054 		if (unlikely(!device->ops->ioctl))
1055 			ret = -EINVAL;
1056 		else
1057 			ret = device->ops->ioctl(device, cmd, arg);
1058 		break;
1059 	}
1060 
1061 	vfio_device_pm_runtime_put(device);
1062 	return ret;
1063 }
1064 
1065 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1066 				     size_t count, loff_t *ppos)
1067 {
1068 	struct vfio_device *device = filep->private_data;
1069 
1070 	if (unlikely(!device->ops->read))
1071 		return -EINVAL;
1072 
1073 	return device->ops->read(device, buf, count, ppos);
1074 }
1075 
1076 static ssize_t vfio_device_fops_write(struct file *filep,
1077 				      const char __user *buf,
1078 				      size_t count, loff_t *ppos)
1079 {
1080 	struct vfio_device *device = filep->private_data;
1081 
1082 	if (unlikely(!device->ops->write))
1083 		return -EINVAL;
1084 
1085 	return device->ops->write(device, buf, count, ppos);
1086 }
1087 
1088 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1089 {
1090 	struct vfio_device *device = filep->private_data;
1091 
1092 	if (unlikely(!device->ops->mmap))
1093 		return -EINVAL;
1094 
1095 	return device->ops->mmap(device, vma);
1096 }
1097 
1098 const struct file_operations vfio_device_fops = {
1099 	.owner		= THIS_MODULE,
1100 	.release	= vfio_device_fops_release,
1101 	.read		= vfio_device_fops_read,
1102 	.write		= vfio_device_fops_write,
1103 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1104 	.compat_ioctl	= compat_ptr_ioctl,
1105 	.mmap		= vfio_device_fops_mmap,
1106 };
1107 
1108 /*
1109  * Sub-module support
1110  */
1111 /*
1112  * Helper for managing a buffer of info chain capabilities, allocate or
1113  * reallocate a buffer with additional @size, filling in @id and @version
1114  * of the capability.  A pointer to the new capability is returned.
1115  *
1116  * NB. The chain is based at the head of the buffer, so new entries are
 1117  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1118  * next offsets prior to copying to the user buffer.
1119  */
1120 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1121 					       size_t size, u16 id, u16 version)
1122 {
1123 	void *buf;
1124 	struct vfio_info_cap_header *header, *tmp;
1125 
1126 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1127 	if (!buf) {
1128 		kfree(caps->buf);
1129 		caps->buf = NULL;
1130 		caps->size = 0;
1131 		return ERR_PTR(-ENOMEM);
1132 	}
1133 
1134 	caps->buf = buf;
1135 	header = buf + caps->size;
1136 
1137 	/* Eventually copied to user buffer, zero */
1138 	memset(header, 0, size);
1139 
1140 	header->id = id;
1141 	header->version = version;
1142 
1143 	/* Add to the end of the capability chain */
1144 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1145 		; /* nothing */
1146 
1147 	tmp->next = caps->size;
1148 	caps->size += size;
1149 
1150 	return header;
1151 }
1152 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1153 
1154 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1155 {
1156 	struct vfio_info_cap_header *tmp;
1157 	void *buf = (void *)caps->buf;
1158 
1159 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1160 		tmp->next += offset;
1161 }
1162 EXPORT_SYMBOL(vfio_info_cap_shift);
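/*
 * Illustrative sketch, assumed ioctl-side usage with hypothetical "info",
 * "id" and "payload_size": build a capability chain in a local
 * struct vfio_info_cap, then shift the chain offsets by the size of the
 * fixed info struct before copying it out right after that struct.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *hdr;
 *
 *	hdr = vfio_info_cap_add(&caps, sizeof(*hdr) + payload_size, id, 1);
 *	if (IS_ERR(hdr))
 *		return PTR_ERR(hdr);
 *	// fill in the payload that follows *hdr ...
 *
 *	info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	if (copy_to_user((void __user *)arg + sizeof(info), caps.buf, caps.size))
 *		ret = -EFAULT;
 *	kfree(caps.buf);
 */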
1163 
1164 int vfio_info_add_capability(struct vfio_info_cap *caps,
1165 			     struct vfio_info_cap_header *cap, size_t size)
1166 {
1167 	struct vfio_info_cap_header *header;
1168 
1169 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1170 	if (IS_ERR(header))
1171 		return PTR_ERR(header);
1172 
1173 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1174 
1175 	return 0;
1176 }
1177 EXPORT_SYMBOL(vfio_info_add_capability);
1178 
1179 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1180 				       int max_irq_type, size_t *data_size)
1181 {
1182 	unsigned long minsz;
1183 	size_t size;
1184 
1185 	minsz = offsetofend(struct vfio_irq_set, count);
1186 
1187 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1188 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1189 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1190 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1191 		return -EINVAL;
1192 
1193 	if (data_size)
1194 		*data_size = 0;
1195 
1196 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1197 		return -EINVAL;
1198 
1199 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1200 	case VFIO_IRQ_SET_DATA_NONE:
1201 		size = 0;
1202 		break;
1203 	case VFIO_IRQ_SET_DATA_BOOL:
1204 		size = sizeof(uint8_t);
1205 		break;
1206 	case VFIO_IRQ_SET_DATA_EVENTFD:
1207 		size = sizeof(int32_t);
1208 		break;
1209 	default:
1210 		return -EINVAL;
1211 	}
1212 
1213 	if (size) {
1214 		if (hdr->argsz - minsz < hdr->count * size)
1215 			return -EINVAL;
1216 
1217 		if (!data_size)
1218 			return -EINVAL;
1219 
1220 		*data_size = hdr->count * size;
1221 	}
1222 
1223 	return 0;
1224 }
1225 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1226 
1227 /*
1228  * Pin contiguous user pages and return their associated host pages for local
1229  * domain only.
1230  * @device [in]  : device
1231  * @iova [in]    : starting IOVA of user pages to be pinned.
1232  * @npage [in]   : count of pages to be pinned.  This count should not
1233  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1234  * @prot [in]    : protection flags
1235  * @pages[out]   : array of host pages
1236  * Return error or number of pages pinned.
1237  *
1238  * A driver may only call this function if the vfio_device was created
1239  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1240  */
1241 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1242 		   int npage, int prot, struct page **pages)
1243 {
1244 	/* group->container cannot change while a vfio device is open */
1245 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1246 		return -EINVAL;
1247 	if (vfio_device_has_container(device))
1248 		return vfio_device_container_pin_pages(device, iova,
1249 						       npage, prot, pages);
1250 	if (device->iommufd_access) {
1251 		int ret;
1252 
1253 		if (iova > ULONG_MAX)
1254 			return -EINVAL;
1255 		/*
1256 		 * VFIO ignores the sub page offset, npages is from the start of
1257 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1258 		 * the sub page offset by doing:
1259 		 *     pages[0] + (iova % PAGE_SIZE)
1260 		 */
1261 		ret = iommufd_access_pin_pages(
1262 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1263 			npage * PAGE_SIZE, pages,
1264 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1265 		if (ret)
1266 			return ret;
1267 		return npage;
1268 	}
1269 	return -EINVAL;
1270 }
1271 EXPORT_SYMBOL(vfio_pin_pages);
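/*
 * Illustrative sketch, assumed mediated-driver usage with hypothetical
 * names: pin the page backing a guest IOVA, map it, and recover the
 * sub-page offset as described in the comment above.
 *
 *	struct page *page;
 *	void *kva;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&my->vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	kva = kmap_local_page(page);
 *	// data starts at kva + (iova % PAGE_SIZE)
 *	...
 *	kunmap_local(kva);
 *	vfio_unpin_pages(&my->vdev, iova, 1);
 */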
1272 
1273 /*
1274  * Unpin contiguous host pages for local domain only.
1275  * @device [in]  : device
1276  * @iova [in]    : starting address of user pages to be unpinned.
1277  * @npage [in]   : count of pages to be unpinned.  This count should not
1278  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1279  */
1280 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1281 {
1282 	if (WARN_ON(!vfio_assert_device_open(device)))
1283 		return;
1284 
1285 	if (vfio_device_has_container(device)) {
1286 		vfio_device_container_unpin_pages(device, iova, npage);
1287 		return;
1288 	}
1289 	if (device->iommufd_access) {
1290 		if (WARN_ON(iova > ULONG_MAX))
1291 			return;
1292 		iommufd_access_unpin_pages(device->iommufd_access,
1293 					   ALIGN_DOWN(iova, PAGE_SIZE),
1294 					   npage * PAGE_SIZE);
1295 		return;
1296 	}
1297 }
1298 EXPORT_SYMBOL(vfio_unpin_pages);
1299 
1300 /*
 1301  * This interface allows the CPUs to perform virtual DMA on behalf of
 1302  * the device.
 1303  *
 1304  * The CPUs read from or write to a range of IOVAs mapping user space
 1305  * memory, using a kernel buffer as the source or destination.
1306  *
1307  * As the read/write of user space memory is conducted via the CPUs and is
1308  * not a real device DMA, it is not necessary to pin the user space memory.
1309  *
1310  * @device [in]		: VFIO device
1311  * @iova [in]		: base IOVA of a user space buffer
1312  * @data [in]		: pointer to kernel buffer
1313  * @len [in]		: kernel buffer length
1314  * @write		: indicate read or write
1315  * Return error code on failure or 0 on success.
1316  */
1317 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1318 		size_t len, bool write)
1319 {
1320 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1321 		return -EINVAL;
1322 
1323 	if (vfio_device_has_container(device))
1324 		return vfio_device_container_dma_rw(device, iova,
1325 						    data, len, write);
1326 
1327 	if (device->iommufd_access) {
1328 		unsigned int flags = 0;
1329 
1330 		if (iova > ULONG_MAX)
1331 			return -EINVAL;
1332 
1333 		/* VFIO historically tries to auto-detect a kthread */
1334 		if (!current->mm)
1335 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1336 		if (write)
1337 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1338 		return iommufd_access_rw(device->iommufd_access, iova, data,
1339 					 len, flags);
1340 	}
1341 	return -EINVAL;
1342 }
1343 EXPORT_SYMBOL(vfio_dma_rw);
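/*
 * Illustrative sketch, assumed usage with a hypothetical descriptor: copy a
 * small structure from guest IOVA space into a kernel buffer without
 * pinning the backing pages.
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(&my->vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */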
1344 
1345 /*
1346  * Module/class support
1347  */
1348 static int __init vfio_init(void)
1349 {
1350 	int ret;
1351 
1352 	ida_init(&vfio.device_ida);
1353 
1354 	ret = vfio_group_init();
1355 	if (ret)
1356 		return ret;
1357 
1358 	ret = vfio_virqfd_init();
1359 	if (ret)
1360 		goto err_virqfd;
1361 
1362 	/* /sys/class/vfio-dev/vfioX */
1363 	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1364 	if (IS_ERR(vfio.device_class)) {
1365 		ret = PTR_ERR(vfio.device_class);
1366 		goto err_dev_class;
1367 	}
1368 
1369 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1370 	return 0;
1371 
1372 err_dev_class:
1373 	vfio_virqfd_exit();
1374 err_virqfd:
1375 	vfio_group_cleanup();
1376 	return ret;
1377 }
1378 
1379 static void __exit vfio_cleanup(void)
1380 {
1381 	ida_destroy(&vfio.device_ida);
1382 	class_destroy(vfio.device_class);
1383 	vfio.device_class = NULL;
1384 	vfio_virqfd_exit();
1385 	vfio_group_cleanup();
1386 	xa_destroy(&vfio_device_set_xa);
1387 }
1388 
1389 module_init(vfio_init);
1390 module_exit(vfio_cleanup);
1391 
1392 MODULE_VERSION(DRIVER_VERSION);
1393 MODULE_LICENSE("GPL v2");
1394 MODULE_AUTHOR(DRIVER_AUTHOR);
1395 MODULE_DESCRIPTION(DRIVER_DESC);
1396 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1397