xref: /openbmc/linux/drivers/vfio/vfio_main.c (revision b290a05f)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41 
42 #define DRIVER_VERSION	"0.3"
43 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC	"VFIO - User Level meta-driver"
45 
46 static struct vfio {
47 	struct class			*device_class;
48 	struct ida			device_ida;
49 } vfio;
50 
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54 		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
56 #endif
57 
58 static DEFINE_XARRAY(vfio_device_set_xa);
59 
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62 	unsigned long idx = (unsigned long)set_id;
63 	struct vfio_device_set *new_dev_set;
64 	struct vfio_device_set *dev_set;
65 
66 	if (WARN_ON(!set_id))
67 		return -EINVAL;
68 
69 	/*
70 	 * Atomically acquire a singleton object in the xarray for this set_id
71 	 */
72 	xa_lock(&vfio_device_set_xa);
73 	dev_set = xa_load(&vfio_device_set_xa, idx);
74 	if (dev_set)
75 		goto found_get_ref;
76 	xa_unlock(&vfio_device_set_xa);
77 
78 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79 	if (!new_dev_set)
80 		return -ENOMEM;
81 	mutex_init(&new_dev_set->lock);
82 	INIT_LIST_HEAD(&new_dev_set->device_list);
83 	new_dev_set->set_id = set_id;
84 
85 	xa_lock(&vfio_device_set_xa);
86 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87 			       GFP_KERNEL);
88 	if (!dev_set) {
89 		dev_set = new_dev_set;
90 		goto found_get_ref;
91 	}
92 
93 	kfree(new_dev_set);
94 	if (xa_is_err(dev_set)) {
95 		xa_unlock(&vfio_device_set_xa);
96 		return xa_err(dev_set);
97 	}
98 
99 found_get_ref:
100 	dev_set->device_count++;
101 	xa_unlock(&vfio_device_set_xa);
102 	mutex_lock(&dev_set->lock);
103 	device->dev_set = dev_set;
104 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
105 	mutex_unlock(&dev_set->lock);
106 	return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
109 
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112 	struct vfio_device_set *dev_set = device->dev_set;
113 
114 	if (!dev_set)
115 		return;
116 
117 	mutex_lock(&dev_set->lock);
118 	list_del(&device->dev_set_list);
119 	mutex_unlock(&dev_set->lock);
120 
121 	xa_lock(&vfio_device_set_xa);
122 	if (!--dev_set->device_count) {
123 		__xa_erase(&vfio_device_set_xa,
124 			   (unsigned long)dev_set->set_id);
125 		mutex_destroy(&dev_set->lock);
126 		kfree(dev_set);
127 	}
128 	xa_unlock(&vfio_device_set_xa);
129 }
130 
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133 	struct vfio_device *cur;
134 	unsigned int open_count = 0;
135 
136 	lockdep_assert_held(&dev_set->lock);
137 
138 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139 		open_count += cur->open_count;
140 	return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
143 
144 struct vfio_device *
145 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
146 			   struct device *dev)
147 {
148 	struct vfio_device *cur;
149 
150 	lockdep_assert_held(&dev_set->lock);
151 
152 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
153 		if (cur->dev == dev)
154 			return cur;
155 	return NULL;
156 }
157 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
158 
159 /*
160  * Device objects - create, release, get, put, search
161  */
162 /* Device reference always implies a group reference */
163 void vfio_device_put_registration(struct vfio_device *device)
164 {
165 	if (refcount_dec_and_test(&device->refcount))
166 		complete(&device->comp);
167 }
168 
169 bool vfio_device_try_get_registration(struct vfio_device *device)
170 {
171 	return refcount_inc_not_zero(&device->refcount);
172 }
173 
174 /*
175  * VFIO driver API
176  */
177 /* Release helper called by vfio_put_device() */
178 static void vfio_device_release(struct device *dev)
179 {
180 	struct vfio_device *device =
181 			container_of(dev, struct vfio_device, device);
182 
183 	vfio_release_device_set(device);
184 	ida_free(&vfio.device_ida, device->index);
185 
186 	if (device->ops->release)
187 		device->ops->release(device);
188 
189 	kvfree(device);
190 }
191 
192 static int vfio_init_device(struct vfio_device *device, struct device *dev,
193 			    const struct vfio_device_ops *ops);
194 
195 /*
196  * Allocate and initialize vfio_device so it can be registered to vfio
197  * core.
198  *
199  * Drivers should use the wrapper vfio_alloc_device() for allocation.
200  * @size is the size of the structure to be allocated, including any
201  * private data used by the driver.
202  *
203  * Driver may provide an @init callback to cover device private data.
204  *
205  * Use vfio_put_device() to release the structure after success return.
206  */
207 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
208 				       const struct vfio_device_ops *ops)
209 {
210 	struct vfio_device *device;
211 	int ret;
212 
213 	if (WARN_ON(size < sizeof(struct vfio_device)))
214 		return ERR_PTR(-EINVAL);
215 
216 	device = kvzalloc(size, GFP_KERNEL);
217 	if (!device)
218 		return ERR_PTR(-ENOMEM);
219 
220 	ret = vfio_init_device(device, dev, ops);
221 	if (ret)
222 		goto out_free;
223 	return device;
224 
225 out_free:
226 	kvfree(device);
227 	return ERR_PTR(ret);
228 }
229 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
230 
231 /*
232  * Initialize a vfio_device so it can be registered to vfio core.
233  */
234 static int vfio_init_device(struct vfio_device *device, struct device *dev,
235 			    const struct vfio_device_ops *ops)
236 {
237 	int ret;
238 
239 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
240 	if (ret < 0) {
241 		dev_dbg(dev, "Error to alloc index\n");
242 		return ret;
243 	}
244 
245 	device->index = ret;
246 	init_completion(&device->comp);
247 	device->dev = dev;
248 	device->ops = ops;
249 
250 	if (ops->init) {
251 		ret = ops->init(device);
252 		if (ret)
253 			goto out_uninit;
254 	}
255 
256 	device_initialize(&device->device);
257 	device->device.release = vfio_device_release;
258 	device->device.class = vfio.device_class;
259 	device->device.parent = device->dev;
260 	return 0;
261 
262 out_uninit:
263 	vfio_release_device_set(device);
264 	ida_free(&vfio.device_ida, device->index);
265 	return ret;
266 }
267 
268 static int __vfio_register_dev(struct vfio_device *device,
269 			       enum vfio_group_type type)
270 {
271 	int ret;
272 
273 	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
274 		    (!device->ops->bind_iommufd ||
275 		     !device->ops->unbind_iommufd ||
276 		     !device->ops->attach_ioas ||
277 		     !device->ops->detach_ioas)))
278 		return -EINVAL;
279 
280 	/*
281 	 * If the driver doesn't specify a set then the device is added to a
282 	 * singleton set just for itself.
283 	 */
284 	if (!device->dev_set)
285 		vfio_assign_device_set(device, device);
286 
287 	ret = dev_set_name(&device->device, "vfio%d", device->index);
288 	if (ret)
289 		return ret;
290 
291 	ret = vfio_device_set_group(device, type);
292 	if (ret)
293 		return ret;
294 
295 	ret = vfio_device_add(device);
296 	if (ret)
297 		goto err_out;
298 
299 	/* Refcounting can't start until the driver calls register */
300 	refcount_set(&device->refcount, 1);
301 
302 	vfio_device_group_register(device);
303 
304 	return 0;
305 err_out:
306 	vfio_device_remove_group(device);
307 	return ret;
308 }
309 
310 int vfio_register_group_dev(struct vfio_device *device)
311 {
312 	return __vfio_register_dev(device, VFIO_IOMMU);
313 }
314 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
315 
316 /*
317  * Register a virtual device without IOMMU backing.  The user of this
318  * device must not be able to directly trigger unmediated DMA.
319  */
320 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
321 {
322 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
323 }
324 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
325 
326 /*
327  * Decrement the device reference count and wait for the device to be
328  * removed.  Open file descriptors for the device... */
329 void vfio_unregister_group_dev(struct vfio_device *device)
330 {
331 	unsigned int i = 0;
332 	bool interrupted = false;
333 	long rc;
334 
335 	/*
336 	 * Prevent new device opened by userspace via the
337 	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
338 	 */
339 	vfio_device_group_unregister(device);
340 
341 	/*
342 	 * Balances vfio_device_add() in register path, also prevents
343 	 * new device opened by userspace in the cdev path.
344 	 */
345 	vfio_device_del(device);
346 
347 	vfio_device_put_registration(device);
348 	rc = try_wait_for_completion(&device->comp);
349 	while (rc <= 0) {
350 		if (device->ops->request)
351 			device->ops->request(device, i++);
352 
353 		if (interrupted) {
354 			rc = wait_for_completion_timeout(&device->comp,
355 							 HZ * 10);
356 		} else {
357 			rc = wait_for_completion_interruptible_timeout(
358 				&device->comp, HZ * 10);
359 			if (rc < 0) {
360 				interrupted = true;
361 				dev_warn(device->dev,
362 					 "Device is currently in use, task"
363 					 " \"%s\" (%d) "
364 					 "blocked until device is released",
365 					 current->comm, task_pid_nr(current));
366 			}
367 		}
368 	}
369 
370 	/* Balances vfio_device_set_group in register path */
371 	vfio_device_remove_group(device);
372 }
373 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
374 
375 #ifdef CONFIG_HAVE_KVM
376 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
377 {
378 	void (*pfn)(struct kvm *kvm);
379 	bool (*fn)(struct kvm *kvm);
380 	bool ret;
381 
382 	lockdep_assert_held(&device->dev_set->lock);
383 
384 	if (!kvm)
385 		return;
386 
387 	pfn = symbol_get(kvm_put_kvm);
388 	if (WARN_ON(!pfn))
389 		return;
390 
391 	fn = symbol_get(kvm_get_kvm_safe);
392 	if (WARN_ON(!fn)) {
393 		symbol_put(kvm_put_kvm);
394 		return;
395 	}
396 
397 	ret = fn(kvm);
398 	symbol_put(kvm_get_kvm_safe);
399 	if (!ret) {
400 		symbol_put(kvm_put_kvm);
401 		return;
402 	}
403 
404 	device->put_kvm = pfn;
405 	device->kvm = kvm;
406 }
407 
408 void vfio_device_put_kvm(struct vfio_device *device)
409 {
410 	lockdep_assert_held(&device->dev_set->lock);
411 
412 	if (!device->kvm)
413 		return;
414 
415 	if (WARN_ON(!device->put_kvm))
416 		goto clear;
417 
418 	device->put_kvm(device->kvm);
419 	device->put_kvm = NULL;
420 	symbol_put(kvm_put_kvm);
421 
422 clear:
423 	device->kvm = NULL;
424 }
425 #endif
426 
427 /* true if the vfio_device has open_device() called but not close_device() */
428 static bool vfio_assert_device_open(struct vfio_device *device)
429 {
430 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
431 }
432 
433 struct vfio_device_file *
434 vfio_allocate_device_file(struct vfio_device *device)
435 {
436 	struct vfio_device_file *df;
437 
438 	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
439 	if (!df)
440 		return ERR_PTR(-ENOMEM);
441 
442 	df->device = device;
443 	spin_lock_init(&df->kvm_ref_lock);
444 
445 	return df;
446 }
447 
448 static int vfio_df_device_first_open(struct vfio_device_file *df)
449 {
450 	struct vfio_device *device = df->device;
451 	struct iommufd_ctx *iommufd = df->iommufd;
452 	int ret;
453 
454 	lockdep_assert_held(&device->dev_set->lock);
455 
456 	if (!try_module_get(device->dev->driver->owner))
457 		return -ENODEV;
458 
459 	if (iommufd)
460 		ret = vfio_df_iommufd_bind(df);
461 	else
462 		ret = vfio_device_group_use_iommu(device);
463 	if (ret)
464 		goto err_module_put;
465 
466 	if (device->ops->open_device) {
467 		ret = device->ops->open_device(device);
468 		if (ret)
469 			goto err_unuse_iommu;
470 	}
471 	return 0;
472 
473 err_unuse_iommu:
474 	if (iommufd)
475 		vfio_df_iommufd_unbind(df);
476 	else
477 		vfio_device_group_unuse_iommu(device);
478 err_module_put:
479 	module_put(device->dev->driver->owner);
480 	return ret;
481 }
482 
483 static void vfio_df_device_last_close(struct vfio_device_file *df)
484 {
485 	struct vfio_device *device = df->device;
486 	struct iommufd_ctx *iommufd = df->iommufd;
487 
488 	lockdep_assert_held(&device->dev_set->lock);
489 
490 	if (device->ops->close_device)
491 		device->ops->close_device(device);
492 	if (iommufd)
493 		vfio_df_iommufd_unbind(df);
494 	else
495 		vfio_device_group_unuse_iommu(device);
496 	module_put(device->dev->driver->owner);
497 }
498 
499 int vfio_df_open(struct vfio_device_file *df)
500 {
501 	struct vfio_device *device = df->device;
502 	int ret = 0;
503 
504 	lockdep_assert_held(&device->dev_set->lock);
505 
506 	/*
507 	 * Only the group path allows the device to be opened multiple
508 	 * times.  The device cdev path doesn't have a secure way for it.
509 	 */
510 	if (device->open_count != 0 && !df->group)
511 		return -EINVAL;
512 
513 	device->open_count++;
514 	if (device->open_count == 1) {
515 		ret = vfio_df_device_first_open(df);
516 		if (ret)
517 			device->open_count--;
518 	}
519 
520 	return ret;
521 }
522 
523 void vfio_df_close(struct vfio_device_file *df)
524 {
525 	struct vfio_device *device = df->device;
526 
527 	lockdep_assert_held(&device->dev_set->lock);
528 
529 	vfio_assert_device_open(device);
530 	if (device->open_count == 1)
531 		vfio_df_device_last_close(df);
532 	device->open_count--;
533 }
534 
535 /*
536  * Wrapper around pm_runtime_resume_and_get().
537  * Return error code on failure or 0 on success.
538  */
539 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
540 {
541 	struct device *dev = device->dev;
542 
543 	if (dev->driver && dev->driver->pm) {
544 		int ret;
545 
546 		ret = pm_runtime_resume_and_get(dev);
547 		if (ret) {
548 			dev_info_ratelimited(dev,
549 				"vfio: runtime resume failed %d\n", ret);
550 			return -EIO;
551 		}
552 	}
553 
554 	return 0;
555 }
556 
557 /*
558  * Wrapper around pm_runtime_put().
559  */
560 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
561 {
562 	struct device *dev = device->dev;
563 
564 	if (dev->driver && dev->driver->pm)
565 		pm_runtime_put(dev);
566 }
567 
568 /*
569  * VFIO Device fd
570  */
571 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
572 {
573 	struct vfio_device_file *df = filep->private_data;
574 	struct vfio_device *device = df->device;
575 
576 	if (df->group)
577 		vfio_df_group_close(df);
578 	else
579 		vfio_df_unbind_iommufd(df);
580 
581 	vfio_device_put_registration(device);
582 
583 	kfree(df);
584 
585 	return 0;
586 }
587 
588 /*
589  * vfio_mig_get_next_state - Compute the next step in the FSM
590  * @cur_fsm - The current state the device is in
591  * @new_fsm - The target state to reach
592  * @next_fsm - Pointer to the next step to get to new_fsm
593  *
594  * Return 0 upon success, otherwise -errno
595  * Upon success the next step in the state progression between cur_fsm and
596  * new_fsm will be set in next_fsm.
597  *
598  * This breaks down requests for combination transitions into smaller steps and
599  * returns the next step to get to new_fsm. The function may need to be called
600  * multiple times before reaching new_fsm.
601  *
602  */
603 int vfio_mig_get_next_state(struct vfio_device *device,
604 			    enum vfio_device_mig_state cur_fsm,
605 			    enum vfio_device_mig_state new_fsm,
606 			    enum vfio_device_mig_state *next_fsm)
607 {
608 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
609 	/*
610 	 * The coding in this table requires the driver to implement the
611 	 * following FSM arcs:
612 	 *         RESUMING -> STOP
613 	 *         STOP -> RESUMING
614 	 *         STOP -> STOP_COPY
615 	 *         STOP_COPY -> STOP
616 	 *
617 	 * If P2P is supported then the driver must also implement these FSM
618 	 * arcs:
619 	 *         RUNNING -> RUNNING_P2P
620 	 *         RUNNING_P2P -> RUNNING
621 	 *         RUNNING_P2P -> STOP
622 	 *         STOP -> RUNNING_P2P
623 	 *
624 	 * If precopy is supported then the driver must support these additional
625 	 * FSM arcs:
626 	 *         RUNNING -> PRE_COPY
627 	 *         PRE_COPY -> RUNNING
628 	 *         PRE_COPY -> STOP_COPY
629 	 * However, if precopy and P2P are supported together then the driver
630 	 * must support these additional arcs beyond the P2P arcs above:
631 	 *         PRE_COPY -> RUNNING
632 	 *         PRE_COPY -> PRE_COPY_P2P
633 	 *         PRE_COPY_P2P -> PRE_COPY
634 	 *         PRE_COPY_P2P -> RUNNING_P2P
635 	 *         PRE_COPY_P2P -> STOP_COPY
636 	 *         RUNNING -> PRE_COPY
637 	 *         RUNNING_P2P -> PRE_COPY_P2P
638 	 *
639 	 * Without P2P and precopy the driver must implement:
640 	 *         RUNNING -> STOP
641 	 *         STOP -> RUNNING
642 	 *
643 	 * The coding will step through multiple states for some combination
644 	 * transitions; if all optional features are supported, this means the
645 	 * following ones:
646 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
647 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
648 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
649 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
650 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
651 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
652 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
653 	 *         RESUMING -> STOP -> RUNNING_P2P
654 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
655 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
656 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
657 	 *         RESUMING -> STOP -> STOP_COPY
658 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
659 	 *         RUNNING -> RUNNING_P2P -> STOP
660 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
661 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
662 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
663 	 *         RUNNING_P2P -> STOP -> RESUMING
664 	 *         RUNNING_P2P -> STOP -> STOP_COPY
665 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
666 	 *         STOP -> RUNNING_P2P -> RUNNING
667 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
668 	 *         STOP_COPY -> STOP -> RESUMING
669 	 *         STOP_COPY -> STOP -> RUNNING_P2P
670 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
671 	 *
672 	 *  The following transitions are blocked:
673 	 *         STOP_COPY -> PRE_COPY
674 	 *         STOP_COPY -> PRE_COPY_P2P
675 	 */
676 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
677 		[VFIO_DEVICE_STATE_STOP] = {
678 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
679 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
680 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
681 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
682 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
683 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
684 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
685 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
686 		},
687 		[VFIO_DEVICE_STATE_RUNNING] = {
688 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
689 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
690 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
691 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
692 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
693 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
694 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
695 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
696 		},
697 		[VFIO_DEVICE_STATE_PRE_COPY] = {
698 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
699 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
700 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
701 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
702 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
703 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
704 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
705 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
706 		},
707 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
708 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
709 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
710 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
711 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
712 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
713 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
714 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
715 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
716 		},
717 		[VFIO_DEVICE_STATE_STOP_COPY] = {
718 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
719 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
720 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
721 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
722 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
723 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
724 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
725 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
726 		},
727 		[VFIO_DEVICE_STATE_RESUMING] = {
728 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
729 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
730 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
731 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
732 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
733 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
734 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
735 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
736 		},
737 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
738 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
739 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
740 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
741 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
742 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
743 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
744 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
745 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
746 		},
747 		[VFIO_DEVICE_STATE_ERROR] = {
748 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
749 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
750 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
751 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
752 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
753 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
754 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
755 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
756 		},
757 	};
758 
759 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
760 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
761 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
762 		[VFIO_DEVICE_STATE_PRE_COPY] =
763 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
764 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
765 						   VFIO_MIGRATION_P2P |
766 						   VFIO_MIGRATION_PRE_COPY,
767 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
768 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
769 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
770 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
771 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
772 	};
773 
774 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
775 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
776 			state_flags_table[cur_fsm]))
777 		return -EINVAL;
778 
779 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
780 	   (state_flags_table[new_fsm] & device->migration_flags) !=
781 			state_flags_table[new_fsm])
782 		return -EINVAL;
783 
784 	/*
785 	 * Arcs touching optional and unsupported states are skipped over. The
786 	 * driver will instead see an arc from the original state to the next
787 	 * logical state, as per the above comment.
788 	 */
789 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
790 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
791 			state_flags_table[*next_fsm])
792 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
793 
794 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
795 }
796 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
797 
798 /*
799  * Convert the drivers's struct file into a FD number and return it to userspace
800  */
801 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
802 				   struct vfio_device_feature_mig_state *mig)
803 {
804 	int ret;
805 	int fd;
806 
807 	fd = get_unused_fd_flags(O_CLOEXEC);
808 	if (fd < 0) {
809 		ret = fd;
810 		goto out_fput;
811 	}
812 
813 	mig->data_fd = fd;
814 	if (copy_to_user(arg, mig, sizeof(*mig))) {
815 		ret = -EFAULT;
816 		goto out_put_unused;
817 	}
818 	fd_install(fd, filp);
819 	return 0;
820 
821 out_put_unused:
822 	put_unused_fd(fd);
823 out_fput:
824 	fput(filp);
825 	return ret;
826 }
827 
828 static int
829 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
830 					   u32 flags, void __user *arg,
831 					   size_t argsz)
832 {
833 	size_t minsz =
834 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
835 	struct vfio_device_feature_mig_state mig;
836 	struct file *filp = NULL;
837 	int ret;
838 
839 	if (!device->mig_ops)
840 		return -ENOTTY;
841 
842 	ret = vfio_check_feature(flags, argsz,
843 				 VFIO_DEVICE_FEATURE_SET |
844 				 VFIO_DEVICE_FEATURE_GET,
845 				 sizeof(mig));
846 	if (ret != 1)
847 		return ret;
848 
849 	if (copy_from_user(&mig, arg, minsz))
850 		return -EFAULT;
851 
852 	if (flags & VFIO_DEVICE_FEATURE_GET) {
853 		enum vfio_device_mig_state curr_state;
854 
855 		ret = device->mig_ops->migration_get_state(device,
856 							   &curr_state);
857 		if (ret)
858 			return ret;
859 		mig.device_state = curr_state;
860 		goto out_copy;
861 	}
862 
863 	/* Handle the VFIO_DEVICE_FEATURE_SET */
864 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
865 	if (IS_ERR(filp) || !filp)
866 		goto out_copy;
867 
868 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
869 out_copy:
870 	mig.data_fd = -1;
871 	if (copy_to_user(arg, &mig, sizeof(mig)))
872 		return -EFAULT;
873 	if (IS_ERR(filp))
874 		return PTR_ERR(filp);
875 	return 0;
876 }
877 
878 static int
879 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
880 					      u32 flags, void __user *arg,
881 					      size_t argsz)
882 {
883 	struct vfio_device_feature_mig_data_size data_size = {};
884 	unsigned long stop_copy_length;
885 	int ret;
886 
887 	if (!device->mig_ops)
888 		return -ENOTTY;
889 
890 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
891 				 sizeof(data_size));
892 	if (ret != 1)
893 		return ret;
894 
895 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
896 	if (ret)
897 		return ret;
898 
899 	data_size.stop_copy_length = stop_copy_length;
900 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
901 		return -EFAULT;
902 
903 	return 0;
904 }
905 
906 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
907 					       u32 flags, void __user *arg,
908 					       size_t argsz)
909 {
910 	struct vfio_device_feature_migration mig = {
911 		.flags = device->migration_flags,
912 	};
913 	int ret;
914 
915 	if (!device->mig_ops)
916 		return -ENOTTY;
917 
918 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
919 				 sizeof(mig));
920 	if (ret != 1)
921 		return ret;
922 	if (copy_to_user(arg, &mig, sizeof(mig)))
923 		return -EFAULT;
924 	return 0;
925 }
926 
927 /* Ranges should fit into a single kernel page */
928 #define LOG_MAX_RANGES \
929 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
930 
931 static int
932 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
933 					u32 flags, void __user *arg,
934 					size_t argsz)
935 {
936 	size_t minsz =
937 		offsetofend(struct vfio_device_feature_dma_logging_control,
938 			    ranges);
939 	struct vfio_device_feature_dma_logging_range __user *ranges;
940 	struct vfio_device_feature_dma_logging_control control;
941 	struct vfio_device_feature_dma_logging_range range;
942 	struct rb_root_cached root = RB_ROOT_CACHED;
943 	struct interval_tree_node *nodes;
944 	u64 iova_end;
945 	u32 nnodes;
946 	int i, ret;
947 
948 	if (!device->log_ops)
949 		return -ENOTTY;
950 
951 	ret = vfio_check_feature(flags, argsz,
952 				 VFIO_DEVICE_FEATURE_SET,
953 				 sizeof(control));
954 	if (ret != 1)
955 		return ret;
956 
957 	if (copy_from_user(&control, arg, minsz))
958 		return -EFAULT;
959 
960 	nnodes = control.num_ranges;
961 	if (!nnodes)
962 		return -EINVAL;
963 
964 	if (nnodes > LOG_MAX_RANGES)
965 		return -E2BIG;
966 
967 	ranges = u64_to_user_ptr(control.ranges);
968 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
969 			      GFP_KERNEL);
970 	if (!nodes)
971 		return -ENOMEM;
972 
973 	for (i = 0; i < nnodes; i++) {
974 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
975 			ret = -EFAULT;
976 			goto end;
977 		}
978 		if (!IS_ALIGNED(range.iova, control.page_size) ||
979 		    !IS_ALIGNED(range.length, control.page_size)) {
980 			ret = -EINVAL;
981 			goto end;
982 		}
983 
984 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
985 		    iova_end > ULONG_MAX) {
986 			ret = -EOVERFLOW;
987 			goto end;
988 		}
989 
990 		nodes[i].start = range.iova;
991 		nodes[i].last = range.iova + range.length - 1;
992 		if (interval_tree_iter_first(&root, nodes[i].start,
993 					     nodes[i].last)) {
994 			/* Range overlapping */
995 			ret = -EINVAL;
996 			goto end;
997 		}
998 		interval_tree_insert(nodes + i, &root);
999 	}
1000 
1001 	ret = device->log_ops->log_start(device, &root, nnodes,
1002 					 &control.page_size);
1003 	if (ret)
1004 		goto end;
1005 
1006 	if (copy_to_user(arg, &control, sizeof(control))) {
1007 		ret = -EFAULT;
1008 		device->log_ops->log_stop(device);
1009 	}
1010 
1011 end:
1012 	kfree(nodes);
1013 	return ret;
1014 }
1015 
1016 static int
1017 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1018 				       u32 flags, void __user *arg,
1019 				       size_t argsz)
1020 {
1021 	int ret;
1022 
1023 	if (!device->log_ops)
1024 		return -ENOTTY;
1025 
1026 	ret = vfio_check_feature(flags, argsz,
1027 				 VFIO_DEVICE_FEATURE_SET, 0);
1028 	if (ret != 1)
1029 		return ret;
1030 
1031 	return device->log_ops->log_stop(device);
1032 }
1033 
1034 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1035 					  unsigned long iova, size_t length,
1036 					  void *opaque)
1037 {
1038 	struct vfio_device *device = opaque;
1039 
1040 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1041 }
1042 
1043 static int
1044 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1045 					 u32 flags, void __user *arg,
1046 					 size_t argsz)
1047 {
1048 	size_t minsz =
1049 		offsetofend(struct vfio_device_feature_dma_logging_report,
1050 			    bitmap);
1051 	struct vfio_device_feature_dma_logging_report report;
1052 	struct iova_bitmap *iter;
1053 	u64 iova_end;
1054 	int ret;
1055 
1056 	if (!device->log_ops)
1057 		return -ENOTTY;
1058 
1059 	ret = vfio_check_feature(flags, argsz,
1060 				 VFIO_DEVICE_FEATURE_GET,
1061 				 sizeof(report));
1062 	if (ret != 1)
1063 		return ret;
1064 
1065 	if (copy_from_user(&report, arg, minsz))
1066 		return -EFAULT;
1067 
1068 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1069 		return -EINVAL;
1070 
1071 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1072 	    iova_end > ULONG_MAX)
1073 		return -EOVERFLOW;
1074 
1075 	iter = iova_bitmap_alloc(report.iova, report.length,
1076 				 report.page_size,
1077 				 u64_to_user_ptr(report.bitmap));
1078 	if (IS_ERR(iter))
1079 		return PTR_ERR(iter);
1080 
1081 	ret = iova_bitmap_for_each(iter, device,
1082 				   vfio_device_log_read_and_clear);
1083 
1084 	iova_bitmap_free(iter);
1085 	return ret;
1086 }
1087 
1088 static int vfio_ioctl_device_feature(struct vfio_device *device,
1089 				     struct vfio_device_feature __user *arg)
1090 {
1091 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1092 	struct vfio_device_feature feature;
1093 
1094 	if (copy_from_user(&feature, arg, minsz))
1095 		return -EFAULT;
1096 
1097 	if (feature.argsz < minsz)
1098 		return -EINVAL;
1099 
1100 	/* Check unknown flags */
1101 	if (feature.flags &
1102 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1103 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1104 		return -EINVAL;
1105 
1106 	/* GET & SET are mutually exclusive except with PROBE */
1107 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1108 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1109 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1110 		return -EINVAL;
1111 
1112 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1113 	case VFIO_DEVICE_FEATURE_MIGRATION:
1114 		return vfio_ioctl_device_feature_migration(
1115 			device, feature.flags, arg->data,
1116 			feature.argsz - minsz);
1117 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1118 		return vfio_ioctl_device_feature_mig_device_state(
1119 			device, feature.flags, arg->data,
1120 			feature.argsz - minsz);
1121 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1122 		return vfio_ioctl_device_feature_logging_start(
1123 			device, feature.flags, arg->data,
1124 			feature.argsz - minsz);
1125 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1126 		return vfio_ioctl_device_feature_logging_stop(
1127 			device, feature.flags, arg->data,
1128 			feature.argsz - minsz);
1129 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1130 		return vfio_ioctl_device_feature_logging_report(
1131 			device, feature.flags, arg->data,
1132 			feature.argsz - minsz);
1133 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1134 		return vfio_ioctl_device_feature_migration_data_size(
1135 			device, feature.flags, arg->data,
1136 			feature.argsz - minsz);
1137 	default:
1138 		if (unlikely(!device->ops->device_feature))
1139 			return -EINVAL;
1140 		return device->ops->device_feature(device, feature.flags,
1141 						   arg->data,
1142 						   feature.argsz - minsz);
1143 	}
1144 }
1145 
1146 static long vfio_device_fops_unl_ioctl(struct file *filep,
1147 				       unsigned int cmd, unsigned long arg)
1148 {
1149 	struct vfio_device_file *df = filep->private_data;
1150 	struct vfio_device *device = df->device;
1151 	void __user *uptr = (void __user *)arg;
1152 	int ret;
1153 
1154 	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1155 		return vfio_df_ioctl_bind_iommufd(df, uptr);
1156 
1157 	/* Paired with smp_store_release() following vfio_df_open() */
1158 	if (!smp_load_acquire(&df->access_granted))
1159 		return -EINVAL;
1160 
1161 	ret = vfio_device_pm_runtime_get(device);
1162 	if (ret)
1163 		return ret;
1164 
1165 	/* cdev only ioctls */
1166 	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1167 		switch (cmd) {
1168 		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1169 			ret = vfio_df_ioctl_attach_pt(df, uptr);
1170 			goto out;
1171 
1172 		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1173 			ret = vfio_df_ioctl_detach_pt(df, uptr);
1174 			goto out;
1175 		}
1176 	}
1177 
1178 	switch (cmd) {
1179 	case VFIO_DEVICE_FEATURE:
1180 		ret = vfio_ioctl_device_feature(device, uptr);
1181 		break;
1182 
1183 	default:
1184 		if (unlikely(!device->ops->ioctl))
1185 			ret = -EINVAL;
1186 		else
1187 			ret = device->ops->ioctl(device, cmd, arg);
1188 		break;
1189 	}
1190 out:
1191 	vfio_device_pm_runtime_put(device);
1192 	return ret;
1193 }
1194 
1195 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1196 				     size_t count, loff_t *ppos)
1197 {
1198 	struct vfio_device_file *df = filep->private_data;
1199 	struct vfio_device *device = df->device;
1200 
1201 	/* Paired with smp_store_release() following vfio_df_open() */
1202 	if (!smp_load_acquire(&df->access_granted))
1203 		return -EINVAL;
1204 
1205 	if (unlikely(!device->ops->read))
1206 		return -EINVAL;
1207 
1208 	return device->ops->read(device, buf, count, ppos);
1209 }
1210 
1211 static ssize_t vfio_device_fops_write(struct file *filep,
1212 				      const char __user *buf,
1213 				      size_t count, loff_t *ppos)
1214 {
1215 	struct vfio_device_file *df = filep->private_data;
1216 	struct vfio_device *device = df->device;
1217 
1218 	/* Paired with smp_store_release() following vfio_df_open() */
1219 	if (!smp_load_acquire(&df->access_granted))
1220 		return -EINVAL;
1221 
1222 	if (unlikely(!device->ops->write))
1223 		return -EINVAL;
1224 
1225 	return device->ops->write(device, buf, count, ppos);
1226 }
1227 
1228 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1229 {
1230 	struct vfio_device_file *df = filep->private_data;
1231 	struct vfio_device *device = df->device;
1232 
1233 	/* Paired with smp_store_release() following vfio_df_open() */
1234 	if (!smp_load_acquire(&df->access_granted))
1235 		return -EINVAL;
1236 
1237 	if (unlikely(!device->ops->mmap))
1238 		return -EINVAL;
1239 
1240 	return device->ops->mmap(device, vma);
1241 }
1242 
1243 const struct file_operations vfio_device_fops = {
1244 	.owner		= THIS_MODULE,
1245 	.open		= vfio_device_fops_cdev_open,
1246 	.release	= vfio_device_fops_release,
1247 	.read		= vfio_device_fops_read,
1248 	.write		= vfio_device_fops_write,
1249 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1250 	.compat_ioctl	= compat_ptr_ioctl,
1251 	.mmap		= vfio_device_fops_mmap,
1252 };
1253 
1254 static struct vfio_device *vfio_device_from_file(struct file *file)
1255 {
1256 	struct vfio_device_file *df = file->private_data;
1257 
1258 	if (file->f_op != &vfio_device_fops)
1259 		return NULL;
1260 	return df->device;
1261 }
1262 
1263 /**
1264  * vfio_file_is_valid - True if the file is valid vfio file
1265  * @file: VFIO group file or VFIO device file
1266  */
1267 bool vfio_file_is_valid(struct file *file)
1268 {
1269 	return vfio_group_from_file(file) ||
1270 	       vfio_device_from_file(file);
1271 }
1272 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1273 
1274 /**
1275  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1276  *        is always CPU cache coherent
1277  * @file: VFIO group file or VFIO device file
1278  *
1279  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1280  * bit in DMA transactions. A return of false indicates that the user has
1281  * rights to access additional instructions such as wbinvd on x86.
1282  */
1283 bool vfio_file_enforced_coherent(struct file *file)
1284 {
1285 	struct vfio_device *device;
1286 	struct vfio_group *group;
1287 
1288 	group = vfio_group_from_file(file);
1289 	if (group)
1290 		return vfio_group_enforced_coherent(group);
1291 
1292 	device = vfio_device_from_file(file);
1293 	if (device)
1294 		return device_iommu_capable(device->dev,
1295 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1296 
1297 	return true;
1298 }
1299 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1300 
1301 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1302 {
1303 	struct vfio_device_file *df = file->private_data;
1304 
1305 	/*
1306 	 * The kvm is first recorded in the vfio_device_file, and will
1307 	 * be propagated to vfio_device::kvm when the file is bound to
1308 	 * iommufd successfully in the vfio device cdev path.
1309 	 */
1310 	spin_lock(&df->kvm_ref_lock);
1311 	df->kvm = kvm;
1312 	spin_unlock(&df->kvm_ref_lock);
1313 }
1314 
1315 /**
1316  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1317  * @file: VFIO group file or VFIO device file
1318  * @kvm: KVM to link
1319  *
1320  * When a VFIO device is first opened the KVM will be available in
1321  * device->kvm if one was associated with the file.
1322  */
1323 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1324 {
1325 	struct vfio_group *group;
1326 
1327 	group = vfio_group_from_file(file);
1328 	if (group)
1329 		vfio_group_set_kvm(group, kvm);
1330 
1331 	if (vfio_device_from_file(file))
1332 		vfio_device_file_set_kvm(file, kvm);
1333 }
1334 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1335 
1336 /*
1337  * Sub-module support
1338  */
1339 /*
1340  * Helper for managing a buffer of info chain capabilities, allocate or
1341  * reallocate a buffer with additional @size, filling in @id and @version
1342  * of the capability.  A pointer to the new capability is returned.
1343  *
1344  * NB. The chain is based at the head of the buffer, so new entries are
1345  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1346  * next offsets prior to copying to the user buffer.
1347  */
1348 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1349 					       size_t size, u16 id, u16 version)
1350 {
1351 	void *buf;
1352 	struct vfio_info_cap_header *header, *tmp;
1353 
1354 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1355 	if (!buf) {
1356 		kfree(caps->buf);
1357 		caps->buf = NULL;
1358 		caps->size = 0;
1359 		return ERR_PTR(-ENOMEM);
1360 	}
1361 
1362 	caps->buf = buf;
1363 	header = buf + caps->size;
1364 
1365 	/* Eventually copied to user buffer, zero */
1366 	memset(header, 0, size);
1367 
1368 	header->id = id;
1369 	header->version = version;
1370 
1371 	/* Add to the end of the capability chain */
1372 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1373 		; /* nothing */
1374 
1375 	tmp->next = caps->size;
1376 	caps->size += size;
1377 
1378 	return header;
1379 }
1380 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1381 
1382 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1383 {
1384 	struct vfio_info_cap_header *tmp;
1385 	void *buf = (void *)caps->buf;
1386 
1387 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1388 		tmp->next += offset;
1389 }
1390 EXPORT_SYMBOL(vfio_info_cap_shift);
1391 
1392 int vfio_info_add_capability(struct vfio_info_cap *caps,
1393 			     struct vfio_info_cap_header *cap, size_t size)
1394 {
1395 	struct vfio_info_cap_header *header;
1396 
1397 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1398 	if (IS_ERR(header))
1399 		return PTR_ERR(header);
1400 
1401 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1402 
1403 	return 0;
1404 }
1405 EXPORT_SYMBOL(vfio_info_add_capability);
1406 
1407 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1408 				       int max_irq_type, size_t *data_size)
1409 {
1410 	unsigned long minsz;
1411 	size_t size;
1412 
1413 	minsz = offsetofend(struct vfio_irq_set, count);
1414 
1415 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1416 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1417 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1418 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1419 		return -EINVAL;
1420 
1421 	if (data_size)
1422 		*data_size = 0;
1423 
1424 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1425 		return -EINVAL;
1426 
1427 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1428 	case VFIO_IRQ_SET_DATA_NONE:
1429 		size = 0;
1430 		break;
1431 	case VFIO_IRQ_SET_DATA_BOOL:
1432 		size = sizeof(uint8_t);
1433 		break;
1434 	case VFIO_IRQ_SET_DATA_EVENTFD:
1435 		size = sizeof(int32_t);
1436 		break;
1437 	default:
1438 		return -EINVAL;
1439 	}
1440 
1441 	if (size) {
1442 		if (hdr->argsz - minsz < hdr->count * size)
1443 			return -EINVAL;
1444 
1445 		if (!data_size)
1446 			return -EINVAL;
1447 
1448 		*data_size = hdr->count * size;
1449 	}
1450 
1451 	return 0;
1452 }
1453 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1454 
1455 /*
1456  * Pin contiguous user pages and return their associated host pages for local
1457  * domain only.
1458  * @device [in]  : device
1459  * @iova [in]    : starting IOVA of user pages to be pinned.
1460  * @npage [in]   : count of pages to be pinned.  This count should not
1461  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1462  * @prot [in]    : protection flags
1463  * @pages[out]   : array of host pages
1464  * Return error or number of pages pinned.
1465  *
1466  * A driver may only call this function if the vfio_device was created
1467  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1468  */
1469 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1470 		   int npage, int prot, struct page **pages)
1471 {
1472 	/* group->container cannot change while a vfio device is open */
1473 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1474 		return -EINVAL;
1475 	if (vfio_device_has_container(device))
1476 		return vfio_device_container_pin_pages(device, iova,
1477 						       npage, prot, pages);
1478 	if (device->iommufd_access) {
1479 		int ret;
1480 
1481 		if (iova > ULONG_MAX)
1482 			return -EINVAL;
1483 		/*
1484 		 * VFIO ignores the sub page offset, npages is from the start of
1485 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1486 		 * the sub page offset by doing:
1487 		 *     pages[0] + (iova % PAGE_SIZE)
1488 		 */
1489 		ret = iommufd_access_pin_pages(
1490 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1491 			npage * PAGE_SIZE, pages,
1492 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1493 		if (ret)
1494 			return ret;
1495 		return npage;
1496 	}
1497 	return -EINVAL;
1498 }
1499 EXPORT_SYMBOL(vfio_pin_pages);
1500 
1501 /*
1502  * Unpin contiguous host pages for local domain only.
1503  * @device [in]  : device
1504  * @iova [in]    : starting address of user pages to be unpinned.
1505  * @npage [in]   : count of pages to be unpinned.  This count should not
1506  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1507  */
1508 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1509 {
1510 	if (WARN_ON(!vfio_assert_device_open(device)))
1511 		return;
1512 
1513 	if (vfio_device_has_container(device)) {
1514 		vfio_device_container_unpin_pages(device, iova, npage);
1515 		return;
1516 	}
1517 	if (device->iommufd_access) {
1518 		if (WARN_ON(iova > ULONG_MAX))
1519 			return;
1520 		iommufd_access_unpin_pages(device->iommufd_access,
1521 					   ALIGN_DOWN(iova, PAGE_SIZE),
1522 					   npage * PAGE_SIZE);
1523 		return;
1524 	}
1525 }
1526 EXPORT_SYMBOL(vfio_unpin_pages);
1527 
1528 /*
1529  * This interface allows the CPUs to perform some sort of virtual DMA on
1530  * behalf of the device.
1531  *
1532  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1533  * into/from a kernel buffer.
1534  *
1535  * As the read/write of user space memory is conducted via the CPUs and is
1536  * not a real device DMA, it is not necessary to pin the user space memory.
1537  *
1538  * @device [in]		: VFIO device
1539  * @iova [in]		: base IOVA of a user space buffer
1540  * @data [in]		: pointer to kernel buffer
1541  * @len [in]		: kernel buffer length
1542  * @write		: indicate read or write
1543  * Return error code on failure or 0 on success.
1544  */
1545 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1546 		size_t len, bool write)
1547 {
1548 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1549 		return -EINVAL;
1550 
1551 	if (vfio_device_has_container(device))
1552 		return vfio_device_container_dma_rw(device, iova,
1553 						    data, len, write);
1554 
1555 	if (device->iommufd_access) {
1556 		unsigned int flags = 0;
1557 
1558 		if (iova > ULONG_MAX)
1559 			return -EINVAL;
1560 
1561 		/* VFIO historically tries to auto-detect a kthread */
1562 		if (!current->mm)
1563 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1564 		if (write)
1565 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1566 		return iommufd_access_rw(device->iommufd_access, iova, data,
1567 					 len, flags);
1568 	}
1569 	return -EINVAL;
1570 }
1571 EXPORT_SYMBOL(vfio_dma_rw);
1572 
1573 /*
1574  * Module/class support
1575  */
1576 static int __init vfio_init(void)
1577 {
1578 	int ret;
1579 
1580 	ida_init(&vfio.device_ida);
1581 
1582 	ret = vfio_group_init();
1583 	if (ret)
1584 		return ret;
1585 
1586 	ret = vfio_virqfd_init();
1587 	if (ret)
1588 		goto err_virqfd;
1589 
1590 	/* /sys/class/vfio-dev/vfioX */
1591 	vfio.device_class = class_create("vfio-dev");
1592 	if (IS_ERR(vfio.device_class)) {
1593 		ret = PTR_ERR(vfio.device_class);
1594 		goto err_dev_class;
1595 	}
1596 
1597 	ret = vfio_cdev_init(vfio.device_class);
1598 	if (ret)
1599 		goto err_alloc_dev_chrdev;
1600 
1601 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1602 	return 0;
1603 
1604 err_alloc_dev_chrdev:
1605 	class_destroy(vfio.device_class);
1606 	vfio.device_class = NULL;
1607 err_dev_class:
1608 	vfio_virqfd_exit();
1609 err_virqfd:
1610 	vfio_group_cleanup();
1611 	return ret;
1612 }
1613 
1614 static void __exit vfio_cleanup(void)
1615 {
1616 	ida_destroy(&vfio.device_ida);
1617 	vfio_cdev_cleanup();
1618 	class_destroy(vfio.device_class);
1619 	vfio.device_class = NULL;
1620 	vfio_virqfd_exit();
1621 	vfio_group_cleanup();
1622 	xa_destroy(&vfio_device_set_xa);
1623 }
1624 
1625 module_init(vfio_init);
1626 module_exit(vfio_cleanup);
1627 
1628 MODULE_VERSION(DRIVER_VERSION);
1629 MODULE_LICENSE("GPL v2");
1630 MODULE_AUTHOR(DRIVER_AUTHOR);
1631 MODULE_DESCRIPTION(DRIVER_DESC);
1632 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1633