xref: /openbmc/linux/drivers/vfio/container.c (revision 90741096769bd75152a5fe397343e5893c7d905a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
4  *
5  * VFIO container (/dev/vfio/vfio)
6  */
7 #include <linux/file.h>
8 #include <linux/slab.h>
9 #include <linux/fs.h>
10 #include <linux/capability.h>
11 #include <linux/iommu.h>
12 #include <linux/miscdevice.h>
13 #include <linux/vfio.h>
14 #include <uapi/linux/vfio.h>
15 
16 #include "vfio.h"
17 
/*
 * State behind one open of /dev/vfio/vfio: the groups attached to the
 * container and the IOMMU backend driver, if any, selected for them.
 */
struct vfio_container {
	/* Reference count; the container is freed when it drops to zero. */
	struct kref			kref;
	/* Groups attached to this container (linked via container_next). */
	struct list_head		group_list;
	/* Taken around changes to group_list and the iommu_* fields below. */
	struct rw_semaphore		group_lock;
	/* Backend chosen by VFIO_SET_IOMMU; NULL while none is set. */
	struct vfio_iommu_driver	*iommu_driver;
	/* Opaque cookie returned by the backend's open() callback. */
	void				*iommu_data;
	/* Set on attach: true when the attached group is VFIO_NO_IOMMU. */
	bool				noiommu;
};
26 
/* File-global registry of the IOMMU backend drivers known to VFIO. */
static struct vfio {
	/* Registered struct vfio_iommu_driver entries (linked via vfio_next). */
	struct list_head		iommu_drivers_list;
	/* Serializes walks and updates of iommu_drivers_list. */
	struct mutex			iommu_drivers_lock;
} vfio;
31 
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Run-time switch for no-IOMMU mode, exposed as the module parameter
 * "enable_unsafe_noiommu_mode" (readable by everyone, writable by the owner).
 */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
38 
39 static void *vfio_noiommu_open(unsigned long arg)
40 {
41 	if (arg != VFIO_NOIOMMU_IOMMU)
42 		return ERR_PTR(-EINVAL);
43 	if (!capable(CAP_SYS_RAWIO))
44 		return ERR_PTR(-EPERM);
45 
46 	return NULL;
47 }
48 
/*
 * release() callback of the no-IOMMU backend.  vfio_noiommu_open() hands
 * out a NULL cookie, so there is nothing to tear down here.
 */
static void vfio_noiommu_release(void *iommu_data)
{
}
52 
53 static long vfio_noiommu_ioctl(void *iommu_data,
54 			       unsigned int cmd, unsigned long arg)
55 {
56 	if (cmd == VFIO_CHECK_EXTENSION)
57 		return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
58 
59 	return -ENOTTY;
60 }
61 
/*
 * attach_group() callback of the no-IOMMU backend: does no work and always
 * reports success.
 */
static int vfio_noiommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	return 0;
}
67 
/*
 * detach_group() callback of the no-IOMMU backend: attach did no work, so
 * there is nothing to undo.
 */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}
72 
/*
 * Callback table of the built-in "vfio-noiommu" backend.  Its address also
 * identifies the backend in vfio_iommu_driver_allowed().
 */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
82 
83 /*
84  * Only noiommu containers can use vfio-noiommu and noiommu containers can only
85  * use vfio-noiommu.
86  */
87 static bool vfio_iommu_driver_allowed(struct vfio_container *container,
88 				      const struct vfio_iommu_driver *driver)
89 {
90 	if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU))
91 		return true;
92 	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
93 }
94 
95 /*
96  * IOMMU driver registration
97  */
98 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
99 {
100 	struct vfio_iommu_driver *driver, *tmp;
101 
102 	if (WARN_ON(!ops->register_device != !ops->unregister_device))
103 		return -EINVAL;
104 
105 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
106 	if (!driver)
107 		return -ENOMEM;
108 
109 	driver->ops = ops;
110 
111 	mutex_lock(&vfio.iommu_drivers_lock);
112 
113 	/* Check for duplicates */
114 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
115 		if (tmp->ops == ops) {
116 			mutex_unlock(&vfio.iommu_drivers_lock);
117 			kfree(driver);
118 			return -EINVAL;
119 		}
120 	}
121 
122 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
123 
124 	mutex_unlock(&vfio.iommu_drivers_lock);
125 
126 	return 0;
127 }
128 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
129 
130 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
131 {
132 	struct vfio_iommu_driver *driver;
133 
134 	mutex_lock(&vfio.iommu_drivers_lock);
135 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
136 		if (driver->ops == ops) {
137 			list_del(&driver->vfio_next);
138 			mutex_unlock(&vfio.iommu_drivers_lock);
139 			kfree(driver);
140 			return;
141 		}
142 	}
143 	mutex_unlock(&vfio.iommu_drivers_lock);
144 }
145 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
146 
147 /*
148  * Container objects - containers are created when /dev/vfio/vfio is
149  * opened, but their lifecycle extends until the last user is done, so
150  * it's freed via kref.  Must support container/group/device being
151  * closed in any order.
152  */
153 static void vfio_container_release(struct kref *kref)
154 {
155 	struct vfio_container *container;
156 	container = container_of(kref, struct vfio_container, kref);
157 
158 	kfree(container);
159 }
160 
/* Take an additional reference on @container. */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}
165 
/*
 * Drop a reference on @container; the final put frees it through
 * vfio_container_release().
 */
static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
170 
171 void vfio_device_container_register(struct vfio_device *device)
172 {
173 	struct vfio_iommu_driver *iommu_driver =
174 		device->group->container->iommu_driver;
175 
176 	if (iommu_driver && iommu_driver->ops->register_device)
177 		iommu_driver->ops->register_device(
178 			device->group->container->iommu_data, device);
179 }
180 
181 void vfio_device_container_unregister(struct vfio_device *device)
182 {
183 	struct vfio_iommu_driver *iommu_driver =
184 		device->group->container->iommu_driver;
185 
186 	if (iommu_driver && iommu_driver->ops->unregister_device)
187 		iommu_driver->ops->unregister_device(
188 			device->group->container->iommu_data, device);
189 }
190 
191 static long
192 vfio_container_ioctl_check_extension(struct vfio_container *container,
193 				     unsigned long arg)
194 {
195 	struct vfio_iommu_driver *driver;
196 	long ret = 0;
197 
198 	down_read(&container->group_lock);
199 
200 	driver = container->iommu_driver;
201 
202 	switch (arg) {
203 		/* No base extensions yet */
204 	default:
205 		/*
206 		 * If no driver is set, poll all registered drivers for
207 		 * extensions and return the first positive result.  If
208 		 * a driver is already set, further queries will be passed
209 		 * only to that driver.
210 		 */
211 		if (!driver) {
212 			mutex_lock(&vfio.iommu_drivers_lock);
213 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
214 					    vfio_next) {
215 
216 				if (!list_empty(&container->group_list) &&
217 				    !vfio_iommu_driver_allowed(container,
218 							       driver))
219 					continue;
220 				if (!try_module_get(driver->ops->owner))
221 					continue;
222 
223 				ret = driver->ops->ioctl(NULL,
224 							 VFIO_CHECK_EXTENSION,
225 							 arg);
226 				module_put(driver->ops->owner);
227 				if (ret > 0)
228 					break;
229 			}
230 			mutex_unlock(&vfio.iommu_drivers_lock);
231 		} else
232 			ret = driver->ops->ioctl(container->iommu_data,
233 						 VFIO_CHECK_EXTENSION, arg);
234 	}
235 
236 	up_read(&container->group_lock);
237 
238 	return ret;
239 }
240 
241 /* hold write lock on container->group_lock */
242 static int __vfio_container_attach_groups(struct vfio_container *container,
243 					  struct vfio_iommu_driver *driver,
244 					  void *data)
245 {
246 	struct vfio_group *group;
247 	int ret = -ENODEV;
248 
249 	list_for_each_entry(group, &container->group_list, container_next) {
250 		ret = driver->ops->attach_group(data, group->iommu_group,
251 						group->type);
252 		if (ret)
253 			goto unwind;
254 	}
255 
256 	return ret;
257 
258 unwind:
259 	list_for_each_entry_continue_reverse(group, &container->group_list,
260 					     container_next) {
261 		driver->ops->detach_group(data, group->iommu_group);
262 	}
263 
264 	return ret;
265 }
266 
267 static long vfio_ioctl_set_iommu(struct vfio_container *container,
268 				 unsigned long arg)
269 {
270 	struct vfio_iommu_driver *driver;
271 	long ret = -ENODEV;
272 
273 	down_write(&container->group_lock);
274 
275 	/*
276 	 * The container is designed to be an unprivileged interface while
277 	 * the group can be assigned to specific users.  Therefore, only by
278 	 * adding a group to a container does the user get the privilege of
279 	 * enabling the iommu, which may allocate finite resources.  There
280 	 * is no unset_iommu, but by removing all the groups from a container,
281 	 * the container is deprivileged and returns to an unset state.
282 	 */
283 	if (list_empty(&container->group_list) || container->iommu_driver) {
284 		up_write(&container->group_lock);
285 		return -EINVAL;
286 	}
287 
288 	mutex_lock(&vfio.iommu_drivers_lock);
289 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
290 		void *data;
291 
292 		if (!vfio_iommu_driver_allowed(container, driver))
293 			continue;
294 		if (!try_module_get(driver->ops->owner))
295 			continue;
296 
297 		/*
298 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
299 		 * so test which iommu driver reported support for this
300 		 * extension and call open on them.  We also pass them the
301 		 * magic, allowing a single driver to support multiple
302 		 * interfaces if they'd like.
303 		 */
304 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
305 			module_put(driver->ops->owner);
306 			continue;
307 		}
308 
309 		data = driver->ops->open(arg);
310 		if (IS_ERR(data)) {
311 			ret = PTR_ERR(data);
312 			module_put(driver->ops->owner);
313 			continue;
314 		}
315 
316 		ret = __vfio_container_attach_groups(container, driver, data);
317 		if (ret) {
318 			driver->ops->release(data);
319 			module_put(driver->ops->owner);
320 			continue;
321 		}
322 
323 		container->iommu_driver = driver;
324 		container->iommu_data = data;
325 		break;
326 	}
327 
328 	mutex_unlock(&vfio.iommu_drivers_lock);
329 	up_write(&container->group_lock);
330 
331 	return ret;
332 }
333 
334 static long vfio_fops_unl_ioctl(struct file *filep,
335 				unsigned int cmd, unsigned long arg)
336 {
337 	struct vfio_container *container = filep->private_data;
338 	struct vfio_iommu_driver *driver;
339 	void *data;
340 	long ret = -EINVAL;
341 
342 	if (!container)
343 		return ret;
344 
345 	switch (cmd) {
346 	case VFIO_GET_API_VERSION:
347 		ret = VFIO_API_VERSION;
348 		break;
349 	case VFIO_CHECK_EXTENSION:
350 		ret = vfio_container_ioctl_check_extension(container, arg);
351 		break;
352 	case VFIO_SET_IOMMU:
353 		ret = vfio_ioctl_set_iommu(container, arg);
354 		break;
355 	default:
356 		driver = container->iommu_driver;
357 		data = container->iommu_data;
358 
359 		if (driver) /* passthrough all unrecognized ioctls */
360 			ret = driver->ops->ioctl(data, cmd, arg);
361 	}
362 
363 	return ret;
364 }
365 
366 static int vfio_fops_open(struct inode *inode, struct file *filep)
367 {
368 	struct vfio_container *container;
369 
370 	container = kzalloc(sizeof(*container), GFP_KERNEL);
371 	if (!container)
372 		return -ENOMEM;
373 
374 	INIT_LIST_HEAD(&container->group_list);
375 	init_rwsem(&container->group_lock);
376 	kref_init(&container->kref);
377 
378 	filep->private_data = container;
379 
380 	return 0;
381 }
382 
383 static int vfio_fops_release(struct inode *inode, struct file *filep)
384 {
385 	struct vfio_container *container = filep->private_data;
386 	struct vfio_iommu_driver *driver = container->iommu_driver;
387 
388 	if (driver && driver->ops->notify)
389 		driver->ops->notify(container->iommu_data,
390 				    VFIO_IOMMU_CONTAINER_CLOSE);
391 
392 	filep->private_data = NULL;
393 
394 	vfio_container_put(container);
395 
396 	return 0;
397 }
398 
/*
 * File operations of the container device.  The address of this table is
 * also used by vfio_container_from_file() to recognize container fds.
 */
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};
406 
407 struct vfio_container *vfio_container_from_file(struct file *file)
408 {
409 	struct vfio_container *container;
410 
411 	/* Sanity check, is this really our fd? */
412 	if (file->f_op != &vfio_fops)
413 		return NULL;
414 
415 	container = file->private_data;
416 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
417 	return container;
418 }
419 
/*
 * Misc device behind /dev/vfio/vfio.  The node is created readable and
 * writable by all users (S_IRUGO | S_IWUGO).
 */
static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
427 
428 int vfio_container_attach_group(struct vfio_container *container,
429 				struct vfio_group *group)
430 {
431 	struct vfio_iommu_driver *driver;
432 	int ret = 0;
433 
434 	lockdep_assert_held(&group->group_lock);
435 
436 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
437 		return -EPERM;
438 
439 	down_write(&container->group_lock);
440 
441 	/* Real groups and fake groups cannot mix */
442 	if (!list_empty(&container->group_list) &&
443 	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
444 		ret = -EPERM;
445 		goto out_unlock_container;
446 	}
447 
448 	if (group->type == VFIO_IOMMU) {
449 		ret = iommu_group_claim_dma_owner(group->iommu_group, group);
450 		if (ret)
451 			goto out_unlock_container;
452 	}
453 
454 	driver = container->iommu_driver;
455 	if (driver) {
456 		ret = driver->ops->attach_group(container->iommu_data,
457 						group->iommu_group,
458 						group->type);
459 		if (ret) {
460 			if (group->type == VFIO_IOMMU)
461 				iommu_group_release_dma_owner(
462 					group->iommu_group);
463 			goto out_unlock_container;
464 		}
465 	}
466 
467 	group->container = container;
468 	group->container_users = 1;
469 	container->noiommu = (group->type == VFIO_NO_IOMMU);
470 	list_add(&group->container_next, &container->group_list);
471 
472 	/* Get a reference on the container and mark a user within the group */
473 	vfio_container_get(container);
474 
475 out_unlock_container:
476 	up_write(&container->group_lock);
477 	return ret;
478 }
479 
480 void vfio_group_detach_container(struct vfio_group *group)
481 {
482 	struct vfio_container *container = group->container;
483 	struct vfio_iommu_driver *driver;
484 
485 	lockdep_assert_held(&group->group_lock);
486 	WARN_ON(group->container_users != 1);
487 
488 	down_write(&container->group_lock);
489 
490 	driver = container->iommu_driver;
491 	if (driver)
492 		driver->ops->detach_group(container->iommu_data,
493 					  group->iommu_group);
494 
495 	if (group->type == VFIO_IOMMU)
496 		iommu_group_release_dma_owner(group->iommu_group);
497 
498 	group->container = NULL;
499 	group->container_users = 0;
500 	list_del(&group->container_next);
501 
502 	/* Detaching the last group deprivileges a container, remove iommu */
503 	if (driver && list_empty(&container->group_list)) {
504 		driver->ops->release(container->iommu_data);
505 		module_put(driver->ops->owner);
506 		container->iommu_driver = NULL;
507 		container->iommu_data = NULL;
508 	}
509 
510 	up_write(&container->group_lock);
511 
512 	vfio_container_put(container);
513 }
514 
515 int vfio_group_use_container(struct vfio_group *group)
516 {
517 	lockdep_assert_held(&group->group_lock);
518 
519 	/*
520 	 * The container fd has been assigned with VFIO_GROUP_SET_CONTAINER but
521 	 * VFIO_SET_IOMMU hasn't been done yet.
522 	 */
523 	if (!group->container->iommu_driver)
524 		return -EINVAL;
525 
526 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
527 		return -EPERM;
528 
529 	get_file(group->opened_file);
530 	group->container_users++;
531 	return 0;
532 }
533 
/*
 * Undo vfio_group_use_container(): drop one container user and the file
 * reference taken for it.  The caller must hold group->group_lock.
 */
void vfio_group_unuse_container(struct vfio_group *group)
{
	lockdep_assert_held(&group->group_lock);

	/* The count set at attach time (1) must never be dropped here. */
	WARN_ON(group->container_users <= 1);
	group->container_users--;
	fput(group->opened_file);
}
542 
543 int vfio_device_container_pin_pages(struct vfio_device *device,
544 				    dma_addr_t iova, int npage,
545 				    int prot, struct page **pages)
546 {
547 	struct vfio_container *container = device->group->container;
548 	struct iommu_group *iommu_group = device->group->iommu_group;
549 	struct vfio_iommu_driver *driver = container->iommu_driver;
550 
551 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
552 		return -E2BIG;
553 
554 	if (unlikely(!driver || !driver->ops->pin_pages))
555 		return -ENOTTY;
556 	return driver->ops->pin_pages(container->iommu_data, iommu_group, iova,
557 				      npage, prot, pages);
558 }
559 
560 void vfio_device_container_unpin_pages(struct vfio_device *device,
561 				       dma_addr_t iova, int npage)
562 {
563 	struct vfio_container *container = device->group->container;
564 
565 	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
566 		return;
567 
568 	container->iommu_driver->ops->unpin_pages(container->iommu_data, iova,
569 						  npage);
570 }
571 
572 int vfio_device_container_dma_rw(struct vfio_device *device,
573 				 dma_addr_t iova, void *data,
574 				 size_t len, bool write)
575 {
576 	struct vfio_container *container = device->group->container;
577 	struct vfio_iommu_driver *driver = container->iommu_driver;
578 
579 	if (unlikely(!driver || !driver->ops->dma_rw))
580 		return -ENOTTY;
581 	return driver->ops->dma_rw(container->iommu_data, iova, data, len,
582 				   write);
583 }
584 
585 int __init vfio_container_init(void)
586 {
587 	int ret;
588 
589 	mutex_init(&vfio.iommu_drivers_lock);
590 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
591 
592 	ret = misc_register(&vfio_dev);
593 	if (ret) {
594 		pr_err("vfio: misc device register failed\n");
595 		return ret;
596 	}
597 
598 	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) {
599 		ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
600 		if (ret)
601 			goto err_misc;
602 	}
603 	return 0;
604 
605 err_misc:
606 	misc_deregister(&vfio_dev);
607 	return ret;
608 }
609 
/*
 * Tear down what vfio_container_init() set up, in reverse order: the
 * no-IOMMU backend (if configured), the misc device, then the registry lock.
 */
void vfio_container_cleanup(void)
{
	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU))
		vfio_unregister_iommu_driver(&vfio_noiommu_ops);
	misc_deregister(&vfio_dev);
	mutex_destroy(&vfio.iommu_drivers_lock);
}
617 
/* Module aliases for the misc minor number and the vfio/vfio device name. */
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
620