xref: /openbmc/linux/drivers/iommu/iommufd/device.c (revision 6129b59fcdf374b5d82e1f4518884da13de38b1a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
3  */
4 #include <linux/iommufd.h>
5 #include <linux/slab.h>
6 #include <linux/iommu.h>
7 #include "../iommu-priv.h"
8 
9 #include "io_pagetable.h"
10 #include "iommufd_private.h"
11 
12 static bool allow_unsafe_interrupts;
13 module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
14 MODULE_PARM_DESC(
15 	allow_unsafe_interrupts,
16 	"Allow IOMMUFD to bind to devices even if the platform cannot isolate "
17 	"the MSI interrupt window. Enabling this is a security weakness.");
18 
19 static void iommufd_group_release(struct kref *kref)
20 {
21 	struct iommufd_group *igroup =
22 		container_of(kref, struct iommufd_group, ref);
23 
24 	WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list));
25 
26 	xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
27 		   NULL, GFP_KERNEL);
28 	iommu_group_put(igroup->group);
29 	mutex_destroy(&igroup->lock);
30 	kfree(igroup);
31 }
32 
33 static void iommufd_put_group(struct iommufd_group *group)
34 {
35 	kref_put(&group->ref, iommufd_group_release);
36 }
37 
38 static bool iommufd_group_try_get(struct iommufd_group *igroup,
39 				  struct iommu_group *group)
40 {
41 	if (!igroup)
42 		return false;
43 	/*
44 	 * Group IDs cannot be re-used until the group is put back, which does
45 	 * not happen while we can still load an igroup pointer under the xa_lock.
46 	 */
47 	if (WARN_ON(igroup->group != group))
48 		return false;
49 	return kref_get_unless_zero(&igroup->ref);
50 }
51 
52 /*
53  * iommufd needs to store some more data for each iommu_group, so we keep a
54  * parallel xarray indexed by iommu_group id to hold this instead of putting it
55  * in the core structure. To keep things simple the iommufd_group memory is
56  * unique within the iommufd_ctx. This makes it easy to check there are no
57  * memory leaks.
58  */
59 static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
60 					       struct device *dev)
61 {
62 	struct iommufd_group *new_igroup;
63 	struct iommufd_group *cur_igroup;
64 	struct iommufd_group *igroup;
65 	struct iommu_group *group;
66 	unsigned int id;
67 
68 	group = iommu_group_get(dev);
69 	if (!group)
70 		return ERR_PTR(-ENODEV);
71 
72 	id = iommu_group_id(group);
73 
74 	xa_lock(&ictx->groups);
75 	igroup = xa_load(&ictx->groups, id);
76 	if (iommufd_group_try_get(igroup, group)) {
77 		xa_unlock(&ictx->groups);
78 		iommu_group_put(group);
79 		return igroup;
80 	}
81 	xa_unlock(&ictx->groups);
82 
83 	new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL);
84 	if (!new_igroup) {
85 		iommu_group_put(group);
86 		return ERR_PTR(-ENOMEM);
87 	}
88 
89 	kref_init(&new_igroup->ref);
90 	mutex_init(&new_igroup->lock);
91 	INIT_LIST_HEAD(&new_igroup->device_list);
92 	new_igroup->sw_msi_start = PHYS_ADDR_MAX;
93 	/* group reference moves into new_igroup */
94 	new_igroup->group = group;
95 
96 	/*
97 	 * The ictx is not additionally refcounted here because all objects using
98 	 * an igroup must put it before their destroy completes.
99 	 */
100 	new_igroup->ictx = ictx;
101 
102 	/*
103 	 * We dropped the lock so igroup is invalid. NULL is a safe and likely
104 	 * value to assume for the xa_cmpxchg algorithm.
105 	 */
106 	cur_igroup = NULL;
107 	xa_lock(&ictx->groups);
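	/*
	 * Loop until either new_igroup is installed at this id, or a
	 * concurrently installed igroup can have its reference taken.
	 */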
108 	while (true) {
109 		igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup,
110 				      GFP_KERNEL);
111 		if (xa_is_err(igroup)) {
112 			xa_unlock(&ictx->groups);
113 			iommufd_put_group(new_igroup);
114 			return ERR_PTR(xa_err(igroup));
115 		}
116 
117 		/* new_igroup was successfully installed */
118 		if (cur_igroup == igroup) {
119 			xa_unlock(&ictx->groups);
120 			return new_igroup;
121 		}
122 
123 		/* Check again if the current group is any good */
124 		if (iommufd_group_try_get(igroup, group)) {
125 			xa_unlock(&ictx->groups);
126 			iommufd_put_group(new_igroup);
127 			return igroup;
128 		}
129 		cur_igroup = igroup;
130 	}
131 }
132 
133 void iommufd_device_destroy(struct iommufd_object *obj)
134 {
135 	struct iommufd_device *idev =
136 		container_of(obj, struct iommufd_device, obj);
137 
138 	iommu_device_release_dma_owner(idev->dev);
139 	iommufd_put_group(idev->igroup);
140 	if (!iommufd_selftest_is_mock_dev(idev->dev))
141 		iommufd_ctx_put(idev->ictx);
142 }
143 
144 /**
145  * iommufd_device_bind - Bind a physical device to an iommu fd
146  * @ictx: iommufd file descriptor
147  * @dev: Pointer to a physical device struct
148  * @id: Output ID number to return to userspace for this device
149  *
150  * A successful bind establishes ownership of the device and returns a
151  * struct iommufd_device pointer; otherwise an error pointer is returned.
152  *
153  * A driver using this API must set driver_managed_dma and must not touch
154  * the device until this routine succeeds and establishes ownership.
155  *
156  * Binding a PCI device places the entire RID under iommufd control.
157  *
158  * The caller must undo this with iommufd_device_unbind()
159  */
160 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
161 					   struct device *dev, u32 *id)
162 {
163 	struct iommufd_device *idev;
164 	struct iommufd_group *igroup;
165 	int rc;
166 
167 	/*
168 	 * iommufd always sets IOMMU_CACHE because we offer no way for userspace
169 	 * to restore cache coherency.
170 	 */
171 	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
172 		return ERR_PTR(-EINVAL);
173 
174 	igroup = iommufd_get_group(ictx, dev);
175 	if (IS_ERR(igroup))
176 		return ERR_CAST(igroup);
177 
178 	/*
179 	 * For historical compat with VFIO the insecure interrupt path is
180 	 * allowed if the module parameter is set. Secure/Isolated means that a
181 	 * MemWr operation from the device (eg a simple DMA) cannot trigger an
182 	 * interrupt outside this iommufd context.
183 	 */
184 	if (!iommufd_selftest_is_mock_dev(dev) &&
185 	    !iommu_group_has_isolated_msi(igroup->group)) {
186 		if (!allow_unsafe_interrupts) {
187 			rc = -EPERM;
188 			goto out_group_put;
189 		}
190 
191 		dev_warn(
192 			dev,
193 			"MSI interrupts are not secure, they cannot be isolated by the platform. "
194 			"Check that platform features like interrupt remapping are enabled. "
195 			"Use the \"allow_unsafe_interrupts\" module parameter to override\n");
196 	}
197 
198 	rc = iommu_device_claim_dma_owner(dev, ictx);
199 	if (rc)
200 		goto out_group_put;
201 
202 	idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
203 	if (IS_ERR(idev)) {
204 		rc = PTR_ERR(idev);
205 		goto out_release_owner;
206 	}
207 	idev->ictx = ictx;
208 	if (!iommufd_selftest_is_mock_dev(dev))
209 		iommufd_ctx_get(ictx);
210 	idev->dev = dev;
211 	idev->enforce_cache_coherency =
212 		device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
213 	/* The calling driver is a user until iommufd_device_unbind() */
214 	refcount_inc(&idev->obj.users);
215 	/* igroup refcount moves into iommufd_device */
216 	idev->igroup = igroup;
217 
218 	/*
219 	 * If the caller fails after this success, it must call
220 	 * iommufd_device_unbind(), which is safe since we hold this refcount.
221 	 * This also means the device is a leaf in the graph and no other object
222 	 * can take a reference on it.
223 	 */
224 	iommufd_object_finalize(ictx, &idev->obj);
225 	*id = idev->obj.id;
226 	return idev;
227 
228 out_release_owner:
229 	iommu_device_release_dma_owner(dev);
230 out_group_put:
231 	iommufd_put_group(igroup);
232 	return ERR_PTR(rc);
233 }
234 EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
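
/*
 * Example (sketch only, not compiled): a driver_managed_dma driver consuming
 * the bind API above. How the driver obtains the iommufd_ctx ("ictx" below)
 * and reports "dev_id" to userspace is driver-specific and only assumed here.
 *
 *	struct iommufd_device *idev;
 *	u32 dev_id;
 *
 *	idev = iommufd_device_bind(ictx, dev, &dev_id);
 *	if (IS_ERR(idev))
 *		return PTR_ERR(idev);
 *	...
 *	iommufd_device_unbind(idev);
 */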
235 
236 /**
237  * iommufd_ctx_has_group - True if any device within the group is bound
238  *                         to the ictx
239  * @ictx: iommufd file descriptor
240  * @group: Pointer to a physical iommu_group struct
241  *
242  * True if any device within the group has been bound to this ictx, e.g. via
243  * iommufd_device_bind(), therefore implying ictx ownership of the group.
244  */
245 bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group)
246 {
247 	struct iommufd_object *obj;
248 	unsigned long index;
249 
250 	if (!ictx || !group)
251 		return false;
252 
253 	xa_lock(&ictx->objects);
254 	xa_for_each(&ictx->objects, index, obj) {
255 		if (obj->type == IOMMUFD_OBJ_DEVICE &&
256 		    container_of(obj, struct iommufd_device, obj)
257 				    ->igroup->group == group) {
258 			xa_unlock(&ictx->objects);
259 			return true;
260 		}
261 	}
262 	xa_unlock(&ictx->objects);
263 	return false;
264 }
265 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
266 
267 /**
268  * iommufd_device_unbind - Undo iommufd_device_bind()
269  * @idev: Device returned by iommufd_device_bind()
270  *
271  * Release the device from iommufd control. DMA ownership returns to unowned,
272  * with DMA controlled by the DMA API. This invalidates the
273  * iommufd_device pointer, other APIs that consume it must not be called
274  * concurrently.
275  */
276 void iommufd_device_unbind(struct iommufd_device *idev)
277 {
278 	bool was_destroyed;
279 
280 	was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj);
281 	WARN_ON(!was_destroyed);
282 }
283 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
284 
285 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev)
286 {
287 	return idev->ictx;
288 }
289 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD);
290 
291 u32 iommufd_device_to_id(struct iommufd_device *idev)
292 {
293 	return idev->obj.id;
294 }
295 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);
296 
297 static int iommufd_group_setup_msi(struct iommufd_group *igroup,
298 				   struct iommufd_hw_pagetable *hwpt)
299 {
300 	phys_addr_t sw_msi_start = igroup->sw_msi_start;
301 	int rc;
302 
303 	/*
304 	 * If the IOMMU driver gives an IOMMU_RESV_SW_MSI then it is asking us to
305 	 * call iommu_get_msi_cookie() on its behalf. This is necessary to set up
306 	 * the MSI window so iommu_dma_prepare_msi() can install pages into our
307 	 * domain after request_irq(). If it is not done, interrupts will not
308 	 * work on this domain.
309 	 *
310 	 * FIXME: This is conceptually broken for iommufd since we want to allow
311 	 * userspace to change the domains, eg switch from an identity IOAS to a
312 	 * DMA IOAS. There is currently no way to create an MSI window that
313 	 * matches what the IRQ layer actually expects in a newly created
314 	 * domain.
315 	 */
316 	if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) {
317 		rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
318 		if (rc)
319 			return rc;
320 
321 		/*
322 		 * iommu_get_msi_cookie() can only be called once per domain,
323 		 * it returns -EBUSY on later calls.
324 		 */
325 		hwpt->msi_cookie = true;
326 	}
327 	return 0;
328 }
329 
330 int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
331 				struct iommufd_device *idev)
332 {
333 	int rc;
334 
335 	mutex_lock(&idev->igroup->lock);
336 
337 	if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) {
338 		rc = -EINVAL;
339 		goto err_unlock;
340 	}
341 
342 	/* Try to upgrade the domain we have */
343 	if (idev->enforce_cache_coherency) {
344 		rc = iommufd_hw_pagetable_enforce_cc(hwpt);
345 		if (rc)
346 			goto err_unlock;
347 	}
348 
349 	rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev,
350 						 &idev->igroup->sw_msi_start);
351 	if (rc)
352 		goto err_unlock;
353 
354 	/*
355 	 * Only attach to the group once for the first device that is in the
356 	 * group. All the other devices will follow this attachment. The user
357 	 * should attach every device individually to the hwpt as the per-device
358 	 * reserved regions are only updated during individual device
359 	 * attachment.
360 	 */
361 	if (list_empty(&idev->igroup->device_list)) {
362 		rc = iommufd_group_setup_msi(idev->igroup, hwpt);
363 		if (rc)
364 			goto err_unresv;
365 
366 		rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
367 		if (rc)
368 			goto err_unresv;
369 		idev->igroup->hwpt = hwpt;
370 	}
371 	refcount_inc(&hwpt->obj.users);
372 	list_add_tail(&idev->group_item, &idev->igroup->device_list);
373 	mutex_unlock(&idev->igroup->lock);
374 	return 0;
375 err_unresv:
376 	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
377 err_unlock:
378 	mutex_unlock(&idev->igroup->lock);
379 	return rc;
380 }
381 
382 struct iommufd_hw_pagetable *
383 iommufd_hw_pagetable_detach(struct iommufd_device *idev)
384 {
385 	struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt;
386 
387 	mutex_lock(&idev->igroup->lock);
388 	list_del(&idev->group_item);
389 	if (list_empty(&idev->igroup->device_list)) {
390 		iommu_detach_group(hwpt->domain, idev->igroup->group);
391 		idev->igroup->hwpt = NULL;
392 	}
393 	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
394 	mutex_unlock(&idev->igroup->lock);
395 
396 	/* Caller must destroy hwpt */
397 	return hwpt;
398 }
399 
400 static struct iommufd_hw_pagetable *
401 iommufd_device_do_attach(struct iommufd_device *idev,
402 			 struct iommufd_hw_pagetable *hwpt)
403 {
404 	int rc;
405 
406 	rc = iommufd_hw_pagetable_attach(hwpt, idev);
407 	if (rc)
408 		return ERR_PTR(rc);
409 	return NULL;
410 }
411 
412 static struct iommufd_hw_pagetable *
413 iommufd_device_do_replace(struct iommufd_device *idev,
414 			  struct iommufd_hw_pagetable *hwpt)
415 {
416 	struct iommufd_group *igroup = idev->igroup;
417 	struct iommufd_hw_pagetable *old_hwpt;
418 	unsigned int num_devices = 0;
419 	struct iommufd_device *cur;
420 	int rc;
421 
422 	mutex_lock(&idev->igroup->lock);
423 
424 	if (igroup->hwpt == NULL) {
425 		rc = -EINVAL;
426 		goto err_unlock;
427 	}
428 
429 	if (hwpt == igroup->hwpt) {
430 		mutex_unlock(&idev->igroup->lock);
431 		return NULL;
432 	}
433 
434 	/* Try to upgrade the domain we have */
435 	list_for_each_entry(cur, &igroup->device_list, group_item) {
436 		num_devices++;
437 		if (cur->enforce_cache_coherency) {
438 			rc = iommufd_hw_pagetable_enforce_cc(hwpt);
439 			if (rc)
440 				goto err_unlock;
441 		}
442 	}
443 
444 	old_hwpt = igroup->hwpt;
445 	if (hwpt->ioas != old_hwpt->ioas) {
446 		list_for_each_entry(cur, &igroup->device_list, group_item) {
447 			rc = iopt_table_enforce_dev_resv_regions(
448 				&hwpt->ioas->iopt, cur->dev, NULL);
449 			if (rc)
450 				goto err_unresv;
451 		}
452 	}
453 
454 	rc = iommufd_group_setup_msi(idev->igroup, hwpt);
455 	if (rc)
456 		goto err_unresv;
457 
458 	rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
459 	if (rc)
460 		goto err_unresv;
461 
462 	if (hwpt->ioas != old_hwpt->ioas) {
463 		list_for_each_entry(cur, &igroup->device_list, group_item)
464 			iopt_remove_reserved_iova(&old_hwpt->ioas->iopt,
465 						  cur->dev);
466 	}
467 
468 	igroup->hwpt = hwpt;
469 
470 	/*
471 	 * Move the refcounts held by the device_list to the new hwpt. Retain a
472 	 * refcount for this thread as the caller will free it.
473 	 */
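	/*
	 * Worked example: with three devices in the group, the new hwpt gains
	 * three references here, old_hwpt drops two of its three device
	 * references, and the caller drops the last one when it destroys the
	 * returned old_hwpt.
	 */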
474 	refcount_add(num_devices, &hwpt->obj.users);
475 	if (num_devices > 1)
476 		WARN_ON(refcount_sub_and_test(num_devices - 1,
477 					      &old_hwpt->obj.users));
478 	mutex_unlock(&idev->igroup->lock);
479 
480 	/* Caller must destroy old_hwpt */
481 	return old_hwpt;
482 err_unresv:
483 	list_for_each_entry(cur, &igroup->device_list, group_item)
484 		iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev);
485 err_unlock:
486 	mutex_unlock(&idev->igroup->lock);
487 	return ERR_PTR(rc);
488 }
489 
490 typedef struct iommufd_hw_pagetable *(*attach_fn)(
491 	struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt);
492 
493 /*
494  * When automatically managing the domains we search for a compatible domain in
495  * the iopt and if one is found use it, otherwise create a new domain.
496  * Automatic domain selection will never pick a manually created domain.
497  */
498 static struct iommufd_hw_pagetable *
499 iommufd_device_auto_get_domain(struct iommufd_device *idev,
500 			       struct iommufd_ioas *ioas, u32 *pt_id,
501 			       attach_fn do_attach)
502 {
503 	/*
504 	 * iommufd_hw_pagetable_attach() is called by
505 	 * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as
506 	 * iommufd_device_do_attach(). So if we are in this mode then we prefer
507 	 * to use the immediate_attach path as it supports drivers that can't
508 	 * directly allocate a domain.
509 	 */
510 	bool immediate_attach = do_attach == iommufd_device_do_attach;
511 	struct iommufd_hw_pagetable *destroy_hwpt;
512 	struct iommufd_hw_pagetable *hwpt;
513 
514 	/*
515 	 * There is no differentiation when domains are allocated, so any domain
516 	 * that is willing to attach to the device is interchangeable with any
517 	 * other.
518 	 */
519 	mutex_lock(&ioas->mutex);
520 	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
521 		if (!hwpt->auto_domain)
522 			continue;
523 
524 		if (!iommufd_lock_obj(&hwpt->obj))
525 			continue;
526 		destroy_hwpt = (*do_attach)(idev, hwpt);
527 		if (IS_ERR(destroy_hwpt)) {
528 			iommufd_put_object(&hwpt->obj);
529 			/*
530 			 * -EINVAL means the domain is incompatible with the
531 			 * device. Other error codes should propagate to
532 			 * userspace as failure. Success means the domain is
533 			 * attached.
534 			 */
535 			if (PTR_ERR(destroy_hwpt) == -EINVAL)
536 				continue;
537 			goto out_unlock;
538 		}
539 		*pt_id = hwpt->obj.id;
540 		iommufd_put_object(&hwpt->obj);
541 		goto out_unlock;
542 	}
543 
544 	hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev,
545 					  immediate_attach);
546 	if (IS_ERR(hwpt)) {
547 		destroy_hwpt = ERR_CAST(hwpt);
548 		goto out_unlock;
549 	}
550 
551 	if (!immediate_attach) {
552 		destroy_hwpt = (*do_attach)(idev, hwpt);
553 		if (IS_ERR(destroy_hwpt))
554 			goto out_abort;
555 	} else {
556 		destroy_hwpt = NULL;
557 	}
558 
559 	hwpt->auto_domain = true;
560 	*pt_id = hwpt->obj.id;
561 
562 	iommufd_object_finalize(idev->ictx, &hwpt->obj);
563 	mutex_unlock(&ioas->mutex);
564 	return destroy_hwpt;
565 
566 out_abort:
567 	iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
568 out_unlock:
569 	mutex_unlock(&ioas->mutex);
570 	return destroy_hwpt;
571 }
572 
573 static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
574 				    attach_fn do_attach)
575 {
576 	struct iommufd_hw_pagetable *destroy_hwpt;
577 	struct iommufd_object *pt_obj;
578 
579 	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
580 	if (IS_ERR(pt_obj))
581 		return PTR_ERR(pt_obj);
582 
583 	switch (pt_obj->type) {
584 	case IOMMUFD_OBJ_HW_PAGETABLE: {
585 		struct iommufd_hw_pagetable *hwpt =
586 			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
587 
588 		destroy_hwpt = (*do_attach)(idev, hwpt);
589 		if (IS_ERR(destroy_hwpt))
590 			goto out_put_pt_obj;
591 		break;
592 	}
593 	case IOMMUFD_OBJ_IOAS: {
594 		struct iommufd_ioas *ioas =
595 			container_of(pt_obj, struct iommufd_ioas, obj);
596 
597 		destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id,
598 							      do_attach);
599 		if (IS_ERR(destroy_hwpt))
600 			goto out_put_pt_obj;
601 		break;
602 	}
603 	default:
604 		destroy_hwpt = ERR_PTR(-EINVAL);
605 		goto out_put_pt_obj;
606 	}
607 	iommufd_put_object(pt_obj);
608 
609 	/* This destruction has to be after we unlock everything */
610 	if (destroy_hwpt)
611 		iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt);
612 	return 0;
613 
614 out_put_pt_obj:
615 	iommufd_put_object(pt_obj);
616 	return PTR_ERR(destroy_hwpt);
617 }
618 
619 /**
620  * iommufd_device_attach - Connect a device to an iommu_domain
621  * @idev: device to attach
622  * @pt_id: Input an IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
623  *         Output the IOMMUFD_OBJ_HW_PAGETABLE ID
624  *
625  * This connects the device to an iommu_domain, either automatically or manually
626  * selected. Once this completes the device can do DMA.
627  *
628  * The caller should return the resulting pt_id back to userspace.
629  * This function is undone by calling iommufd_device_detach().
630  */
631 int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
632 {
633 	int rc;
634 
635 	rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach);
636 	if (rc)
637 		return rc;
638 
639 	/*
640 	 * Pairs with iommufd_device_detach() - catches caller bugs attempting
641 	 * to destroy a device with an attachment.
642 	 */
643 	refcount_inc(&idev->obj.users);
644 	return 0;
645 }
646 EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
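
/*
 * Example (sketch only, not compiled): attaching a bound idev to an IOAS id
 * received from userspace. On success pt_id is overwritten with the id of the
 * IOMMUFD_OBJ_HW_PAGETABLE now in use; "ioas_id" is an assumed input.
 *
 *	u32 pt_id = ioas_id;
 *	int rc;
 *
 *	rc = iommufd_device_attach(idev, &pt_id);
 *	if (rc)
 *		return rc;
 *	...
 *	iommufd_device_detach(idev);
 */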
647 
648 /**
649  * iommufd_device_replace - Change the device's iommu_domain
650  * @idev: device to change
651  * @pt_id: Input an IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
652  *         Output the IOMMUFD_OBJ_HW_PAGETABLE ID
653  *
654  * This is the same as::
655  *
656  *   iommufd_device_detach();
657  *   iommufd_device_attach();
658  *
659  * If it fails then no change is made to the attachment. The iommu driver may
660  * implement this so there is no disruption in translation. This can only be
661  * called if iommufd_device_attach() has already succeeded.
662  */
663 int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id)
664 {
665 	return iommufd_device_change_pt(idev, pt_id,
666 					&iommufd_device_do_replace);
667 }
668 EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD);
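
/*
 * Example (sketch only, not compiled): switching an already attached idev to
 * another IOAS or HWPT without dropping to blocked DMA in between. "new_id" is
 * an assumed input; on success pt_id holds the HW_PAGETABLE id now in use.
 *
 *	u32 pt_id = new_id;
 *
 *	rc = iommufd_device_replace(idev, &pt_id);
 *	if (rc)
 *		return rc;
 */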
669 
670 /**
671  * iommufd_device_detach - Disconnect a device from an iommu_domain
672  * @idev: device to detach
673  *
674  * Undo iommufd_device_attach(). This disconnects the idev from the previously
675  * attached pt_id. The device returns to blocked DMA translation.
676  */
677 void iommufd_device_detach(struct iommufd_device *idev)
678 {
679 	struct iommufd_hw_pagetable *hwpt;
680 
681 	hwpt = iommufd_hw_pagetable_detach(idev);
682 	iommufd_hw_pagetable_put(idev->ictx, hwpt);
683 	refcount_dec(&idev->obj.users);
684 }
685 EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
686 
687 /*
688  * On success this takes a refcount on a valid new_ioas and drops the refcount
689  * of a valid cur_ioas (access->ioas). A caller that passed in a valid new_ioas
690  * should call iommufd_put_object() if it did an iommufd_get_object() for it.
691  */
692 static int iommufd_access_change_ioas(struct iommufd_access *access,
693 				      struct iommufd_ioas *new_ioas)
694 {
695 	u32 iopt_access_list_id = access->iopt_access_list_id;
696 	struct iommufd_ioas *cur_ioas = access->ioas;
697 	int rc;
698 
699 	lockdep_assert_held(&access->ioas_lock);
700 
701 	/* We are racing with a concurrent detach, bail */
702 	if (cur_ioas != access->ioas_unpin)
703 		return -EBUSY;
704 
705 	if (cur_ioas == new_ioas)
706 		return 0;
707 
708 	/*
709 	 * Set ioas to NULL to block any further iommufd_access_pin_pages().
710 	 * iommufd_access_unpin_pages() can continue using access->ioas_unpin.
711 	 */
712 	access->ioas = NULL;
713 
714 	if (new_ioas) {
715 		rc = iopt_add_access(&new_ioas->iopt, access);
716 		if (rc) {
717 			access->ioas = cur_ioas;
718 			return rc;
719 		}
720 		refcount_inc(&new_ioas->obj.users);
721 	}
722 
723 	if (cur_ioas) {
724 		if (access->ops->unmap) {
725 			mutex_unlock(&access->ioas_lock);
726 			access->ops->unmap(access->data, 0, ULONG_MAX);
727 			mutex_lock(&access->ioas_lock);
728 		}
729 		iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id);
730 		refcount_dec(&cur_ioas->obj.users);
731 	}
732 
733 	access->ioas = new_ioas;
734 	access->ioas_unpin = new_ioas;
735 
736 	return 0;
737 }
738 
739 static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id)
740 {
741 	struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id);
742 	int rc;
743 
744 	if (IS_ERR(ioas))
745 		return PTR_ERR(ioas);
746 	rc = iommufd_access_change_ioas(access, ioas);
747 	iommufd_put_object(&ioas->obj);
748 	return rc;
749 }
750 
751 void iommufd_access_destroy_object(struct iommufd_object *obj)
752 {
753 	struct iommufd_access *access =
754 		container_of(obj, struct iommufd_access, obj);
755 
756 	mutex_lock(&access->ioas_lock);
757 	if (access->ioas)
758 		WARN_ON(iommufd_access_change_ioas(access, NULL));
759 	mutex_unlock(&access->ioas_lock);
760 	iommufd_ctx_put(access->ictx);
761 }
762 
763 /**
764  * iommufd_access_create - Create an iommufd_access
765  * @ictx: iommufd file descriptor
766  * @ops: Driver's ops to associate with the access
767  * @data: Opaque data to pass into ops functions
768  * @id: Output ID number to return to userspace for this access
769  *
770  * An iommufd_access allows a driver to read/write to the IOAS without using
771  * DMA. The underlying CPU memory can be accessed using the
772  * iommufd_access_pin_pages() or iommufd_access_rw() functions.
773  *
774  * The provided ops are required to use iommufd_access_pin_pages().
775  */
776 struct iommufd_access *
777 iommufd_access_create(struct iommufd_ctx *ictx,
778 		      const struct iommufd_access_ops *ops, void *data, u32 *id)
779 {
780 	struct iommufd_access *access;
781 
782 	/*
783 	 * There is no uAPI for the access object, but to keep things symmetric
784 	 * use the object infrastructure anyhow.
785 	 */
786 	access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
787 	if (IS_ERR(access))
788 		return access;
789 
790 	access->data = data;
791 	access->ops = ops;
792 
793 	if (ops->needs_pin_pages)
794 		access->iova_alignment = PAGE_SIZE;
795 	else
796 		access->iova_alignment = 1;
797 
798 	/* The calling driver is a user until iommufd_access_destroy() */
799 	refcount_inc(&access->obj.users);
800 	access->ictx = ictx;
801 	iommufd_ctx_get(ictx);
802 	iommufd_object_finalize(ictx, &access->obj);
803 	*id = access->obj.id;
804 	mutex_init(&access->ioas_lock);
805 	return access;
806 }
807 EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD);
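
/*
 * Example (sketch only, not compiled): creating an access for a driver that
 * pins pages. The callback, ops layout and data cookie shown are assumptions
 * based only on how the ops are used in this file.
 *
 *	static void my_unmap(void *data, unsigned long iova, unsigned long length)
 *	{
 *		... unpin anything previously pinned inside iova/length ...
 *	}
 *
 *	static const struct iommufd_access_ops my_ops = {
 *		.needs_pin_pages = 1,
 *		.unmap = my_unmap,
 *	};
 *
 *	access = iommufd_access_create(ictx, &my_ops, my_data, &access_id);
 *	if (IS_ERR(access))
 *		return PTR_ERR(access);
 *	rc = iommufd_access_attach(access, ioas_id);
 */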
808 
809 /**
810  * iommufd_access_destroy - Destroy an iommufd_access
811  * @access: The access to destroy
812  *
813  * The caller must stop using the access before destroying it.
814  */
815 void iommufd_access_destroy(struct iommufd_access *access)
816 {
817 	bool was_destroyed;
818 
819 	was_destroyed = iommufd_object_destroy_user(access->ictx, &access->obj);
820 	WARN_ON(!was_destroyed);
821 }
822 EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD);
823 
824 void iommufd_access_detach(struct iommufd_access *access)
825 {
826 	mutex_lock(&access->ioas_lock);
827 	if (WARN_ON(!access->ioas)) {
828 		mutex_unlock(&access->ioas_lock);
829 		return;
830 	}
831 	WARN_ON(iommufd_access_change_ioas(access, NULL));
832 	mutex_unlock(&access->ioas_lock);
833 }
834 EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD);
835 
836 int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
837 {
838 	int rc;
839 
840 	mutex_lock(&access->ioas_lock);
841 	if (WARN_ON(access->ioas)) {
842 		mutex_unlock(&access->ioas_lock);
843 		return -EINVAL;
844 	}
845 
846 	rc = iommufd_access_change_ioas_id(access, ioas_id);
847 	mutex_unlock(&access->ioas_lock);
848 	return rc;
849 }
850 EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD);
851 
852 /**
853  * iommufd_access_notify_unmap - Notify users of an iopt to stop using it
854  * @iopt: iopt to work on
855  * @iova: Starting iova in the iopt
856  * @length: Number of bytes
857  *
858  * After this function returns there should be no users attached to the pages
859  * linked to this iopt that intersect with iova,length. Anyone that has attached
860  * a user through iommufd_access_pin_pages() needs to detach it through
861  * iommufd_access_unpin_pages() before this function returns.
862  *
863  * iommufd_access_destroy() will wait for any outstanding unmap callback to
864  * complete. Once iommufd_access_destroy() returns, no unmap ops are running or
865  * will run in the future. Due to this a driver must not create locking that
866  * prevents unmap from completing while iommufd_access_destroy() is running.
867  */
868 void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
869 				 unsigned long length)
870 {
871 	struct iommufd_ioas *ioas =
872 		container_of(iopt, struct iommufd_ioas, iopt);
873 	struct iommufd_access *access;
874 	unsigned long index;
875 
876 	xa_lock(&ioas->iopt.access_list);
877 	xa_for_each(&ioas->iopt.access_list, index, access) {
878 		if (!iommufd_lock_obj(&access->obj))
879 			continue;
880 		xa_unlock(&ioas->iopt.access_list);
881 
882 		access->ops->unmap(access->data, iova, length);
883 
884 		iommufd_put_object(&access->obj);
885 		xa_lock(&ioas->iopt.access_list);
886 	}
887 	xa_unlock(&ioas->iopt.access_list);
888 }
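
/*
 * Note: the unmap op is invoked above with the access_list lock dropped, so a
 * driver callback may itself call iommufd_access_unpin_pages() for whatever it
 * still has pinned inside the notified range.
 */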
889 
890 /**
891  * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages
892  * @access: IOAS access to act on
893  * @iova: Starting IOVA
894  * @length: Number of bytes to access
895  *
896  * Undo the pinning done by iommufd_access_pin_pages(). The caller must stop
897  * accessing the pages first; the iova/length must exactly match the pinned range.
898  */
899 void iommufd_access_unpin_pages(struct iommufd_access *access,
900 				unsigned long iova, unsigned long length)
901 {
902 	struct iopt_area_contig_iter iter;
903 	struct io_pagetable *iopt;
904 	unsigned long last_iova;
905 	struct iopt_area *area;
906 
907 	if (WARN_ON(!length) ||
908 	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
909 		return;
910 
911 	mutex_lock(&access->ioas_lock);
912 	/*
913 	 * The driver must be doing something wrong if it calls this before an
914 	 * iommufd_access_attach() or after an iommufd_access_detach().
915 	 */
916 	if (WARN_ON(!access->ioas_unpin)) {
917 		mutex_unlock(&access->ioas_lock);
918 		return;
919 	}
920 	iopt = &access->ioas_unpin->iopt;
921 
922 	down_read(&iopt->iova_rwsem);
923 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
924 		iopt_area_remove_access(
925 			area, iopt_area_iova_to_index(area, iter.cur_iova),
926 			iopt_area_iova_to_index(
927 				area,
928 				min(last_iova, iopt_area_last_iova(area))));
929 	WARN_ON(!iopt_area_contig_done(&iter));
930 	up_read(&iopt->iova_rwsem);
931 	mutex_unlock(&access->ioas_lock);
932 }
933 EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD);
934 
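/*
 * The pinned byte range must start page aligned within the area, and unless
 * this is the last area covering the request it must also end on a page
 * boundary so the following area can continue the page list.
 */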
935 static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
936 {
937 	if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
938 		return false;
939 
940 	if (!iopt_area_contig_done(iter) &&
941 	    (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
942 	     PAGE_SIZE) != (PAGE_SIZE - 1))
943 		return false;
944 	return true;
945 }
946 
947 static bool check_area_prot(struct iopt_area *area, unsigned int flags)
948 {
949 	if (flags & IOMMUFD_ACCESS_RW_WRITE)
950 		return area->iommu_prot & IOMMU_WRITE;
951 	return area->iommu_prot & IOMMU_READ;
952 }
953 
954 /**
955  * iommufd_access_pin_pages() - Return a list of pages under the iova
956  * @access: IOAS access to act on
957  * @iova: Starting IOVA
958  * @length: Number of bytes to access
959  * @out_pages: Output page list
960  * @flags: IOMMUFD_ACCESS_RW_* flags
961  *
962  * Returns the struct page * pointers covering @length bytes starting at @iova.
963  * These can be kmap'd by the caller for CPU access.
964  *
965  * The caller must perform iommufd_access_unpin_pages() when done to balance
966  * this.
967  *
968  * This API always requires a page aligned iova. This happens naturally if the
969  * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However
970  * smaller alignments have corner cases where this API can fail on otherwise
971  * aligned iova.
972  */
973 int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
974 			     unsigned long length, struct page **out_pages,
975 			     unsigned int flags)
976 {
977 	struct iopt_area_contig_iter iter;
978 	struct io_pagetable *iopt;
979 	unsigned long last_iova;
980 	struct iopt_area *area;
981 	int rc;
982 
983 	/* Driver's ops don't support pin_pages */
984 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
985 	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
986 		return -EINVAL;
987 
988 	if (!length)
989 		return -EINVAL;
990 	if (check_add_overflow(iova, length - 1, &last_iova))
991 		return -EOVERFLOW;
992 
993 	mutex_lock(&access->ioas_lock);
994 	if (!access->ioas) {
995 		mutex_unlock(&access->ioas_lock);
996 		return -ENOENT;
997 	}
998 	iopt = &access->ioas->iopt;
999 
1000 	down_read(&iopt->iova_rwsem);
1001 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
1002 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
1003 		unsigned long last_index = iopt_area_iova_to_index(area, last);
1004 		unsigned long index =
1005 			iopt_area_iova_to_index(area, iter.cur_iova);
1006 
1007 		if (area->prevent_access ||
1008 		    !iopt_area_contig_is_aligned(&iter)) {
1009 			rc = -EINVAL;
1010 			goto err_remove;
1011 		}
1012 
1013 		if (!check_area_prot(area, flags)) {
1014 			rc = -EPERM;
1015 			goto err_remove;
1016 		}
1017 
1018 		rc = iopt_area_add_access(area, index, last_index, out_pages,
1019 					  flags);
1020 		if (rc)
1021 			goto err_remove;
1022 		out_pages += last_index - index + 1;
1023 	}
1024 	if (!iopt_area_contig_done(&iter)) {
1025 		rc = -ENOENT;
1026 		goto err_remove;
1027 	}
1028 
1029 	up_read(&iopt->iova_rwsem);
1030 	mutex_unlock(&access->ioas_lock);
1031 	return 0;
1032 
1033 err_remove:
1034 	if (iova < iter.cur_iova) {
1035 		last_iova = iter.cur_iova - 1;
1036 		iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
1037 			iopt_area_remove_access(
1038 				area,
1039 				iopt_area_iova_to_index(area, iter.cur_iova),
1040 				iopt_area_iova_to_index(
1041 					area, min(last_iova,
1042 						  iopt_area_last_iova(area))));
1043 	}
1044 	up_read(&iopt->iova_rwsem);
1045 	mutex_unlock(&access->ioas_lock);
1046 	return rc;
1047 }
1048 EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD);
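
/*
 * Example (sketch only, not compiled): pinning a page aligned iova/length and
 * releasing it with an exactly matching unpin. Allocating the page pointer
 * array is the caller's responsibility.
 *
 *	unsigned long npages = length / PAGE_SIZE;
 *	struct page **pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 *
 *	if (!pages)
 *		return -ENOMEM;
 *	rc = iommufd_access_pin_pages(access, iova, length, pages,
 *				      IOMMUFD_ACCESS_RW_WRITE);
 *	if (!rc) {
 *		...
 *		iommufd_access_unpin_pages(access, iova, length);
 *	}
 *	kfree(pages);
 */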
1049 
1050 /**
1051  * iommufd_access_rw - Read or write data under the iova
1052  * @access: IOAS access to act on
1053  * @iova: Starting IOVA
1054  * @data: Kernel buffer to copy to/from
1055  * @length: Number of bytes to access
1056  * @flags: IOMMUFD_ACCESS_RW_* flags
1057  *
1058  * Copy between the kernel buffer @data and the IOVA range iova/length. If flags
1059  * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized
1060  * by changing it into copy_to/from_user().
1061  */
1062 int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
1063 		      void *data, size_t length, unsigned int flags)
1064 {
1065 	struct iopt_area_contig_iter iter;
1066 	struct io_pagetable *iopt;
1067 	struct iopt_area *area;
1068 	unsigned long last_iova;
1069 	int rc;
1070 
1071 	if (!length)
1072 		return -EINVAL;
1073 	if (check_add_overflow(iova, length - 1, &last_iova))
1074 		return -EOVERFLOW;
1075 
1076 	mutex_lock(&access->ioas_lock);
1077 	if (!access->ioas) {
1078 		mutex_unlock(&access->ioas_lock);
1079 		return -ENOENT;
1080 	}
1081 	iopt = &access->ioas->iopt;
1082 
1083 	down_read(&iopt->iova_rwsem);
1084 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
1085 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
1086 		unsigned long bytes = (last - iter.cur_iova) + 1;
1087 
1088 		if (area->prevent_access) {
1089 			rc = -EINVAL;
1090 			goto err_out;
1091 		}
1092 
1093 		if (!check_area_prot(area, flags)) {
1094 			rc = -EPERM;
1095 			goto err_out;
1096 		}
1097 
1098 		rc = iopt_pages_rw_access(
1099 			area->pages, iopt_area_start_byte(area, iter.cur_iova),
1100 			data, bytes, flags);
1101 		if (rc)
1102 			goto err_out;
1103 		data += bytes;
1104 	}
1105 	if (!iopt_area_contig_done(&iter))
1106 		rc = -ENOENT;
1107 err_out:
1108 	up_read(&iopt->iova_rwsem);
1109 	mutex_unlock(&access->ioas_lock);
1110 	return rc;
1111 }
1112 EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD);
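
/*
 * Example (sketch only, not compiled): copying a small structure out of the
 * IOAS from kernel thread context, where a large copy may be optimized into
 * copy_to/from_user() style accesses.
 *
 *	rc = iommufd_access_rw(access, iova, &buf, sizeof(buf),
 *			       IOMMUFD_ACCESS_RW_KTHREAD);
 */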
1113