xref: /openbmc/linux/drivers/iommu/iommufd/io_pagetable.c (revision e65e175b07bef5974045cc42238de99057669ca7)
1 // SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top-level data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and xarray.
 */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18 
19 #include "io_pagetable.h"
20 #include "double_span.h"
21 
22 struct iopt_pages_list {
23 	struct iopt_pages *pages;
24 	struct iopt_area *area;
25 	struct list_head next;
26 	unsigned long start_byte;
27 	unsigned long length;
28 };
29 
30 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
31 					struct io_pagetable *iopt,
32 					unsigned long iova,
33 					unsigned long last_iova)
34 {
35 	lockdep_assert_held(&iopt->iova_rwsem);
36 
37 	iter->cur_iova = iova;
38 	iter->last_iova = last_iova;
39 	iter->area = iopt_area_iter_first(iopt, iova, iova);
40 	if (!iter->area)
41 		return NULL;
42 	if (!iter->area->pages) {
43 		iter->area = NULL;
44 		return NULL;
45 	}
46 	return iter->area;
47 }
48 
49 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
50 {
51 	unsigned long last_iova;
52 
53 	if (!iter->area)
54 		return NULL;
55 	last_iova = iopt_area_last_iova(iter->area);
56 	if (iter->last_iova <= last_iova)
57 		return NULL;
58 
59 	iter->cur_iova = last_iova + 1;
60 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
61 					 iter->last_iova);
62 	if (!iter->area)
63 		return NULL;
64 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
65 	    !iter->area->pages) {
66 		iter->area = NULL;
67 		return NULL;
68 	}
69 	return iter->area;
70 }
71 
72 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
73 				    unsigned long length,
74 				    unsigned long iova_alignment,
75 				    unsigned long page_offset)
76 {
77 	if (span->is_used || span->last_hole - span->start_hole < length - 1)
78 		return false;
79 
80 	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
81 			   page_offset;
82 	if (span->start_hole > span->last_hole ||
83 	    span->last_hole - span->start_hole < length - 1)
84 		return false;
85 	return true;
86 }
87 
88 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
89 				    unsigned long length,
90 				    unsigned long iova_alignment,
91 				    unsigned long page_offset)
92 {
93 	if (span->is_hole || span->last_used - span->start_used < length - 1)
94 		return false;
95 
96 	span->start_used = ALIGN(span->start_used, iova_alignment) |
97 			   page_offset;
98 	if (span->start_used > span->last_used ||
99 	    span->last_used - span->start_used < length - 1)
100 		return false;
101 	return true;
102 }
103 
104 /*
105  * Automatically find a block of IOVA that is not being used and not reserved.
106  * Does not return a 0 IOVA even if it is valid.
107  */
108 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
109 			   unsigned long uptr, unsigned long length)
110 {
111 	unsigned long page_offset = uptr % PAGE_SIZE;
112 	struct interval_tree_double_span_iter used_span;
113 	struct interval_tree_span_iter allowed_span;
114 	unsigned long iova_alignment;
115 
116 	lockdep_assert_held(&iopt->iova_rwsem);
117 
118 	/* Protect roundup_pow-of_two() from overflow */
119 	if (length == 0 || length >= ULONG_MAX / 2)
120 		return -EOVERFLOW;
121 
122 	/*
123 	 * Keep alignment present in the uptr when building the IOVA, this
124 	 * increases the chance we can map a THP.
125 	 */
126 	if (!uptr)
127 		iova_alignment = roundup_pow_of_two(length);
128 	else
129 		iova_alignment = min_t(unsigned long,
130 				       roundup_pow_of_two(length),
131 				       1UL << __ffs64(uptr));
132 
133 	if (iova_alignment < iopt->iova_alignment)
134 		return -EINVAL;
135 
136 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
137 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
138 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
139 			allowed_span.start_used = PAGE_SIZE;
140 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
141 			allowed_span.is_hole = false;
142 		}
143 
144 		if (!__alloc_iova_check_used(&allowed_span, length,
145 					     iova_alignment, page_offset))
146 			continue;
147 
148 		interval_tree_for_each_double_span(
149 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
150 			allowed_span.start_used, allowed_span.last_used) {
151 			if (!__alloc_iova_check_hole(&used_span, length,
152 						     iova_alignment,
153 						     page_offset))
154 				continue;
155 
156 			*iova = used_span.start_hole;
157 			return 0;
158 		}
159 	}
160 	return -ENOSPC;
161 }
162 
163 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
164 			   unsigned long length)
165 {
166 	unsigned long last;
167 
168 	lockdep_assert_held(&iopt->iova_rwsem);
169 
170 	if ((iova & (iopt->iova_alignment - 1)))
171 		return -EINVAL;
172 
173 	if (check_add_overflow(iova, length - 1, &last))
174 		return -EOVERFLOW;
175 
176 	/* No reserved IOVA intersects the range */
177 	if (iopt_reserved_iter_first(iopt, iova, last))
178 		return -EINVAL;
179 
180 	/* Check that there is not already a mapping in the range */
181 	if (iopt_area_iter_first(iopt, iova, last))
182 		return -EEXIST;
183 	return 0;
184 }
185 
186 /*
187  * The area takes a slice of the pages from start_bytes to start_byte + length
188  */
189 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
190 			    struct iopt_pages *pages, unsigned long iova,
191 			    unsigned long start_byte, unsigned long length,
192 			    int iommu_prot)
193 {
194 	lockdep_assert_held_write(&iopt->iova_rwsem);
195 
196 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
197 		return -EPERM;
198 
199 	area->iommu_prot = iommu_prot;
200 	area->page_offset = start_byte % PAGE_SIZE;
201 	if (area->page_offset & (iopt->iova_alignment - 1))
202 		return -EINVAL;
203 
204 	area->node.start = iova;
205 	if (check_add_overflow(iova, length - 1, &area->node.last))
206 		return -EOVERFLOW;
207 
208 	area->pages_node.start = start_byte / PAGE_SIZE;
209 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
210 		return -EOVERFLOW;
211 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
212 	if (WARN_ON(area->pages_node.last >= pages->npages))
213 		return -EOVERFLOW;
214 
215 	/*
216 	 * The area is inserted with a NULL pages indicating it is not fully
217 	 * initialized yet.
218 	 */
219 	area->iopt = iopt;
220 	interval_tree_insert(&area->node, &iopt->area_itree);
221 	return 0;
222 }
223 
224 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
225 				 struct list_head *pages_list,
226 				 unsigned long length, unsigned long *dst_iova,
227 				 int iommu_prot, unsigned int flags)
228 {
229 	struct iopt_pages_list *elm;
230 	unsigned long iova;
231 	int rc = 0;
232 
233 	list_for_each_entry(elm, pages_list, next) {
234 		elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
235 		if (!elm->area)
236 			return -ENOMEM;
237 	}
238 
239 	down_write(&iopt->iova_rwsem);
240 	if ((length & (iopt->iova_alignment - 1)) || !length) {
241 		rc = -EINVAL;
242 		goto out_unlock;
243 	}
244 
245 	if (flags & IOPT_ALLOC_IOVA) {
246 		/* Use the first entry to guess the ideal IOVA alignment */
247 		elm = list_first_entry(pages_list, struct iopt_pages_list,
248 				       next);
249 		rc = iopt_alloc_iova(
250 			iopt, dst_iova,
251 			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
252 		if (rc)
253 			goto out_unlock;
254 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
255 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
256 			rc = -EINVAL;
257 			goto out_unlock;
258 		}
259 	} else {
260 		rc = iopt_check_iova(iopt, *dst_iova, length);
261 		if (rc)
262 			goto out_unlock;
263 	}
264 
265 	/*
266 	 * Areas are created with a NULL pages so that the IOVA space is
267 	 * reserved and we can unlock the iova_rwsem.
268 	 */
269 	iova = *dst_iova;
270 	list_for_each_entry(elm, pages_list, next) {
271 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
272 				      elm->start_byte, elm->length, iommu_prot);
273 		if (rc)
274 			goto out_unlock;
275 		iova += elm->length;
276 	}
277 
278 out_unlock:
279 	up_write(&iopt->iova_rwsem);
280 	return rc;
281 }
282 
283 static void iopt_abort_area(struct iopt_area *area)
284 {
285 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
286 		WARN_ON(area->pages);
287 	if (area->iopt) {
288 		down_write(&area->iopt->iova_rwsem);
289 		interval_tree_remove(&area->node, &area->iopt->area_itree);
290 		up_write(&area->iopt->iova_rwsem);
291 	}
292 	kfree(area);
293 }
294 
295 void iopt_free_pages_list(struct list_head *pages_list)
296 {
297 	struct iopt_pages_list *elm;
298 
299 	while ((elm = list_first_entry_or_null(pages_list,
300 					       struct iopt_pages_list, next))) {
301 		if (elm->area)
302 			iopt_abort_area(elm->area);
303 		if (elm->pages)
304 			iopt_put_pages(elm->pages);
305 		list_del(&elm->next);
306 		kfree(elm);
307 	}
308 }
309 
310 static int iopt_fill_domains_pages(struct list_head *pages_list)
311 {
312 	struct iopt_pages_list *undo_elm;
313 	struct iopt_pages_list *elm;
314 	int rc;
315 
316 	list_for_each_entry(elm, pages_list, next) {
317 		rc = iopt_area_fill_domains(elm->area, elm->pages);
318 		if (rc)
319 			goto err_undo;
320 	}
321 	return 0;
322 
323 err_undo:
324 	list_for_each_entry(undo_elm, pages_list, next) {
325 		if (undo_elm == elm)
326 			break;
327 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
328 	}
329 	return rc;
330 }
331 
332 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
333 		   unsigned long length, unsigned long *dst_iova,
334 		   int iommu_prot, unsigned int flags)
335 {
336 	struct iopt_pages_list *elm;
337 	int rc;
338 
339 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
340 				   iommu_prot, flags);
341 	if (rc)
342 		return rc;
343 
344 	down_read(&iopt->domains_rwsem);
345 	rc = iopt_fill_domains_pages(pages_list);
346 	if (rc)
347 		goto out_unlock_domains;
348 
349 	down_write(&iopt->iova_rwsem);
350 	list_for_each_entry(elm, pages_list, next) {
351 		/*
352 		 * area->pages must be set inside the domains_rwsem to ensure
353 		 * any newly added domains will get filled. Moves the reference
354 		 * in from the list.
355 		 */
356 		elm->area->pages = elm->pages;
357 		elm->pages = NULL;
358 		elm->area = NULL;
359 	}
360 	up_write(&iopt->iova_rwsem);
361 out_unlock_domains:
362 	up_read(&iopt->domains_rwsem);
363 	return rc;
364 }
365 
366 /**
367  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
368  * @ictx: iommufd_ctx the iopt is part of
369  * @iopt: io_pagetable to act on
370  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
371  *        the chosen iova on output. Otherwise is the iova to map to on input
372  * @uptr: User VA to map
373  * @length: Number of bytes to map
374  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
375  * @flags: IOPT_ALLOC_IOVA or zero
376  *
377  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
378  * page tables this will pin the pages and load them into the domain at iova.
379  * For non-domain page tables this will only setup a lazy reference and the
380  * caller must use iopt_access_pages() to touch them.
381  *
382  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
383  * destroyed.
384  */
385 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
386 			unsigned long *iova, void __user *uptr,
387 			unsigned long length, int iommu_prot,
388 			unsigned int flags)
389 {
390 	struct iopt_pages_list elm = {};
391 	LIST_HEAD(pages_list);
392 	int rc;
393 
394 	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
395 	if (IS_ERR(elm.pages))
396 		return PTR_ERR(elm.pages);
397 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
398 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
399 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
400 	elm.start_byte = uptr - elm.pages->uptr;
401 	elm.length = length;
402 	list_add(&elm.next, &pages_list);
403 
404 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
405 	if (rc) {
406 		if (elm.area)
407 			iopt_abort_area(elm.area);
408 		if (elm.pages)
409 			iopt_put_pages(elm.pages);
410 		return rc;
411 	}
412 	return 0;
413 }
414 
415 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
416 		   unsigned long length, struct list_head *pages_list)
417 {
418 	struct iopt_area_contig_iter iter;
419 	unsigned long last_iova;
420 	struct iopt_area *area;
421 	int rc;
422 
423 	if (!length)
424 		return -EINVAL;
425 	if (check_add_overflow(iova, length - 1, &last_iova))
426 		return -EOVERFLOW;
427 
428 	down_read(&iopt->iova_rwsem);
429 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
430 		struct iopt_pages_list *elm;
431 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
432 
433 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
434 		if (!elm) {
435 			rc = -ENOMEM;
436 			goto err_free;
437 		}
438 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
439 		elm->pages = area->pages;
440 		elm->length = (last - iter.cur_iova) + 1;
441 		kref_get(&elm->pages->kref);
442 		list_add_tail(&elm->next, pages_list);
443 	}
444 	if (!iopt_area_contig_done(&iter)) {
445 		rc = -ENOENT;
446 		goto err_free;
447 	}
448 	up_read(&iopt->iova_rwsem);
449 	return 0;
450 err_free:
451 	up_read(&iopt->iova_rwsem);
452 	iopt_free_pages_list(pages_list);
453 	return rc;
454 }
455 
456 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
457 				 unsigned long last, unsigned long *unmapped)
458 {
459 	struct iopt_area *area;
460 	unsigned long unmapped_bytes = 0;
461 	int rc = -ENOENT;
462 
463 	/*
464 	 * The domains_rwsem must be held in read mode any time any area->pages
465 	 * is NULL. This prevents domain attach/detatch from running
466 	 * concurrently with cleaning up the area.
467 	 */
468 again:
469 	down_read(&iopt->domains_rwsem);
470 	down_write(&iopt->iova_rwsem);
471 	while ((area = iopt_area_iter_first(iopt, start, last))) {
472 		unsigned long area_last = iopt_area_last_iova(area);
473 		unsigned long area_first = iopt_area_iova(area);
474 		struct iopt_pages *pages;
475 
476 		/* Userspace should not race map/unmap's of the same area */
477 		if (!area->pages) {
478 			rc = -EBUSY;
479 			goto out_unlock_iova;
480 		}
481 
482 		if (area_first < start || area_last > last) {
483 			rc = -ENOENT;
484 			goto out_unlock_iova;
485 		}
486 
487 		/*
488 		 * num_accesses writers must hold the iova_rwsem too, so we can
489 		 * safely read it under the write side of the iovam_rwsem
490 		 * without the pages->mutex.
491 		 */
492 		if (area->num_accesses) {
493 			start = area_first;
494 			area->prevent_access = true;
495 			up_write(&iopt->iova_rwsem);
496 			up_read(&iopt->domains_rwsem);
497 			iommufd_access_notify_unmap(iopt, area_first,
498 						    iopt_area_length(area));
499 			if (WARN_ON(READ_ONCE(area->num_accesses)))
500 				return -EDEADLOCK;
501 			goto again;
502 		}
503 
504 		pages = area->pages;
505 		area->pages = NULL;
506 		up_write(&iopt->iova_rwsem);
507 
508 		iopt_area_unfill_domains(area, pages);
509 		iopt_abort_area(area);
510 		iopt_put_pages(pages);
511 
512 		unmapped_bytes += area_last - area_first + 1;
513 
514 		down_write(&iopt->iova_rwsem);
515 	}
516 	if (unmapped_bytes)
517 		rc = 0;
518 
519 out_unlock_iova:
520 	up_write(&iopt->iova_rwsem);
521 	up_read(&iopt->domains_rwsem);
522 	if (unmapped)
523 		*unmapped = unmapped_bytes;
524 	return rc;
525 }
526 
527 /**
528  * iopt_unmap_iova() - Remove a range of iova
529  * @iopt: io_pagetable to act on
530  * @iova: Starting iova to unmap
531  * @length: Number of bytes to unmap
532  * @unmapped: Return number of bytes unmapped
533  *
534  * The requested range must be a superset of existing ranges.
535  * Splitting/truncating IOVA mappings is not allowed.
536  */
537 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
538 		    unsigned long length, unsigned long *unmapped)
539 {
540 	unsigned long iova_last;
541 
542 	if (!length)
543 		return -EINVAL;
544 
545 	if (check_add_overflow(iova, length - 1, &iova_last))
546 		return -EOVERFLOW;
547 
548 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
549 }
550 
551 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
552 {
553 	int rc;
554 
555 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
556 	/* If the IOVAs are empty then unmap all succeeds */
557 	if (rc == -ENOENT)
558 		return 0;
559 	return rc;
560 }
561 
562 /* The caller must always free all the nodes in the allowed_iova rb_root. */
563 int iopt_set_allow_iova(struct io_pagetable *iopt,
564 			struct rb_root_cached *allowed_iova)
565 {
566 	struct iopt_allowed *allowed;
567 
568 	down_write(&iopt->iova_rwsem);
569 	swap(*allowed_iova, iopt->allowed_itree);
570 
571 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
572 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
573 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
574 					     allowed->node.last)) {
575 			swap(*allowed_iova, iopt->allowed_itree);
576 			up_write(&iopt->iova_rwsem);
577 			return -EADDRINUSE;
578 		}
579 	}
580 	up_write(&iopt->iova_rwsem);
581 	return 0;
582 }
583 
584 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
585 		      unsigned long last, void *owner)
586 {
587 	struct iopt_reserved *reserved;
588 
589 	lockdep_assert_held_write(&iopt->iova_rwsem);
590 
591 	if (iopt_area_iter_first(iopt, start, last) ||
592 	    iopt_allowed_iter_first(iopt, start, last))
593 		return -EADDRINUSE;
594 
595 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
596 	if (!reserved)
597 		return -ENOMEM;
598 	reserved->node.start = start;
599 	reserved->node.last = last;
600 	reserved->owner = owner;
601 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
602 	return 0;
603 }
604 
605 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
606 {
607 	struct iopt_reserved *reserved, *next;
608 
609 	lockdep_assert_held_write(&iopt->iova_rwsem);
610 
611 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
612 	     reserved = next) {
613 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
614 
615 		if (reserved->owner == owner) {
616 			interval_tree_remove(&reserved->node,
617 					     &iopt->reserved_itree);
618 			kfree(reserved);
619 		}
620 	}
621 }
622 
623 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
624 {
625 	down_write(&iopt->iova_rwsem);
626 	__iopt_remove_reserved_iova(iopt, owner);
627 	up_write(&iopt->iova_rwsem);
628 }
629 
630 void iopt_init_table(struct io_pagetable *iopt)
631 {
632 	init_rwsem(&iopt->iova_rwsem);
633 	init_rwsem(&iopt->domains_rwsem);
634 	iopt->area_itree = RB_ROOT_CACHED;
635 	iopt->allowed_itree = RB_ROOT_CACHED;
636 	iopt->reserved_itree = RB_ROOT_CACHED;
637 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
638 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
639 
640 	/*
641 	 * iopt's start as SW tables that can use the entire size_t IOVA space
642 	 * due to the use of size_t in the APIs. They have no alignment
643 	 * restriction.
644 	 */
645 	iopt->iova_alignment = 1;
646 }
647 
648 void iopt_destroy_table(struct io_pagetable *iopt)
649 {
650 	struct interval_tree_node *node;
651 
652 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
653 		iopt_remove_reserved_iova(iopt, NULL);
654 
655 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
656 						ULONG_MAX))) {
657 		interval_tree_remove(node, &iopt->allowed_itree);
658 		kfree(container_of(node, struct iopt_allowed, node));
659 	}
660 
661 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
662 	WARN_ON(!xa_empty(&iopt->domains));
663 	WARN_ON(!xa_empty(&iopt->access_list));
664 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
665 }
666 
667 /**
668  * iopt_unfill_domain() - Unfill a domain with PFNs
669  * @iopt: io_pagetable to act on
670  * @domain: domain to unfill
671  *
672  * This is used when removing a domain from the iopt. Every area in the iopt
673  * will be unmapped from the domain. The domain must already be removed from the
674  * domains xarray.
675  */
676 static void iopt_unfill_domain(struct io_pagetable *iopt,
677 			       struct iommu_domain *domain)
678 {
679 	struct iopt_area *area;
680 
681 	lockdep_assert_held(&iopt->iova_rwsem);
682 	lockdep_assert_held_write(&iopt->domains_rwsem);
683 
684 	/*
685 	 * Some other domain is holding all the pfns still, rapidly unmap this
686 	 * domain.
687 	 */
688 	if (iopt->next_domain_id != 0) {
689 		/* Pick an arbitrary remaining domain to act as storage */
690 		struct iommu_domain *storage_domain =
691 			xa_load(&iopt->domains, 0);
692 
693 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
694 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
695 			struct iopt_pages *pages = area->pages;
696 
697 			if (!pages)
698 				continue;
699 
700 			mutex_lock(&pages->mutex);
701 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
702 				WARN_ON(!area->storage_domain);
703 			if (area->storage_domain == domain)
704 				area->storage_domain = storage_domain;
705 			mutex_unlock(&pages->mutex);
706 
707 			iopt_area_unmap_domain(area, domain);
708 		}
709 		return;
710 	}
711 
712 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
713 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
714 		struct iopt_pages *pages = area->pages;
715 
716 		if (!pages)
717 			continue;
718 
719 		mutex_lock(&pages->mutex);
720 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
721 		WARN_ON(area->storage_domain != domain);
722 		area->storage_domain = NULL;
723 		iopt_area_unfill_domain(area, pages, domain);
724 		mutex_unlock(&pages->mutex);
725 	}
726 }
727 
728 /**
729  * iopt_fill_domain() - Fill a domain with PFNs
730  * @iopt: io_pagetable to act on
731  * @domain: domain to fill
732  *
733  * Fill the domain with PFNs from every area in the iopt. On failure the domain
734  * is left unchanged.
735  */
736 static int iopt_fill_domain(struct io_pagetable *iopt,
737 			    struct iommu_domain *domain)
738 {
739 	struct iopt_area *end_area;
740 	struct iopt_area *area;
741 	int rc;
742 
743 	lockdep_assert_held(&iopt->iova_rwsem);
744 	lockdep_assert_held_write(&iopt->domains_rwsem);
745 
746 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
747 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
748 		struct iopt_pages *pages = area->pages;
749 
750 		if (!pages)
751 			continue;
752 
753 		mutex_lock(&pages->mutex);
754 		rc = iopt_area_fill_domain(area, domain);
755 		if (rc) {
756 			mutex_unlock(&pages->mutex);
757 			goto out_unfill;
758 		}
759 		if (!area->storage_domain) {
760 			WARN_ON(iopt->next_domain_id != 0);
761 			area->storage_domain = domain;
762 			interval_tree_insert(&area->pages_node,
763 					     &pages->domains_itree);
764 		}
765 		mutex_unlock(&pages->mutex);
766 	}
767 	return 0;
768 
769 out_unfill:
770 	end_area = area;
771 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
772 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
773 		struct iopt_pages *pages = area->pages;
774 
775 		if (area == end_area)
776 			break;
777 		if (!pages)
778 			continue;
779 		mutex_lock(&pages->mutex);
780 		if (iopt->next_domain_id == 0) {
781 			interval_tree_remove(&area->pages_node,
782 					     &pages->domains_itree);
783 			area->storage_domain = NULL;
784 		}
785 		iopt_area_unfill_domain(area, pages, domain);
786 		mutex_unlock(&pages->mutex);
787 	}
788 	return rc;
789 }
790 
791 /* All existing area's conform to an increased page size */
792 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
793 				     unsigned long new_iova_alignment)
794 {
795 	unsigned long align_mask = new_iova_alignment - 1;
796 	struct iopt_area *area;
797 
798 	lockdep_assert_held(&iopt->iova_rwsem);
799 	lockdep_assert_held(&iopt->domains_rwsem);
800 
801 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
802 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
803 		if ((iopt_area_iova(area) & align_mask) ||
804 		    (iopt_area_length(area) & align_mask) ||
805 		    (area->page_offset & align_mask))
806 			return -EADDRINUSE;
807 
808 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
809 		struct iommufd_access *access;
810 		unsigned long index;
811 
812 		xa_for_each(&iopt->access_list, index, access)
813 			if (WARN_ON(access->iova_alignment >
814 				    new_iova_alignment))
815 				return -EADDRINUSE;
816 	}
817 	return 0;
818 }
819 
820 int iopt_table_add_domain(struct io_pagetable *iopt,
821 			  struct iommu_domain *domain)
822 {
823 	const struct iommu_domain_geometry *geometry = &domain->geometry;
824 	struct iommu_domain *iter_domain;
825 	unsigned int new_iova_alignment;
826 	unsigned long index;
827 	int rc;
828 
829 	down_write(&iopt->domains_rwsem);
830 	down_write(&iopt->iova_rwsem);
831 
832 	xa_for_each(&iopt->domains, index, iter_domain) {
833 		if (WARN_ON(iter_domain == domain)) {
834 			rc = -EEXIST;
835 			goto out_unlock;
836 		}
837 	}
838 
839 	/*
840 	 * The io page size drives the iova_alignment. Internally the iopt_pages
841 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
842 	 * objects into the iommu_domain.
843 	 *
844 	 * A iommu_domain must always be able to accept PAGE_SIZE to be
845 	 * compatible as we can't guarantee higher contiguity.
846 	 */
847 	new_iova_alignment = max_t(unsigned long,
848 				   1UL << __ffs(domain->pgsize_bitmap),
849 				   iopt->iova_alignment);
850 	if (new_iova_alignment > PAGE_SIZE) {
851 		rc = -EINVAL;
852 		goto out_unlock;
853 	}
854 	if (new_iova_alignment != iopt->iova_alignment) {
855 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
856 		if (rc)
857 			goto out_unlock;
858 	}
859 
860 	/* No area exists that is outside the allowed domain aperture */
861 	if (geometry->aperture_start != 0) {
862 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
863 				       domain);
864 		if (rc)
865 			goto out_reserved;
866 	}
867 	if (geometry->aperture_end != ULONG_MAX) {
868 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
869 				       ULONG_MAX, domain);
870 		if (rc)
871 			goto out_reserved;
872 	}
873 
874 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
875 	if (rc)
876 		goto out_reserved;
877 
878 	rc = iopt_fill_domain(iopt, domain);
879 	if (rc)
880 		goto out_release;
881 
882 	iopt->iova_alignment = new_iova_alignment;
883 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
884 	iopt->next_domain_id++;
885 	up_write(&iopt->iova_rwsem);
886 	up_write(&iopt->domains_rwsem);
887 	return 0;
888 out_release:
889 	xa_release(&iopt->domains, iopt->next_domain_id);
890 out_reserved:
891 	__iopt_remove_reserved_iova(iopt, domain);
892 out_unlock:
893 	up_write(&iopt->iova_rwsem);
894 	up_write(&iopt->domains_rwsem);
895 	return rc;
896 }
897 
898 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
899 {
900 	unsigned long new_iova_alignment;
901 	struct iommufd_access *access;
902 	struct iommu_domain *domain;
903 	unsigned long index;
904 
905 	lockdep_assert_held_write(&iopt->iova_rwsem);
906 	lockdep_assert_held(&iopt->domains_rwsem);
907 
908 	/* See batch_iommu_map_small() */
909 	if (iopt->disable_large_pages)
910 		new_iova_alignment = PAGE_SIZE;
911 	else
912 		new_iova_alignment = 1;
913 
914 	xa_for_each(&iopt->domains, index, domain)
915 		new_iova_alignment = max_t(unsigned long,
916 					   1UL << __ffs(domain->pgsize_bitmap),
917 					   new_iova_alignment);
918 	xa_for_each(&iopt->access_list, index, access)
919 		new_iova_alignment = max_t(unsigned long,
920 					   access->iova_alignment,
921 					   new_iova_alignment);
922 
923 	if (new_iova_alignment > iopt->iova_alignment) {
924 		int rc;
925 
926 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
927 		if (rc)
928 			return rc;
929 	}
930 	iopt->iova_alignment = new_iova_alignment;
931 	return 0;
932 }
933 
934 void iopt_table_remove_domain(struct io_pagetable *iopt,
935 			      struct iommu_domain *domain)
936 {
937 	struct iommu_domain *iter_domain = NULL;
938 	unsigned long index;
939 
940 	down_write(&iopt->domains_rwsem);
941 	down_write(&iopt->iova_rwsem);
942 
943 	xa_for_each(&iopt->domains, index, iter_domain)
944 		if (iter_domain == domain)
945 			break;
946 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
947 		goto out_unlock;
948 
949 	/*
950 	 * Compress the xarray to keep it linear by swapping the entry to erase
951 	 * with the tail entry and shrinking the tail.
952 	 */
953 	iopt->next_domain_id--;
954 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
955 	if (index != iopt->next_domain_id)
956 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
957 
958 	iopt_unfill_domain(iopt, domain);
959 	__iopt_remove_reserved_iova(iopt, domain);
960 
961 	WARN_ON(iopt_calculate_iova_alignment(iopt));
962 out_unlock:
963 	up_write(&iopt->iova_rwsem);
964 	up_write(&iopt->domains_rwsem);
965 }
966 
967 /**
968  * iopt_area_split - Split an area into two parts at iova
969  * @area: The area to split
970  * @iova: Becomes the last of a new area
971  *
972  * This splits an area into two. It is part of the VFIO compatibility to allow
973  * poking a hole in the mapping. The two areas continue to point at the same
974  * iopt_pages, just with different starting bytes.
975  */
976 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
977 {
978 	unsigned long alignment = area->iopt->iova_alignment;
979 	unsigned long last_iova = iopt_area_last_iova(area);
980 	unsigned long start_iova = iopt_area_iova(area);
981 	unsigned long new_start = iova + 1;
982 	struct io_pagetable *iopt = area->iopt;
983 	struct iopt_pages *pages = area->pages;
984 	struct iopt_area *lhs;
985 	struct iopt_area *rhs;
986 	int rc;
987 
988 	lockdep_assert_held_write(&iopt->iova_rwsem);
989 
990 	if (iova == start_iova || iova == last_iova)
991 		return 0;
992 
993 	if (!pages || area->prevent_access)
994 		return -EBUSY;
995 
996 	if (new_start & (alignment - 1) ||
997 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
998 		return -EINVAL;
999 
1000 	lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
1001 	if (!lhs)
1002 		return -ENOMEM;
1003 
1004 	rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
1005 	if (!rhs) {
1006 		rc = -ENOMEM;
1007 		goto err_free_lhs;
1008 	}
1009 
1010 	mutex_lock(&pages->mutex);
1011 	/*
1012 	 * Splitting is not permitted if an access exists, we don't track enough
1013 	 * information to split existing accesses.
1014 	 */
1015 	if (area->num_accesses) {
1016 		rc = -EINVAL;
1017 		goto err_unlock;
1018 	}
1019 
1020 	/*
1021 	 * Splitting is not permitted if a domain could have been mapped with
1022 	 * huge pages.
1023 	 */
1024 	if (area->storage_domain && !iopt->disable_large_pages) {
1025 		rc = -EINVAL;
1026 		goto err_unlock;
1027 	}
1028 
1029 	interval_tree_remove(&area->node, &iopt->area_itree);
1030 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1031 			      iopt_area_start_byte(area, start_iova),
1032 			      (new_start - 1) - start_iova + 1,
1033 			      area->iommu_prot);
1034 	if (WARN_ON(rc))
1035 		goto err_insert;
1036 
1037 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1038 			      iopt_area_start_byte(area, new_start),
1039 			      last_iova - new_start + 1, area->iommu_prot);
1040 	if (WARN_ON(rc))
1041 		goto err_remove_lhs;
1042 
1043 	lhs->storage_domain = area->storage_domain;
1044 	lhs->pages = area->pages;
1045 	rhs->storage_domain = area->storage_domain;
1046 	rhs->pages = area->pages;
1047 	kref_get(&rhs->pages->kref);
1048 	kfree(area);
1049 	mutex_unlock(&pages->mutex);
1050 
1051 	/*
1052 	 * No change to domains or accesses because the pages hasn't been
1053 	 * changed
1054 	 */
1055 	return 0;
1056 
1057 err_remove_lhs:
1058 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1059 err_insert:
1060 	interval_tree_insert(&area->node, &iopt->area_itree);
1061 err_unlock:
1062 	mutex_unlock(&pages->mutex);
1063 	kfree(rhs);
1064 err_free_lhs:
1065 	kfree(lhs);
1066 	return rc;
1067 }
1068 
1069 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1070 		  size_t num_iovas)
1071 {
1072 	int rc = 0;
1073 	int i;
1074 
1075 	down_write(&iopt->iova_rwsem);
1076 	for (i = 0; i < num_iovas; i++) {
1077 		struct iopt_area *area;
1078 
1079 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1080 		if (!area)
1081 			continue;
1082 		rc = iopt_area_split(area, iovas[i]);
1083 		if (rc)
1084 			break;
1085 	}
1086 	up_write(&iopt->iova_rwsem);
1087 	return rc;
1088 }
1089 
1090 void iopt_enable_large_pages(struct io_pagetable *iopt)
1091 {
1092 	int rc;
1093 
1094 	down_write(&iopt->domains_rwsem);
1095 	down_write(&iopt->iova_rwsem);
1096 	WRITE_ONCE(iopt->disable_large_pages, false);
1097 	rc = iopt_calculate_iova_alignment(iopt);
1098 	WARN_ON(rc);
1099 	up_write(&iopt->iova_rwsem);
1100 	up_write(&iopt->domains_rwsem);
1101 }
1102 
1103 int iopt_disable_large_pages(struct io_pagetable *iopt)
1104 {
1105 	int rc = 0;
1106 
1107 	down_write(&iopt->domains_rwsem);
1108 	down_write(&iopt->iova_rwsem);
1109 	if (iopt->disable_large_pages)
1110 		goto out_unlock;
1111 
1112 	/* Won't do it if domains already have pages mapped in them */
1113 	if (!xa_empty(&iopt->domains) &&
1114 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1115 		rc = -EINVAL;
1116 		goto out_unlock;
1117 	}
1118 
1119 	WRITE_ONCE(iopt->disable_large_pages, true);
1120 	rc = iopt_calculate_iova_alignment(iopt);
1121 	if (rc)
1122 		WRITE_ONCE(iopt->disable_large_pages, false);
1123 out_unlock:
1124 	up_write(&iopt->iova_rwsem);
1125 	up_write(&iopt->domains_rwsem);
1126 	return rc;
1127 }
1128 
1129 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1130 {
1131 	int rc;
1132 
1133 	down_write(&iopt->domains_rwsem);
1134 	down_write(&iopt->iova_rwsem);
1135 	rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
1136 		      xa_limit_16b, GFP_KERNEL_ACCOUNT);
1137 	if (rc)
1138 		goto out_unlock;
1139 
1140 	rc = iopt_calculate_iova_alignment(iopt);
1141 	if (rc) {
1142 		xa_erase(&iopt->access_list, access->iopt_access_list_id);
1143 		goto out_unlock;
1144 	}
1145 
1146 out_unlock:
1147 	up_write(&iopt->iova_rwsem);
1148 	up_write(&iopt->domains_rwsem);
1149 	return rc;
1150 }
1151 
1152 void iopt_remove_access(struct io_pagetable *iopt,
1153 			struct iommufd_access *access)
1154 {
1155 	down_write(&iopt->domains_rwsem);
1156 	down_write(&iopt->iova_rwsem);
1157 	WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) !=
1158 		access);
1159 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1160 	up_write(&iopt->iova_rwsem);
1161 	up_write(&iopt->domains_rwsem);
1162 }
1163 
1164 /* Narrow the valid_iova_itree to include reserved ranges from a group. */
1165 int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
1166 					  struct device *device,
1167 					  struct iommu_group *group,
1168 					  phys_addr_t *sw_msi_start)
1169 {
1170 	struct iommu_resv_region *resv;
1171 	struct iommu_resv_region *tmp;
1172 	LIST_HEAD(group_resv_regions);
1173 	unsigned int num_hw_msi = 0;
1174 	unsigned int num_sw_msi = 0;
1175 	int rc;
1176 
1177 	down_write(&iopt->iova_rwsem);
1178 	rc = iommu_get_group_resv_regions(group, &group_resv_regions);
1179 	if (rc)
1180 		goto out_unlock;
1181 
1182 	list_for_each_entry(resv, &group_resv_regions, list) {
1183 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1184 			continue;
1185 
1186 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1187 			num_hw_msi++;
1188 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1189 			*sw_msi_start = resv->start;
1190 			num_sw_msi++;
1191 		}
1192 
1193 		rc = iopt_reserve_iova(iopt, resv->start,
1194 				       resv->length - 1 + resv->start, device);
1195 		if (rc)
1196 			goto out_reserved;
1197 	}
1198 
1199 	/* Drivers must offer sane combinations of regions */
1200 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1201 		rc = -EINVAL;
1202 		goto out_reserved;
1203 	}
1204 
1205 	rc = 0;
1206 	goto out_free_resv;
1207 
1208 out_reserved:
1209 	__iopt_remove_reserved_iova(iopt, device);
1210 out_free_resv:
1211 	list_for_each_entry_safe(resv, tmp, &group_resv_regions, list)
1212 		kfree(resv);
1213 out_unlock:
1214 	up_write(&iopt->iova_rwsem);
1215 	return rc;
1216 }
1217