xref: /openbmc/linux/drivers/iommu/iommufd/io_pagetable.c (revision 8ef9ea1503d0a129cc6f5cf48fb63633efa5d766)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The datastructure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and xarray.
10  */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18 
19 #include "io_pagetable.h"
20 #include "double_span.h"
21 
22 struct iopt_pages_list {
23 	struct iopt_pages *pages;
24 	struct iopt_area *area;
25 	struct list_head next;
26 	unsigned long start_byte;
27 	unsigned long length;
28 };
29 
30 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
31 					struct io_pagetable *iopt,
32 					unsigned long iova,
33 					unsigned long last_iova)
34 {
35 	lockdep_assert_held(&iopt->iova_rwsem);
36 
37 	iter->cur_iova = iova;
38 	iter->last_iova = last_iova;
39 	iter->area = iopt_area_iter_first(iopt, iova, iova);
40 	if (!iter->area)
41 		return NULL;
42 	if (!iter->area->pages) {
43 		iter->area = NULL;
44 		return NULL;
45 	}
46 	return iter->area;
47 }
48 
49 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
50 {
51 	unsigned long last_iova;
52 
53 	if (!iter->area)
54 		return NULL;
55 	last_iova = iopt_area_last_iova(iter->area);
56 	if (iter->last_iova <= last_iova)
57 		return NULL;
58 
59 	iter->cur_iova = last_iova + 1;
60 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
61 					 iter->last_iova);
62 	if (!iter->area)
63 		return NULL;
64 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
65 	    !iter->area->pages) {
66 		iter->area = NULL;
67 		return NULL;
68 	}
69 	return iter->area;
70 }
71 
72 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
73 				    unsigned long length,
74 				    unsigned long iova_alignment,
75 				    unsigned long page_offset)
76 {
77 	if (span->is_used || span->last_hole - span->start_hole < length - 1)
78 		return false;
79 
80 	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
81 			   page_offset;
82 	if (span->start_hole > span->last_hole ||
83 	    span->last_hole - span->start_hole < length - 1)
84 		return false;
85 	return true;
86 }
87 
88 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
89 				    unsigned long length,
90 				    unsigned long iova_alignment,
91 				    unsigned long page_offset)
92 {
93 	if (span->is_hole || span->last_used - span->start_used < length - 1)
94 		return false;
95 
96 	span->start_used = ALIGN(span->start_used, iova_alignment) |
97 			   page_offset;
98 	if (span->start_used > span->last_used ||
99 	    span->last_used - span->start_used < length - 1)
100 		return false;
101 	return true;
102 }
103 
104 /*
105  * Automatically find a block of IOVA that is not being used and not reserved.
106  * Does not return a 0 IOVA even if it is valid.
107  */
108 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
109 			   unsigned long uptr, unsigned long length)
110 {
111 	unsigned long page_offset = uptr % PAGE_SIZE;
112 	struct interval_tree_double_span_iter used_span;
113 	struct interval_tree_span_iter allowed_span;
114 	unsigned long max_alignment = PAGE_SIZE;
115 	unsigned long iova_alignment;
116 
117 	lockdep_assert_held(&iopt->iova_rwsem);
118 
119 	/* Protect roundup_pow-of_two() from overflow */
120 	if (length == 0 || length >= ULONG_MAX / 2)
121 		return -EOVERFLOW;
122 
123 	/*
124 	 * Keep alignment present in the uptr when building the IOVA, this
125 	 * increases the chance we can map a THP.
126 	 */
127 	if (!uptr)
128 		iova_alignment = roundup_pow_of_two(length);
129 	else
130 		iova_alignment = min_t(unsigned long,
131 				       roundup_pow_of_two(length),
132 				       1UL << __ffs64(uptr));
133 
134 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
135 	max_alignment = HPAGE_SIZE;
136 #endif
137 	/* Protect against ALIGN() overflow */
138 	if (iova_alignment >= max_alignment)
139 		iova_alignment = max_alignment;
140 
141 	if (iova_alignment < iopt->iova_alignment)
142 		return -EINVAL;
143 
144 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
145 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
146 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
147 			allowed_span.start_used = PAGE_SIZE;
148 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
149 			allowed_span.is_hole = false;
150 		}
151 
152 		if (!__alloc_iova_check_used(&allowed_span, length,
153 					     iova_alignment, page_offset))
154 			continue;
155 
156 		interval_tree_for_each_double_span(
157 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
158 			allowed_span.start_used, allowed_span.last_used) {
159 			if (!__alloc_iova_check_hole(&used_span, length,
160 						     iova_alignment,
161 						     page_offset))
162 				continue;
163 
164 			*iova = used_span.start_hole;
165 			return 0;
166 		}
167 	}
168 	return -ENOSPC;
169 }
170 
171 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
172 			   unsigned long length)
173 {
174 	unsigned long last;
175 
176 	lockdep_assert_held(&iopt->iova_rwsem);
177 
178 	if ((iova & (iopt->iova_alignment - 1)))
179 		return -EINVAL;
180 
181 	if (check_add_overflow(iova, length - 1, &last))
182 		return -EOVERFLOW;
183 
184 	/* No reserved IOVA intersects the range */
185 	if (iopt_reserved_iter_first(iopt, iova, last))
186 		return -EINVAL;
187 
188 	/* Check that there is not already a mapping in the range */
189 	if (iopt_area_iter_first(iopt, iova, last))
190 		return -EEXIST;
191 	return 0;
192 }
193 
194 /*
195  * The area takes a slice of the pages from start_bytes to start_byte + length
196  */
197 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
198 			    struct iopt_pages *pages, unsigned long iova,
199 			    unsigned long start_byte, unsigned long length,
200 			    int iommu_prot)
201 {
202 	lockdep_assert_held_write(&iopt->iova_rwsem);
203 
204 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
205 		return -EPERM;
206 
207 	area->iommu_prot = iommu_prot;
208 	area->page_offset = start_byte % PAGE_SIZE;
209 	if (area->page_offset & (iopt->iova_alignment - 1))
210 		return -EINVAL;
211 
212 	area->node.start = iova;
213 	if (check_add_overflow(iova, length - 1, &area->node.last))
214 		return -EOVERFLOW;
215 
216 	area->pages_node.start = start_byte / PAGE_SIZE;
217 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
218 		return -EOVERFLOW;
219 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
220 	if (WARN_ON(area->pages_node.last >= pages->npages))
221 		return -EOVERFLOW;
222 
223 	/*
224 	 * The area is inserted with a NULL pages indicating it is not fully
225 	 * initialized yet.
226 	 */
227 	area->iopt = iopt;
228 	interval_tree_insert(&area->node, &iopt->area_itree);
229 	return 0;
230 }
231 
232 static struct iopt_area *iopt_area_alloc(void)
233 {
234 	struct iopt_area *area;
235 
236 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
237 	if (!area)
238 		return NULL;
239 	RB_CLEAR_NODE(&area->node.rb);
240 	RB_CLEAR_NODE(&area->pages_node.rb);
241 	return area;
242 }
243 
244 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
245 				 struct list_head *pages_list,
246 				 unsigned long length, unsigned long *dst_iova,
247 				 int iommu_prot, unsigned int flags)
248 {
249 	struct iopt_pages_list *elm;
250 	unsigned long iova;
251 	int rc = 0;
252 
253 	list_for_each_entry(elm, pages_list, next) {
254 		elm->area = iopt_area_alloc();
255 		if (!elm->area)
256 			return -ENOMEM;
257 	}
258 
259 	down_write(&iopt->iova_rwsem);
260 	if ((length & (iopt->iova_alignment - 1)) || !length) {
261 		rc = -EINVAL;
262 		goto out_unlock;
263 	}
264 
265 	if (flags & IOPT_ALLOC_IOVA) {
266 		/* Use the first entry to guess the ideal IOVA alignment */
267 		elm = list_first_entry(pages_list, struct iopt_pages_list,
268 				       next);
269 		rc = iopt_alloc_iova(
270 			iopt, dst_iova,
271 			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
272 		if (rc)
273 			goto out_unlock;
274 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
275 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
276 			rc = -EINVAL;
277 			goto out_unlock;
278 		}
279 	} else {
280 		rc = iopt_check_iova(iopt, *dst_iova, length);
281 		if (rc)
282 			goto out_unlock;
283 	}
284 
285 	/*
286 	 * Areas are created with a NULL pages so that the IOVA space is
287 	 * reserved and we can unlock the iova_rwsem.
288 	 */
289 	iova = *dst_iova;
290 	list_for_each_entry(elm, pages_list, next) {
291 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
292 				      elm->start_byte, elm->length, iommu_prot);
293 		if (rc)
294 			goto out_unlock;
295 		iova += elm->length;
296 	}
297 
298 out_unlock:
299 	up_write(&iopt->iova_rwsem);
300 	return rc;
301 }
302 
303 static void iopt_abort_area(struct iopt_area *area)
304 {
305 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
306 		WARN_ON(area->pages);
307 	if (area->iopt) {
308 		down_write(&area->iopt->iova_rwsem);
309 		interval_tree_remove(&area->node, &area->iopt->area_itree);
310 		up_write(&area->iopt->iova_rwsem);
311 	}
312 	kfree(area);
313 }
314 
315 void iopt_free_pages_list(struct list_head *pages_list)
316 {
317 	struct iopt_pages_list *elm;
318 
319 	while ((elm = list_first_entry_or_null(pages_list,
320 					       struct iopt_pages_list, next))) {
321 		if (elm->area)
322 			iopt_abort_area(elm->area);
323 		if (elm->pages)
324 			iopt_put_pages(elm->pages);
325 		list_del(&elm->next);
326 		kfree(elm);
327 	}
328 }
329 
330 static int iopt_fill_domains_pages(struct list_head *pages_list)
331 {
332 	struct iopt_pages_list *undo_elm;
333 	struct iopt_pages_list *elm;
334 	int rc;
335 
336 	list_for_each_entry(elm, pages_list, next) {
337 		rc = iopt_area_fill_domains(elm->area, elm->pages);
338 		if (rc)
339 			goto err_undo;
340 	}
341 	return 0;
342 
343 err_undo:
344 	list_for_each_entry(undo_elm, pages_list, next) {
345 		if (undo_elm == elm)
346 			break;
347 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
348 	}
349 	return rc;
350 }
351 
352 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
353 		   unsigned long length, unsigned long *dst_iova,
354 		   int iommu_prot, unsigned int flags)
355 {
356 	struct iopt_pages_list *elm;
357 	int rc;
358 
359 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
360 				   iommu_prot, flags);
361 	if (rc)
362 		return rc;
363 
364 	down_read(&iopt->domains_rwsem);
365 	rc = iopt_fill_domains_pages(pages_list);
366 	if (rc)
367 		goto out_unlock_domains;
368 
369 	down_write(&iopt->iova_rwsem);
370 	list_for_each_entry(elm, pages_list, next) {
371 		/*
372 		 * area->pages must be set inside the domains_rwsem to ensure
373 		 * any newly added domains will get filled. Moves the reference
374 		 * in from the list.
375 		 */
376 		elm->area->pages = elm->pages;
377 		elm->pages = NULL;
378 		elm->area = NULL;
379 	}
380 	up_write(&iopt->iova_rwsem);
381 out_unlock_domains:
382 	up_read(&iopt->domains_rwsem);
383 	return rc;
384 }
385 
386 /**
387  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
388  * @ictx: iommufd_ctx the iopt is part of
389  * @iopt: io_pagetable to act on
390  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
391  *        the chosen iova on output. Otherwise is the iova to map to on input
392  * @uptr: User VA to map
393  * @length: Number of bytes to map
394  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
395  * @flags: IOPT_ALLOC_IOVA or zero
396  *
397  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
398  * page tables this will pin the pages and load them into the domain at iova.
399  * For non-domain page tables this will only setup a lazy reference and the
400  * caller must use iopt_access_pages() to touch them.
401  *
402  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
403  * destroyed.
404  */
405 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
406 			unsigned long *iova, void __user *uptr,
407 			unsigned long length, int iommu_prot,
408 			unsigned int flags)
409 {
410 	struct iopt_pages_list elm = {};
411 	LIST_HEAD(pages_list);
412 	int rc;
413 
414 	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
415 	if (IS_ERR(elm.pages))
416 		return PTR_ERR(elm.pages);
417 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
418 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
419 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
420 	elm.start_byte = uptr - elm.pages->uptr;
421 	elm.length = length;
422 	list_add(&elm.next, &pages_list);
423 
424 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
425 	if (rc) {
426 		if (elm.area)
427 			iopt_abort_area(elm.area);
428 		if (elm.pages)
429 			iopt_put_pages(elm.pages);
430 		return rc;
431 	}
432 	return 0;
433 }
434 
435 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
436 		   unsigned long length, struct list_head *pages_list)
437 {
438 	struct iopt_area_contig_iter iter;
439 	unsigned long last_iova;
440 	struct iopt_area *area;
441 	int rc;
442 
443 	if (!length)
444 		return -EINVAL;
445 	if (check_add_overflow(iova, length - 1, &last_iova))
446 		return -EOVERFLOW;
447 
448 	down_read(&iopt->iova_rwsem);
449 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
450 		struct iopt_pages_list *elm;
451 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
452 
453 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
454 		if (!elm) {
455 			rc = -ENOMEM;
456 			goto err_free;
457 		}
458 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
459 		elm->pages = area->pages;
460 		elm->length = (last - iter.cur_iova) + 1;
461 		kref_get(&elm->pages->kref);
462 		list_add_tail(&elm->next, pages_list);
463 	}
464 	if (!iopt_area_contig_done(&iter)) {
465 		rc = -ENOENT;
466 		goto err_free;
467 	}
468 	up_read(&iopt->iova_rwsem);
469 	return 0;
470 err_free:
471 	up_read(&iopt->iova_rwsem);
472 	iopt_free_pages_list(pages_list);
473 	return rc;
474 }
475 
476 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
477 				 unsigned long last, unsigned long *unmapped)
478 {
479 	struct iopt_area *area;
480 	unsigned long unmapped_bytes = 0;
481 	unsigned int tries = 0;
482 	int rc = -ENOENT;
483 
484 	/*
485 	 * The domains_rwsem must be held in read mode any time any area->pages
486 	 * is NULL. This prevents domain attach/detatch from running
487 	 * concurrently with cleaning up the area.
488 	 */
489 again:
490 	down_read(&iopt->domains_rwsem);
491 	down_write(&iopt->iova_rwsem);
492 	while ((area = iopt_area_iter_first(iopt, start, last))) {
493 		unsigned long area_last = iopt_area_last_iova(area);
494 		unsigned long area_first = iopt_area_iova(area);
495 		struct iopt_pages *pages;
496 
497 		/* Userspace should not race map/unmap's of the same area */
498 		if (!area->pages) {
499 			rc = -EBUSY;
500 			goto out_unlock_iova;
501 		}
502 
503 		if (area_first < start || area_last > last) {
504 			rc = -ENOENT;
505 			goto out_unlock_iova;
506 		}
507 
508 		if (area_first != start)
509 			tries = 0;
510 
511 		/*
512 		 * num_accesses writers must hold the iova_rwsem too, so we can
513 		 * safely read it under the write side of the iovam_rwsem
514 		 * without the pages->mutex.
515 		 */
516 		if (area->num_accesses) {
517 			size_t length = iopt_area_length(area);
518 
519 			start = area_first;
520 			area->prevent_access = true;
521 			up_write(&iopt->iova_rwsem);
522 			up_read(&iopt->domains_rwsem);
523 
524 			iommufd_access_notify_unmap(iopt, area_first, length);
525 			/* Something is not responding to unmap requests. */
526 			tries++;
527 			if (WARN_ON(tries > 100))
528 				return -EDEADLOCK;
529 			goto again;
530 		}
531 
532 		pages = area->pages;
533 		area->pages = NULL;
534 		up_write(&iopt->iova_rwsem);
535 
536 		iopt_area_unfill_domains(area, pages);
537 		iopt_abort_area(area);
538 		iopt_put_pages(pages);
539 
540 		unmapped_bytes += area_last - area_first + 1;
541 
542 		down_write(&iopt->iova_rwsem);
543 	}
544 	if (unmapped_bytes)
545 		rc = 0;
546 
547 out_unlock_iova:
548 	up_write(&iopt->iova_rwsem);
549 	up_read(&iopt->domains_rwsem);
550 	if (unmapped)
551 		*unmapped = unmapped_bytes;
552 	return rc;
553 }
554 
555 /**
556  * iopt_unmap_iova() - Remove a range of iova
557  * @iopt: io_pagetable to act on
558  * @iova: Starting iova to unmap
559  * @length: Number of bytes to unmap
560  * @unmapped: Return number of bytes unmapped
561  *
562  * The requested range must be a superset of existing ranges.
563  * Splitting/truncating IOVA mappings is not allowed.
564  */
565 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
566 		    unsigned long length, unsigned long *unmapped)
567 {
568 	unsigned long iova_last;
569 
570 	if (!length)
571 		return -EINVAL;
572 
573 	if (check_add_overflow(iova, length - 1, &iova_last))
574 		return -EOVERFLOW;
575 
576 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
577 }
578 
579 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
580 {
581 	int rc;
582 
583 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
584 	/* If the IOVAs are empty then unmap all succeeds */
585 	if (rc == -ENOENT)
586 		return 0;
587 	return rc;
588 }
589 
590 /* The caller must always free all the nodes in the allowed_iova rb_root. */
591 int iopt_set_allow_iova(struct io_pagetable *iopt,
592 			struct rb_root_cached *allowed_iova)
593 {
594 	struct iopt_allowed *allowed;
595 
596 	down_write(&iopt->iova_rwsem);
597 	swap(*allowed_iova, iopt->allowed_itree);
598 
599 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
600 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
601 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
602 					     allowed->node.last)) {
603 			swap(*allowed_iova, iopt->allowed_itree);
604 			up_write(&iopt->iova_rwsem);
605 			return -EADDRINUSE;
606 		}
607 	}
608 	up_write(&iopt->iova_rwsem);
609 	return 0;
610 }
611 
612 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
613 		      unsigned long last, void *owner)
614 {
615 	struct iopt_reserved *reserved;
616 
617 	lockdep_assert_held_write(&iopt->iova_rwsem);
618 
619 	if (iopt_area_iter_first(iopt, start, last) ||
620 	    iopt_allowed_iter_first(iopt, start, last))
621 		return -EADDRINUSE;
622 
623 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
624 	if (!reserved)
625 		return -ENOMEM;
626 	reserved->node.start = start;
627 	reserved->node.last = last;
628 	reserved->owner = owner;
629 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
630 	return 0;
631 }
632 
633 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
634 {
635 	struct iopt_reserved *reserved, *next;
636 
637 	lockdep_assert_held_write(&iopt->iova_rwsem);
638 
639 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
640 	     reserved = next) {
641 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
642 
643 		if (reserved->owner == owner) {
644 			interval_tree_remove(&reserved->node,
645 					     &iopt->reserved_itree);
646 			kfree(reserved);
647 		}
648 	}
649 }
650 
651 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
652 {
653 	down_write(&iopt->iova_rwsem);
654 	__iopt_remove_reserved_iova(iopt, owner);
655 	up_write(&iopt->iova_rwsem);
656 }
657 
658 void iopt_init_table(struct io_pagetable *iopt)
659 {
660 	init_rwsem(&iopt->iova_rwsem);
661 	init_rwsem(&iopt->domains_rwsem);
662 	iopt->area_itree = RB_ROOT_CACHED;
663 	iopt->allowed_itree = RB_ROOT_CACHED;
664 	iopt->reserved_itree = RB_ROOT_CACHED;
665 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
666 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
667 
668 	/*
669 	 * iopt's start as SW tables that can use the entire size_t IOVA space
670 	 * due to the use of size_t in the APIs. They have no alignment
671 	 * restriction.
672 	 */
673 	iopt->iova_alignment = 1;
674 }
675 
676 void iopt_destroy_table(struct io_pagetable *iopt)
677 {
678 	struct interval_tree_node *node;
679 
680 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
681 		iopt_remove_reserved_iova(iopt, NULL);
682 
683 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
684 						ULONG_MAX))) {
685 		interval_tree_remove(node, &iopt->allowed_itree);
686 		kfree(container_of(node, struct iopt_allowed, node));
687 	}
688 
689 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
690 	WARN_ON(!xa_empty(&iopt->domains));
691 	WARN_ON(!xa_empty(&iopt->access_list));
692 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
693 }
694 
695 /**
696  * iopt_unfill_domain() - Unfill a domain with PFNs
697  * @iopt: io_pagetable to act on
698  * @domain: domain to unfill
699  *
700  * This is used when removing a domain from the iopt. Every area in the iopt
701  * will be unmapped from the domain. The domain must already be removed from the
702  * domains xarray.
703  */
704 static void iopt_unfill_domain(struct io_pagetable *iopt,
705 			       struct iommu_domain *domain)
706 {
707 	struct iopt_area *area;
708 
709 	lockdep_assert_held(&iopt->iova_rwsem);
710 	lockdep_assert_held_write(&iopt->domains_rwsem);
711 
712 	/*
713 	 * Some other domain is holding all the pfns still, rapidly unmap this
714 	 * domain.
715 	 */
716 	if (iopt->next_domain_id != 0) {
717 		/* Pick an arbitrary remaining domain to act as storage */
718 		struct iommu_domain *storage_domain =
719 			xa_load(&iopt->domains, 0);
720 
721 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
722 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
723 			struct iopt_pages *pages = area->pages;
724 
725 			if (!pages)
726 				continue;
727 
728 			mutex_lock(&pages->mutex);
729 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
730 				WARN_ON(!area->storage_domain);
731 			if (area->storage_domain == domain)
732 				area->storage_domain = storage_domain;
733 			mutex_unlock(&pages->mutex);
734 
735 			iopt_area_unmap_domain(area, domain);
736 		}
737 		return;
738 	}
739 
740 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
741 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
742 		struct iopt_pages *pages = area->pages;
743 
744 		if (!pages)
745 			continue;
746 
747 		mutex_lock(&pages->mutex);
748 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
749 		WARN_ON(area->storage_domain != domain);
750 		area->storage_domain = NULL;
751 		iopt_area_unfill_domain(area, pages, domain);
752 		mutex_unlock(&pages->mutex);
753 	}
754 }
755 
756 /**
757  * iopt_fill_domain() - Fill a domain with PFNs
758  * @iopt: io_pagetable to act on
759  * @domain: domain to fill
760  *
761  * Fill the domain with PFNs from every area in the iopt. On failure the domain
762  * is left unchanged.
763  */
764 static int iopt_fill_domain(struct io_pagetable *iopt,
765 			    struct iommu_domain *domain)
766 {
767 	struct iopt_area *end_area;
768 	struct iopt_area *area;
769 	int rc;
770 
771 	lockdep_assert_held(&iopt->iova_rwsem);
772 	lockdep_assert_held_write(&iopt->domains_rwsem);
773 
774 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
775 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
776 		struct iopt_pages *pages = area->pages;
777 
778 		if (!pages)
779 			continue;
780 
781 		mutex_lock(&pages->mutex);
782 		rc = iopt_area_fill_domain(area, domain);
783 		if (rc) {
784 			mutex_unlock(&pages->mutex);
785 			goto out_unfill;
786 		}
787 		if (!area->storage_domain) {
788 			WARN_ON(iopt->next_domain_id != 0);
789 			area->storage_domain = domain;
790 			interval_tree_insert(&area->pages_node,
791 					     &pages->domains_itree);
792 		}
793 		mutex_unlock(&pages->mutex);
794 	}
795 	return 0;
796 
797 out_unfill:
798 	end_area = area;
799 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
800 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
801 		struct iopt_pages *pages = area->pages;
802 
803 		if (area == end_area)
804 			break;
805 		if (!pages)
806 			continue;
807 		mutex_lock(&pages->mutex);
808 		if (iopt->next_domain_id == 0) {
809 			interval_tree_remove(&area->pages_node,
810 					     &pages->domains_itree);
811 			area->storage_domain = NULL;
812 		}
813 		iopt_area_unfill_domain(area, pages, domain);
814 		mutex_unlock(&pages->mutex);
815 	}
816 	return rc;
817 }
818 
819 /* All existing area's conform to an increased page size */
820 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
821 				     unsigned long new_iova_alignment)
822 {
823 	unsigned long align_mask = new_iova_alignment - 1;
824 	struct iopt_area *area;
825 
826 	lockdep_assert_held(&iopt->iova_rwsem);
827 	lockdep_assert_held(&iopt->domains_rwsem);
828 
829 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
830 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
831 		if ((iopt_area_iova(area) & align_mask) ||
832 		    (iopt_area_length(area) & align_mask) ||
833 		    (area->page_offset & align_mask))
834 			return -EADDRINUSE;
835 
836 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
837 		struct iommufd_access *access;
838 		unsigned long index;
839 
840 		xa_for_each(&iopt->access_list, index, access)
841 			if (WARN_ON(access->iova_alignment >
842 				    new_iova_alignment))
843 				return -EADDRINUSE;
844 	}
845 	return 0;
846 }
847 
848 int iopt_table_add_domain(struct io_pagetable *iopt,
849 			  struct iommu_domain *domain)
850 {
851 	const struct iommu_domain_geometry *geometry = &domain->geometry;
852 	struct iommu_domain *iter_domain;
853 	unsigned int new_iova_alignment;
854 	unsigned long index;
855 	int rc;
856 
857 	down_write(&iopt->domains_rwsem);
858 	down_write(&iopt->iova_rwsem);
859 
860 	xa_for_each(&iopt->domains, index, iter_domain) {
861 		if (WARN_ON(iter_domain == domain)) {
862 			rc = -EEXIST;
863 			goto out_unlock;
864 		}
865 	}
866 
867 	/*
868 	 * The io page size drives the iova_alignment. Internally the iopt_pages
869 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
870 	 * objects into the iommu_domain.
871 	 *
872 	 * A iommu_domain must always be able to accept PAGE_SIZE to be
873 	 * compatible as we can't guarantee higher contiguity.
874 	 */
875 	new_iova_alignment = max_t(unsigned long,
876 				   1UL << __ffs(domain->pgsize_bitmap),
877 				   iopt->iova_alignment);
878 	if (new_iova_alignment > PAGE_SIZE) {
879 		rc = -EINVAL;
880 		goto out_unlock;
881 	}
882 	if (new_iova_alignment != iopt->iova_alignment) {
883 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
884 		if (rc)
885 			goto out_unlock;
886 	}
887 
888 	/* No area exists that is outside the allowed domain aperture */
889 	if (geometry->aperture_start != 0) {
890 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
891 				       domain);
892 		if (rc)
893 			goto out_reserved;
894 	}
895 	if (geometry->aperture_end != ULONG_MAX) {
896 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
897 				       ULONG_MAX, domain);
898 		if (rc)
899 			goto out_reserved;
900 	}
901 
902 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
903 	if (rc)
904 		goto out_reserved;
905 
906 	rc = iopt_fill_domain(iopt, domain);
907 	if (rc)
908 		goto out_release;
909 
910 	iopt->iova_alignment = new_iova_alignment;
911 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
912 	iopt->next_domain_id++;
913 	up_write(&iopt->iova_rwsem);
914 	up_write(&iopt->domains_rwsem);
915 	return 0;
916 out_release:
917 	xa_release(&iopt->domains, iopt->next_domain_id);
918 out_reserved:
919 	__iopt_remove_reserved_iova(iopt, domain);
920 out_unlock:
921 	up_write(&iopt->iova_rwsem);
922 	up_write(&iopt->domains_rwsem);
923 	return rc;
924 }
925 
926 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
927 {
928 	unsigned long new_iova_alignment;
929 	struct iommufd_access *access;
930 	struct iommu_domain *domain;
931 	unsigned long index;
932 
933 	lockdep_assert_held_write(&iopt->iova_rwsem);
934 	lockdep_assert_held(&iopt->domains_rwsem);
935 
936 	/* See batch_iommu_map_small() */
937 	if (iopt->disable_large_pages)
938 		new_iova_alignment = PAGE_SIZE;
939 	else
940 		new_iova_alignment = 1;
941 
942 	xa_for_each(&iopt->domains, index, domain)
943 		new_iova_alignment = max_t(unsigned long,
944 					   1UL << __ffs(domain->pgsize_bitmap),
945 					   new_iova_alignment);
946 	xa_for_each(&iopt->access_list, index, access)
947 		new_iova_alignment = max_t(unsigned long,
948 					   access->iova_alignment,
949 					   new_iova_alignment);
950 
951 	if (new_iova_alignment > iopt->iova_alignment) {
952 		int rc;
953 
954 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
955 		if (rc)
956 			return rc;
957 	}
958 	iopt->iova_alignment = new_iova_alignment;
959 	return 0;
960 }
961 
962 void iopt_table_remove_domain(struct io_pagetable *iopt,
963 			      struct iommu_domain *domain)
964 {
965 	struct iommu_domain *iter_domain = NULL;
966 	unsigned long index;
967 
968 	down_write(&iopt->domains_rwsem);
969 	down_write(&iopt->iova_rwsem);
970 
971 	xa_for_each(&iopt->domains, index, iter_domain)
972 		if (iter_domain == domain)
973 			break;
974 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
975 		goto out_unlock;
976 
977 	/*
978 	 * Compress the xarray to keep it linear by swapping the entry to erase
979 	 * with the tail entry and shrinking the tail.
980 	 */
981 	iopt->next_domain_id--;
982 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
983 	if (index != iopt->next_domain_id)
984 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
985 
986 	iopt_unfill_domain(iopt, domain);
987 	__iopt_remove_reserved_iova(iopt, domain);
988 
989 	WARN_ON(iopt_calculate_iova_alignment(iopt));
990 out_unlock:
991 	up_write(&iopt->iova_rwsem);
992 	up_write(&iopt->domains_rwsem);
993 }
994 
995 /**
996  * iopt_area_split - Split an area into two parts at iova
997  * @area: The area to split
998  * @iova: Becomes the last of a new area
999  *
1000  * This splits an area into two. It is part of the VFIO compatibility to allow
1001  * poking a hole in the mapping. The two areas continue to point at the same
1002  * iopt_pages, just with different starting bytes.
1003  */
1004 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1005 {
1006 	unsigned long alignment = area->iopt->iova_alignment;
1007 	unsigned long last_iova = iopt_area_last_iova(area);
1008 	unsigned long start_iova = iopt_area_iova(area);
1009 	unsigned long new_start = iova + 1;
1010 	struct io_pagetable *iopt = area->iopt;
1011 	struct iopt_pages *pages = area->pages;
1012 	struct iopt_area *lhs;
1013 	struct iopt_area *rhs;
1014 	int rc;
1015 
1016 	lockdep_assert_held_write(&iopt->iova_rwsem);
1017 
1018 	if (iova == start_iova || iova == last_iova)
1019 		return 0;
1020 
1021 	if (!pages || area->prevent_access)
1022 		return -EBUSY;
1023 
1024 	if (new_start & (alignment - 1) ||
1025 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1026 		return -EINVAL;
1027 
1028 	lhs = iopt_area_alloc();
1029 	if (!lhs)
1030 		return -ENOMEM;
1031 
1032 	rhs = iopt_area_alloc();
1033 	if (!rhs) {
1034 		rc = -ENOMEM;
1035 		goto err_free_lhs;
1036 	}
1037 
1038 	mutex_lock(&pages->mutex);
1039 	/*
1040 	 * Splitting is not permitted if an access exists, we don't track enough
1041 	 * information to split existing accesses.
1042 	 */
1043 	if (area->num_accesses) {
1044 		rc = -EINVAL;
1045 		goto err_unlock;
1046 	}
1047 
1048 	/*
1049 	 * Splitting is not permitted if a domain could have been mapped with
1050 	 * huge pages.
1051 	 */
1052 	if (area->storage_domain && !iopt->disable_large_pages) {
1053 		rc = -EINVAL;
1054 		goto err_unlock;
1055 	}
1056 
1057 	interval_tree_remove(&area->node, &iopt->area_itree);
1058 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1059 			      iopt_area_start_byte(area, start_iova),
1060 			      (new_start - 1) - start_iova + 1,
1061 			      area->iommu_prot);
1062 	if (WARN_ON(rc))
1063 		goto err_insert;
1064 
1065 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1066 			      iopt_area_start_byte(area, new_start),
1067 			      last_iova - new_start + 1, area->iommu_prot);
1068 	if (WARN_ON(rc))
1069 		goto err_remove_lhs;
1070 
1071 	/*
1072 	 * If the original area has filled a domain, domains_itree has to be
1073 	 * updated.
1074 	 */
1075 	if (area->storage_domain) {
1076 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1077 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1078 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1079 	}
1080 
1081 	lhs->storage_domain = area->storage_domain;
1082 	lhs->pages = area->pages;
1083 	rhs->storage_domain = area->storage_domain;
1084 	rhs->pages = area->pages;
1085 	kref_get(&rhs->pages->kref);
1086 	kfree(area);
1087 	mutex_unlock(&pages->mutex);
1088 
1089 	/*
1090 	 * No change to domains or accesses because the pages hasn't been
1091 	 * changed
1092 	 */
1093 	return 0;
1094 
1095 err_remove_lhs:
1096 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1097 err_insert:
1098 	interval_tree_insert(&area->node, &iopt->area_itree);
1099 err_unlock:
1100 	mutex_unlock(&pages->mutex);
1101 	kfree(rhs);
1102 err_free_lhs:
1103 	kfree(lhs);
1104 	return rc;
1105 }
1106 
1107 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1108 		  size_t num_iovas)
1109 {
1110 	int rc = 0;
1111 	int i;
1112 
1113 	down_write(&iopt->iova_rwsem);
1114 	for (i = 0; i < num_iovas; i++) {
1115 		struct iopt_area *area;
1116 
1117 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1118 		if (!area)
1119 			continue;
1120 		rc = iopt_area_split(area, iovas[i]);
1121 		if (rc)
1122 			break;
1123 	}
1124 	up_write(&iopt->iova_rwsem);
1125 	return rc;
1126 }
1127 
1128 void iopt_enable_large_pages(struct io_pagetable *iopt)
1129 {
1130 	int rc;
1131 
1132 	down_write(&iopt->domains_rwsem);
1133 	down_write(&iopt->iova_rwsem);
1134 	WRITE_ONCE(iopt->disable_large_pages, false);
1135 	rc = iopt_calculate_iova_alignment(iopt);
1136 	WARN_ON(rc);
1137 	up_write(&iopt->iova_rwsem);
1138 	up_write(&iopt->domains_rwsem);
1139 }
1140 
1141 int iopt_disable_large_pages(struct io_pagetable *iopt)
1142 {
1143 	int rc = 0;
1144 
1145 	down_write(&iopt->domains_rwsem);
1146 	down_write(&iopt->iova_rwsem);
1147 	if (iopt->disable_large_pages)
1148 		goto out_unlock;
1149 
1150 	/* Won't do it if domains already have pages mapped in them */
1151 	if (!xa_empty(&iopt->domains) &&
1152 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1153 		rc = -EINVAL;
1154 		goto out_unlock;
1155 	}
1156 
1157 	WRITE_ONCE(iopt->disable_large_pages, true);
1158 	rc = iopt_calculate_iova_alignment(iopt);
1159 	if (rc)
1160 		WRITE_ONCE(iopt->disable_large_pages, false);
1161 out_unlock:
1162 	up_write(&iopt->iova_rwsem);
1163 	up_write(&iopt->domains_rwsem);
1164 	return rc;
1165 }
1166 
1167 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1168 {
1169 	u32 new_id;
1170 	int rc;
1171 
1172 	down_write(&iopt->domains_rwsem);
1173 	down_write(&iopt->iova_rwsem);
1174 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1175 		      GFP_KERNEL_ACCOUNT);
1176 
1177 	if (rc)
1178 		goto out_unlock;
1179 
1180 	rc = iopt_calculate_iova_alignment(iopt);
1181 	if (rc) {
1182 		xa_erase(&iopt->access_list, new_id);
1183 		goto out_unlock;
1184 	}
1185 	access->iopt_access_list_id = new_id;
1186 
1187 out_unlock:
1188 	up_write(&iopt->iova_rwsem);
1189 	up_write(&iopt->domains_rwsem);
1190 	return rc;
1191 }
1192 
1193 void iopt_remove_access(struct io_pagetable *iopt,
1194 			struct iommufd_access *access,
1195 			u32 iopt_access_list_id)
1196 {
1197 	down_write(&iopt->domains_rwsem);
1198 	down_write(&iopt->iova_rwsem);
1199 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1200 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1201 	up_write(&iopt->iova_rwsem);
1202 	up_write(&iopt->domains_rwsem);
1203 }
1204 
1205 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1206 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1207 					struct device *dev,
1208 					phys_addr_t *sw_msi_start)
1209 {
1210 	struct iommu_resv_region *resv;
1211 	LIST_HEAD(resv_regions);
1212 	unsigned int num_hw_msi = 0;
1213 	unsigned int num_sw_msi = 0;
1214 	int rc;
1215 
1216 	if (iommufd_should_fail())
1217 		return -EINVAL;
1218 
1219 	down_write(&iopt->iova_rwsem);
1220 	/* FIXME: drivers allocate memory but there is no failure propogated */
1221 	iommu_get_resv_regions(dev, &resv_regions);
1222 
1223 	list_for_each_entry(resv, &resv_regions, list) {
1224 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1225 			continue;
1226 
1227 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1228 			num_hw_msi++;
1229 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1230 			*sw_msi_start = resv->start;
1231 			num_sw_msi++;
1232 		}
1233 
1234 		rc = iopt_reserve_iova(iopt, resv->start,
1235 				       resv->length - 1 + resv->start, dev);
1236 		if (rc)
1237 			goto out_reserved;
1238 	}
1239 
1240 	/* Drivers must offer sane combinations of regions */
1241 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1242 		rc = -EINVAL;
1243 		goto out_reserved;
1244 	}
1245 
1246 	rc = 0;
1247 	goto out_free_resv;
1248 
1249 out_reserved:
1250 	__iopt_remove_reserved_iova(iopt, dev);
1251 out_free_resv:
1252 	iommu_put_resv_regions(dev, &resv_regions);
1253 	up_write(&iopt->iova_rwsem);
1254 	return rc;
1255 }
1256