xref: /openbmc/linux/drivers/iommu/iommufd/io_pagetable.c (revision f5fb5ac7cee29cea9156e734fd652a66417d32fc)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The datastructure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and xarray.
10  */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18 
19 #include "io_pagetable.h"
20 #include "double_span.h"
21 
/*
 * One element of a list of iopt_pages slices. Each element names the
 * iopt_pages it refers to and the byte range inside it and, while a mapping is
 * being set up, carries the iopt_area allocated to hold that slice.
 */
struct iopt_pages_list {
	/* Backing pages; put by iopt_free_pages_list() if still set */
	struct iopt_pages *pages;
	/* Area for this slice; aborted by iopt_free_pages_list() if still set */
	struct iopt_area *area;
	/* Linkage in the owning pages_list */
	struct list_head next;
	/* Byte offset into @pages at which the slice starts */
	unsigned long start_byte;
	/* Size of the slice in bytes */
	unsigned long length;
};
29 
30 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
31 					struct io_pagetable *iopt,
32 					unsigned long iova,
33 					unsigned long last_iova)
34 {
35 	lockdep_assert_held(&iopt->iova_rwsem);
36 
37 	iter->cur_iova = iova;
38 	iter->last_iova = last_iova;
39 	iter->area = iopt_area_iter_first(iopt, iova, iova);
40 	if (!iter->area)
41 		return NULL;
42 	if (!iter->area->pages) {
43 		iter->area = NULL;
44 		return NULL;
45 	}
46 	return iter->area;
47 }
48 
49 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
50 {
51 	unsigned long last_iova;
52 
53 	if (!iter->area)
54 		return NULL;
55 	last_iova = iopt_area_last_iova(iter->area);
56 	if (iter->last_iova <= last_iova)
57 		return NULL;
58 
59 	iter->cur_iova = last_iova + 1;
60 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
61 					 iter->last_iova);
62 	if (!iter->area)
63 		return NULL;
64 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
65 	    !iter->area->pages) {
66 		iter->area = NULL;
67 		return NULL;
68 	}
69 	return iter->area;
70 }
71 
72 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
73 				    unsigned long length,
74 				    unsigned long iova_alignment,
75 				    unsigned long page_offset)
76 {
77 	if (span->is_used || span->last_hole - span->start_hole < length - 1)
78 		return false;
79 
80 	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
81 			   page_offset;
82 	if (span->start_hole > span->last_hole ||
83 	    span->last_hole - span->start_hole < length - 1)
84 		return false;
85 	return true;
86 }
87 
88 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
89 				    unsigned long length,
90 				    unsigned long iova_alignment,
91 				    unsigned long page_offset)
92 {
93 	if (span->is_hole || span->last_used - span->start_used < length - 1)
94 		return false;
95 
96 	span->start_used = ALIGN(span->start_used, iova_alignment) |
97 			   page_offset;
98 	if (span->start_used > span->last_used ||
99 	    span->last_used - span->start_used < length - 1)
100 		return false;
101 	return true;
102 }
103 
/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 *
 * The search is confined to [PAGE_SIZE, ULONG_MAX - PAGE_SIZE]. On success the
 * chosen address is stored in *iova and 0 is returned. Returns -EOVERFLOW for
 * a zero or oversized length, -EINVAL when the alignment derived from @uptr
 * and @length is smaller than the iopt's iova_alignment, and -ENOSPC when no
 * suitable hole is found. The caller must hold iova_rwsem.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in the uptr when building the IOVA, this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	/* Outer walk: spans of the allowed tree; inner walk: unused holes */
	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		/*
		 * With no allowed ranges configured, treat the whole search
		 * window as a single allowed span.
		 */
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		/* Look for a gap in both the reserved and the area trees */
		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			/* First fitting hole wins */
			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}
162 
163 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
164 			   unsigned long length)
165 {
166 	unsigned long last;
167 
168 	lockdep_assert_held(&iopt->iova_rwsem);
169 
170 	if ((iova & (iopt->iova_alignment - 1)))
171 		return -EINVAL;
172 
173 	if (check_add_overflow(iova, length - 1, &last))
174 		return -EOVERFLOW;
175 
176 	/* No reserved IOVA intersects the range */
177 	if (iopt_reserved_iter_first(iopt, iova, last))
178 		return -EINVAL;
179 
180 	/* Check that there is not already a mapping in the range */
181 	if (iopt_area_iter_first(iopt, iova, last))
182 		return -EEXIST;
183 	return 0;
184 }
185 
/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 *
 * Fills in @area's protection, page offset, IOVA range and page index range,
 * then inserts it into the iopt's area_itree. area->pages is left untouched
 * and area->iopt is only set on success, immediately before the insert.
 *
 * Returns -EPERM if IOMMU_WRITE is requested on non-writable pages, -EINVAL if
 * the sub-page offset violates iova_alignment, -EOVERFLOW if either range
 * wraps or runs past the end of @pages, and 0 on success. The caller must hold
 * iova_rwsem for write.
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	/* IOVA interval covered by the area, inclusive of the last byte */
	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	/* Page index interval inside @pages, first computed in bytes */
	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}
223 
224 static struct iopt_area *iopt_area_alloc(void)
225 {
226 	struct iopt_area *area;
227 
228 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
229 	if (!area)
230 		return NULL;
231 	RB_CLEAR_NODE(&area->node.rb);
232 	RB_CLEAR_NODE(&area->pages_node.rb);
233 	return area;
234 }
235 
/*
 * Allocate one area per element of @pages_list and insert the areas back to
 * back into the iopt starting at *dst_iova. With IOPT_ALLOC_IOVA the starting
 * IOVA is picked by iopt_alloc_iova() and written to *dst_iova, otherwise the
 * caller's *dst_iova is validated with iopt_check_iova().
 *
 * On failure any areas already allocated stay attached to their list element
 * (elm->area); this function does not free them.
 */
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	/* Allocate every area before taking the iova_rwsem */
	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	/* The total length must be non-zero and a multiple of the alignment */
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		/* Test builds cross-check the allocator's answer */
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}
294 
295 static void iopt_abort_area(struct iopt_area *area)
296 {
297 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
298 		WARN_ON(area->pages);
299 	if (area->iopt) {
300 		down_write(&area->iopt->iova_rwsem);
301 		interval_tree_remove(&area->node, &area->iopt->area_itree);
302 		up_write(&area->iopt->iova_rwsem);
303 	}
304 	kfree(area);
305 }
306 
307 void iopt_free_pages_list(struct list_head *pages_list)
308 {
309 	struct iopt_pages_list *elm;
310 
311 	while ((elm = list_first_entry_or_null(pages_list,
312 					       struct iopt_pages_list, next))) {
313 		if (elm->area)
314 			iopt_abort_area(elm->area);
315 		if (elm->pages)
316 			iopt_put_pages(elm->pages);
317 		list_del(&elm->next);
318 		kfree(elm);
319 	}
320 }
321 
322 static int iopt_fill_domains_pages(struct list_head *pages_list)
323 {
324 	struct iopt_pages_list *undo_elm;
325 	struct iopt_pages_list *elm;
326 	int rc;
327 
328 	list_for_each_entry(elm, pages_list, next) {
329 		rc = iopt_area_fill_domains(elm->area, elm->pages);
330 		if (rc)
331 			goto err_undo;
332 	}
333 	return 0;
334 
335 err_undo:
336 	list_for_each_entry(undo_elm, pages_list, next) {
337 		if (undo_elm == elm)
338 			break;
339 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
340 	}
341 	return rc;
342 }
343 
344 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
345 		   unsigned long length, unsigned long *dst_iova,
346 		   int iommu_prot, unsigned int flags)
347 {
348 	struct iopt_pages_list *elm;
349 	int rc;
350 
351 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
352 				   iommu_prot, flags);
353 	if (rc)
354 		return rc;
355 
356 	down_read(&iopt->domains_rwsem);
357 	rc = iopt_fill_domains_pages(pages_list);
358 	if (rc)
359 		goto out_unlock_domains;
360 
361 	down_write(&iopt->iova_rwsem);
362 	list_for_each_entry(elm, pages_list, next) {
363 		/*
364 		 * area->pages must be set inside the domains_rwsem to ensure
365 		 * any newly added domains will get filled. Moves the reference
366 		 * in from the list.
367 		 */
368 		elm->area->pages = elm->pages;
369 		elm->pages = NULL;
370 		elm->area = NULL;
371 	}
372 	up_write(&iopt->iova_rwsem);
373 out_unlock_domains:
374 	up_read(&iopt->domains_rwsem);
375 	return rc;
376 }
377 
378 /**
379  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
380  * @ictx: iommufd_ctx the iopt is part of
381  * @iopt: io_pagetable to act on
382  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
383  *        the chosen iova on output. Otherwise is the iova to map to on input
384  * @uptr: User VA to map
385  * @length: Number of bytes to map
386  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
387  * @flags: IOPT_ALLOC_IOVA or zero
388  *
389  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
390  * page tables this will pin the pages and load them into the domain at iova.
391  * For non-domain page tables this will only setup a lazy reference and the
392  * caller must use iopt_access_pages() to touch them.
393  *
394  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
395  * destroyed.
396  */
397 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
398 			unsigned long *iova, void __user *uptr,
399 			unsigned long length, int iommu_prot,
400 			unsigned int flags)
401 {
402 	struct iopt_pages_list elm = {};
403 	LIST_HEAD(pages_list);
404 	int rc;
405 
406 	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
407 	if (IS_ERR(elm.pages))
408 		return PTR_ERR(elm.pages);
409 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
410 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
411 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
412 	elm.start_byte = uptr - elm.pages->uptr;
413 	elm.length = length;
414 	list_add(&elm.next, &pages_list);
415 
416 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
417 	if (rc) {
418 		if (elm.area)
419 			iopt_abort_area(elm.area);
420 		if (elm.pages)
421 			iopt_put_pages(elm.pages);
422 		return rc;
423 	}
424 	return 0;
425 }
426 
427 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
428 		   unsigned long length, struct list_head *pages_list)
429 {
430 	struct iopt_area_contig_iter iter;
431 	unsigned long last_iova;
432 	struct iopt_area *area;
433 	int rc;
434 
435 	if (!length)
436 		return -EINVAL;
437 	if (check_add_overflow(iova, length - 1, &last_iova))
438 		return -EOVERFLOW;
439 
440 	down_read(&iopt->iova_rwsem);
441 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
442 		struct iopt_pages_list *elm;
443 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
444 
445 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
446 		if (!elm) {
447 			rc = -ENOMEM;
448 			goto err_free;
449 		}
450 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
451 		elm->pages = area->pages;
452 		elm->length = (last - iter.cur_iova) + 1;
453 		kref_get(&elm->pages->kref);
454 		list_add_tail(&elm->next, pages_list);
455 	}
456 	if (!iopt_area_contig_done(&iter)) {
457 		rc = -ENOENT;
458 		goto err_free;
459 	}
460 	up_read(&iopt->iova_rwsem);
461 	return 0;
462 err_free:
463 	up_read(&iopt->iova_rwsem);
464 	iopt_free_pages_list(pages_list);
465 	return rc;
466 }
467 
468 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
469 				 unsigned long last, unsigned long *unmapped)
470 {
471 	struct iopt_area *area;
472 	unsigned long unmapped_bytes = 0;
473 	unsigned int tries = 0;
474 	int rc = -ENOENT;
475 
476 	/*
477 	 * The domains_rwsem must be held in read mode any time any area->pages
478 	 * is NULL. This prevents domain attach/detatch from running
479 	 * concurrently with cleaning up the area.
480 	 */
481 again:
482 	down_read(&iopt->domains_rwsem);
483 	down_write(&iopt->iova_rwsem);
484 	while ((area = iopt_area_iter_first(iopt, start, last))) {
485 		unsigned long area_last = iopt_area_last_iova(area);
486 		unsigned long area_first = iopt_area_iova(area);
487 		struct iopt_pages *pages;
488 
489 		/* Userspace should not race map/unmap's of the same area */
490 		if (!area->pages) {
491 			rc = -EBUSY;
492 			goto out_unlock_iova;
493 		}
494 
495 		if (area_first < start || area_last > last) {
496 			rc = -ENOENT;
497 			goto out_unlock_iova;
498 		}
499 
500 		if (area_first != start)
501 			tries = 0;
502 
503 		/*
504 		 * num_accesses writers must hold the iova_rwsem too, so we can
505 		 * safely read it under the write side of the iovam_rwsem
506 		 * without the pages->mutex.
507 		 */
508 		if (area->num_accesses) {
509 			size_t length = iopt_area_length(area);
510 
511 			start = area_first;
512 			area->prevent_access = true;
513 			up_write(&iopt->iova_rwsem);
514 			up_read(&iopt->domains_rwsem);
515 
516 			iommufd_access_notify_unmap(iopt, area_first, length);
517 			/* Something is not responding to unmap requests. */
518 			tries++;
519 			if (WARN_ON(tries > 100))
520 				return -EDEADLOCK;
521 			goto again;
522 		}
523 
524 		pages = area->pages;
525 		area->pages = NULL;
526 		up_write(&iopt->iova_rwsem);
527 
528 		iopt_area_unfill_domains(area, pages);
529 		iopt_abort_area(area);
530 		iopt_put_pages(pages);
531 
532 		unmapped_bytes += area_last - area_first + 1;
533 
534 		down_write(&iopt->iova_rwsem);
535 	}
536 	if (unmapped_bytes)
537 		rc = 0;
538 
539 out_unlock_iova:
540 	up_write(&iopt->iova_rwsem);
541 	up_read(&iopt->domains_rwsem);
542 	if (unmapped)
543 		*unmapped = unmapped_bytes;
544 	return rc;
545 }
546 
547 /**
548  * iopt_unmap_iova() - Remove a range of iova
549  * @iopt: io_pagetable to act on
550  * @iova: Starting iova to unmap
551  * @length: Number of bytes to unmap
552  * @unmapped: Return number of bytes unmapped
553  *
554  * The requested range must be a superset of existing ranges.
555  * Splitting/truncating IOVA mappings is not allowed.
556  */
557 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
558 		    unsigned long length, unsigned long *unmapped)
559 {
560 	unsigned long iova_last;
561 
562 	if (!length)
563 		return -EINVAL;
564 
565 	if (check_add_overflow(iova, length - 1, &iova_last))
566 		return -EOVERFLOW;
567 
568 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
569 }
570 
571 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
572 {
573 	int rc;
574 
575 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
576 	/* If the IOVAs are empty then unmap all succeeds */
577 	if (rc == -ENOENT)
578 		return 0;
579 	return rc;
580 }
581 
582 /* The caller must always free all the nodes in the allowed_iova rb_root. */
583 int iopt_set_allow_iova(struct io_pagetable *iopt,
584 			struct rb_root_cached *allowed_iova)
585 {
586 	struct iopt_allowed *allowed;
587 
588 	down_write(&iopt->iova_rwsem);
589 	swap(*allowed_iova, iopt->allowed_itree);
590 
591 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
592 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
593 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
594 					     allowed->node.last)) {
595 			swap(*allowed_iova, iopt->allowed_itree);
596 			up_write(&iopt->iova_rwsem);
597 			return -EADDRINUSE;
598 		}
599 	}
600 	up_write(&iopt->iova_rwsem);
601 	return 0;
602 }
603 
604 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
605 		      unsigned long last, void *owner)
606 {
607 	struct iopt_reserved *reserved;
608 
609 	lockdep_assert_held_write(&iopt->iova_rwsem);
610 
611 	if (iopt_area_iter_first(iopt, start, last) ||
612 	    iopt_allowed_iter_first(iopt, start, last))
613 		return -EADDRINUSE;
614 
615 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
616 	if (!reserved)
617 		return -ENOMEM;
618 	reserved->node.start = start;
619 	reserved->node.last = last;
620 	reserved->owner = owner;
621 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
622 	return 0;
623 }
624 
625 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
626 {
627 	struct iopt_reserved *reserved, *next;
628 
629 	lockdep_assert_held_write(&iopt->iova_rwsem);
630 
631 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
632 	     reserved = next) {
633 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
634 
635 		if (reserved->owner == owner) {
636 			interval_tree_remove(&reserved->node,
637 					     &iopt->reserved_itree);
638 			kfree(reserved);
639 		}
640 	}
641 }
642 
/*
 * Locked wrapper around __iopt_remove_reserved_iova(): takes iova_rwsem for
 * write and drops every reserved range owned by @owner.
 */
void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}
649 
/*
 * Initialize an io_pagetable: both rwsems, the three empty interval trees
 * (areas, allowed ranges, reserved ranges), the domains and access_list
 * xarrays, and an iova_alignment of 1.
 */
void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopt's start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}
667 
668 void iopt_destroy_table(struct io_pagetable *iopt)
669 {
670 	struct interval_tree_node *node;
671 
672 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
673 		iopt_remove_reserved_iova(iopt, NULL);
674 
675 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
676 						ULONG_MAX))) {
677 		interval_tree_remove(node, &iopt->allowed_itree);
678 		kfree(container_of(node, struct iopt_allowed, node));
679 	}
680 
681 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
682 	WARN_ON(!xa_empty(&iopt->domains));
683 	WARN_ON(!xa_empty(&iopt->access_list));
684 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
685 }
686 
/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from the
 * domains xarray.
 *
 * Areas whose pages pointer is NULL are skipped. The caller must hold
 * iova_rwsem and must hold domains_rwsem for write.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			/* Hand storage over if this domain was providing it */
			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	/* No domains remain: drop the storage domain from every area */
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}
747 
/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 *
 * Areas whose pages pointer is NULL are skipped. The caller must hold
 * iova_rwsem and must hold domains_rwsem for write.
 *
 * Return: 0 on success or the error from iopt_area_fill_domain().
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		/* The first domain filled becomes the area's storage domain */
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	/* Undo every area before the one that failed */
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		/* With no other domain, this one was made the storage domain */
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}
810 
811 /* All existing area's conform to an increased page size */
812 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
813 				     unsigned long new_iova_alignment)
814 {
815 	unsigned long align_mask = new_iova_alignment - 1;
816 	struct iopt_area *area;
817 
818 	lockdep_assert_held(&iopt->iova_rwsem);
819 	lockdep_assert_held(&iopt->domains_rwsem);
820 
821 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
822 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
823 		if ((iopt_area_iova(area) & align_mask) ||
824 		    (iopt_area_length(area) & align_mask) ||
825 		    (area->page_offset & align_mask))
826 			return -EADDRINUSE;
827 
828 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
829 		struct iommufd_access *access;
830 		unsigned long index;
831 
832 		xa_for_each(&iopt->access_list, index, access)
833 			if (WARN_ON(access->iova_alignment >
834 				    new_iova_alignment))
835 				return -EADDRINUSE;
836 	}
837 	return 0;
838 }
839 
840 int iopt_table_add_domain(struct io_pagetable *iopt,
841 			  struct iommu_domain *domain)
842 {
843 	const struct iommu_domain_geometry *geometry = &domain->geometry;
844 	struct iommu_domain *iter_domain;
845 	unsigned int new_iova_alignment;
846 	unsigned long index;
847 	int rc;
848 
849 	down_write(&iopt->domains_rwsem);
850 	down_write(&iopt->iova_rwsem);
851 
852 	xa_for_each(&iopt->domains, index, iter_domain) {
853 		if (WARN_ON(iter_domain == domain)) {
854 			rc = -EEXIST;
855 			goto out_unlock;
856 		}
857 	}
858 
859 	/*
860 	 * The io page size drives the iova_alignment. Internally the iopt_pages
861 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
862 	 * objects into the iommu_domain.
863 	 *
864 	 * A iommu_domain must always be able to accept PAGE_SIZE to be
865 	 * compatible as we can't guarantee higher contiguity.
866 	 */
867 	new_iova_alignment = max_t(unsigned long,
868 				   1UL << __ffs(domain->pgsize_bitmap),
869 				   iopt->iova_alignment);
870 	if (new_iova_alignment > PAGE_SIZE) {
871 		rc = -EINVAL;
872 		goto out_unlock;
873 	}
874 	if (new_iova_alignment != iopt->iova_alignment) {
875 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
876 		if (rc)
877 			goto out_unlock;
878 	}
879 
880 	/* No area exists that is outside the allowed domain aperture */
881 	if (geometry->aperture_start != 0) {
882 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
883 				       domain);
884 		if (rc)
885 			goto out_reserved;
886 	}
887 	if (geometry->aperture_end != ULONG_MAX) {
888 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
889 				       ULONG_MAX, domain);
890 		if (rc)
891 			goto out_reserved;
892 	}
893 
894 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
895 	if (rc)
896 		goto out_reserved;
897 
898 	rc = iopt_fill_domain(iopt, domain);
899 	if (rc)
900 		goto out_release;
901 
902 	iopt->iova_alignment = new_iova_alignment;
903 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
904 	iopt->next_domain_id++;
905 	up_write(&iopt->iova_rwsem);
906 	up_write(&iopt->domains_rwsem);
907 	return 0;
908 out_release:
909 	xa_release(&iopt->domains, iopt->next_domain_id);
910 out_reserved:
911 	__iopt_remove_reserved_iova(iopt, domain);
912 out_unlock:
913 	up_write(&iopt->iova_rwsem);
914 	up_write(&iopt->domains_rwsem);
915 	return rc;
916 }
917 
918 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
919 {
920 	unsigned long new_iova_alignment;
921 	struct iommufd_access *access;
922 	struct iommu_domain *domain;
923 	unsigned long index;
924 
925 	lockdep_assert_held_write(&iopt->iova_rwsem);
926 	lockdep_assert_held(&iopt->domains_rwsem);
927 
928 	/* See batch_iommu_map_small() */
929 	if (iopt->disable_large_pages)
930 		new_iova_alignment = PAGE_SIZE;
931 	else
932 		new_iova_alignment = 1;
933 
934 	xa_for_each(&iopt->domains, index, domain)
935 		new_iova_alignment = max_t(unsigned long,
936 					   1UL << __ffs(domain->pgsize_bitmap),
937 					   new_iova_alignment);
938 	xa_for_each(&iopt->access_list, index, access)
939 		new_iova_alignment = max_t(unsigned long,
940 					   access->iova_alignment,
941 					   new_iova_alignment);
942 
943 	if (new_iova_alignment > iopt->iova_alignment) {
944 		int rc;
945 
946 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
947 		if (rc)
948 			return rc;
949 	}
950 	iopt->iova_alignment = new_iova_alignment;
951 	return 0;
952 }
953 
/*
 * Detach @domain from the iopt: remove it from the domains xarray, unmap every
 * area from it, drop the IOVA reservations it owned and recompute the iopt's
 * iova_alignment. Warns and does nothing if @domain is not in the xarray.
 */
void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	/* Find the index at which the domain is stored */
	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}
986 
/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 *
 * The caller must hold the iopt's iova_rwsem for write. On success @area is
 * freed and replaced in the tree by the two new areas.
 *
 * Return: 0 on success, or when @iova equals the area's first or last iova, in
 * which case nothing is changed. -EBUSY if the area has no pages or has
 * prevent_access set, -EINVAL if the split point is misaligned, the area has
 * accesses, or it has a storage domain while large pages are enabled, and
 * -ENOMEM if a new area cannot be allocated.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	/* Both the new IOVA and its byte offset in the pages must be aligned */
	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists, we don't track enough
	 * information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/* Take the original out of the tree so the halves do not overlap it */
	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	/* lhs takes over the original's pages pointer, rhs gets its own ref */
	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages hasn't been
	 * changed
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	/* Put the original area back where it was */
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}
1098 
1099 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1100 		  size_t num_iovas)
1101 {
1102 	int rc = 0;
1103 	int i;
1104 
1105 	down_write(&iopt->iova_rwsem);
1106 	for (i = 0; i < num_iovas; i++) {
1107 		struct iopt_area *area;
1108 
1109 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1110 		if (!area)
1111 			continue;
1112 		rc = iopt_area_split(area, iovas[i]);
1113 		if (rc)
1114 			break;
1115 	}
1116 	up_write(&iopt->iova_rwsem);
1117 	return rc;
1118 }
1119 
1120 void iopt_enable_large_pages(struct io_pagetable *iopt)
1121 {
1122 	int rc;
1123 
1124 	down_write(&iopt->domains_rwsem);
1125 	down_write(&iopt->iova_rwsem);
1126 	WRITE_ONCE(iopt->disable_large_pages, false);
1127 	rc = iopt_calculate_iova_alignment(iopt);
1128 	WARN_ON(rc);
1129 	up_write(&iopt->iova_rwsem);
1130 	up_write(&iopt->domains_rwsem);
1131 }
1132 
1133 int iopt_disable_large_pages(struct io_pagetable *iopt)
1134 {
1135 	int rc = 0;
1136 
1137 	down_write(&iopt->domains_rwsem);
1138 	down_write(&iopt->iova_rwsem);
1139 	if (iopt->disable_large_pages)
1140 		goto out_unlock;
1141 
1142 	/* Won't do it if domains already have pages mapped in them */
1143 	if (!xa_empty(&iopt->domains) &&
1144 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1145 		rc = -EINVAL;
1146 		goto out_unlock;
1147 	}
1148 
1149 	WRITE_ONCE(iopt->disable_large_pages, true);
1150 	rc = iopt_calculate_iova_alignment(iopt);
1151 	if (rc)
1152 		WRITE_ONCE(iopt->disable_large_pages, false);
1153 out_unlock:
1154 	up_write(&iopt->iova_rwsem);
1155 	up_write(&iopt->domains_rwsem);
1156 	return rc;
1157 }
1158 
1159 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1160 {
1161 	int rc;
1162 
1163 	down_write(&iopt->domains_rwsem);
1164 	down_write(&iopt->iova_rwsem);
1165 	rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
1166 		      xa_limit_16b, GFP_KERNEL_ACCOUNT);
1167 	if (rc)
1168 		goto out_unlock;
1169 
1170 	rc = iopt_calculate_iova_alignment(iopt);
1171 	if (rc) {
1172 		xa_erase(&iopt->access_list, access->iopt_access_list_id);
1173 		goto out_unlock;
1174 	}
1175 
1176 out_unlock:
1177 	up_write(&iopt->iova_rwsem);
1178 	up_write(&iopt->domains_rwsem);
1179 	return rc;
1180 }
1181 
1182 void iopt_remove_access(struct io_pagetable *iopt,
1183 			struct iommufd_access *access,
1184 			u32 iopt_access_list_id)
1185 {
1186 	down_write(&iopt->domains_rwsem);
1187 	down_write(&iopt->iova_rwsem);
1188 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1189 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1190 	up_write(&iopt->iova_rwsem);
1191 	up_write(&iopt->domains_rwsem);
1192 }
1193 
1194 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1195 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1196 					struct device *dev,
1197 					phys_addr_t *sw_msi_start)
1198 {
1199 	struct iommu_resv_region *resv;
1200 	LIST_HEAD(resv_regions);
1201 	unsigned int num_hw_msi = 0;
1202 	unsigned int num_sw_msi = 0;
1203 	int rc;
1204 
1205 	if (iommufd_should_fail())
1206 		return -EINVAL;
1207 
1208 	down_write(&iopt->iova_rwsem);
1209 	/* FIXME: drivers allocate memory but there is no failure propogated */
1210 	iommu_get_resv_regions(dev, &resv_regions);
1211 
1212 	list_for_each_entry(resv, &resv_regions, list) {
1213 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1214 			continue;
1215 
1216 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1217 			num_hw_msi++;
1218 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1219 			*sw_msi_start = resv->start;
1220 			num_sw_msi++;
1221 		}
1222 
1223 		rc = iopt_reserve_iova(iopt, resv->start,
1224 				       resv->length - 1 + resv->start, dev);
1225 		if (rc)
1226 			goto out_reserved;
1227 	}
1228 
1229 	/* Drivers must offer sane combinations of regions */
1230 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1231 		rc = -EINVAL;
1232 		goto out_reserved;
1233 	}
1234 
1235 	rc = 0;
1236 	goto out_free_resv;
1237 
1238 out_reserved:
1239 	__iopt_remove_reserved_iova(iopt, dev);
1240 out_free_resv:
1241 	iommu_put_resv_regions(dev, &resv_regions);
1242 	up_write(&iopt->iova_rwsem);
1243 	return rc;
1244 }
1245