// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and xarray.
 */
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>

#include "io_pagetable.h"
#include "double_span.h"

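/*
 * One element of a pages_list: a slice of an iopt_pages covering start_byte to
 * start_byte + length - 1. When mapping, area holds the not yet fully
 * initialized iopt_area created for the slice.
 */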
struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

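/*
 * Start iterating over the areas that contiguously cover [iova, last_iova].
 * Returns the area containing iova, or NULL if there is none or it is not yet
 * populated (area->pages == NULL).
 */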
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

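/*
 * Advance to the next area. Returns NULL once the requested range is exhausted
 * or when a gap or an unpopulated area breaks the contiguous coverage.
 */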
struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

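/*
 * Returns true if the hole span can hold an allocation of length bytes at the
 * required alignment and sub-page offset, adjusting span->start_hole to the
 * first suitable position.
 */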
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

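/* Same as __alloc_iova_check_hole() but for the used side of an allowed span */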
static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in the uptr when building the IOVA, this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

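/*
 * Check that [iova, iova + length - 1] is aligned, does not overflow, does not
 * intersect a reserved range, and is not already mapped.
 */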
static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

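/* Allocate a zeroed area with both interval tree nodes marked as unlinked */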
static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

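/*
 * Allocate an area for every element of pages_list and insert it into the
 * IOVA space, either at *dst_iova or at an automatically selected IOVA when
 * IOPT_ALLOC_IOVA is set. The areas are inserted with a NULL pages.
 */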
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

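/* Undo iopt_area_alloc()/iopt_insert_area() for an area that has no pages */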
static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

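/*
 * Free a pages_list, aborting any areas and dropping any iopt_pages
 * references that were not transferred to an area.
 */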
void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

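/*
 * Fill every attached domain for each element in the list. On failure the
 * elements that were already filled are unwound.
 */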
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

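/*
 * Map every element of pages_list into the io_pagetable starting at *dst_iova
 * (or an allocated IOVA when IOPT_ALLOC_IOVA is set) and fill all attached
 * domains. On success the areas take over the pages references from the list
 * elements.
 */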
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

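/*
 * Build a pages_list describing the areas that cover [iova, iova + length - 1].
 * Each element takes a reference on the underlying iopt_pages. Fails with
 * -ENOENT if the range is not contiguously covered.
 */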
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

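/*
 * Unmap every area fully contained in [start, last]. If an in-kernel access is
 * using an area the locks are dropped, the access is notified, and the unmap
 * is retried.
 */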
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

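/* Unmap every area in the io_pagetable; an already empty table is a success */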
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

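/*
 * Record [start, last] as reserved on behalf of @owner. Fails if the range
 * intersects an existing area or an allowed range.
 */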
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

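/* Remove every reserved range recorded for @owner */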
static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

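/*
 * Attach a new domain: reserve the IOVA outside its aperture, raise
 * iova_alignment to the domain's smallest page size, and fill the domain with
 * every existing area.
 */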
int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

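/*
 * Recompute iova_alignment as the largest alignment required by any attached
 * domain or registered access (at least PAGE_SIZE when large pages are
 * disabled).
 */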
static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

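/*
 * Detach a domain: unfill it, drop the IOVA it reserved and recompute the
 * alignment. The domains xarray is kept densely packed.
 */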
void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split() - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists, we don't track enough
	 * information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages have not been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

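/*
 * Split areas at each of the given IOVAs so that a later unmap can remove a
 * sub-range of an existing mapping (VFIO compatibility).
 */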
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

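/* Re-allow huge page mappings and recompute the required iova_alignment */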
void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

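/*
 * Force PAGE_SIZE mappings in the domains. Refused once a domain is attached
 * and areas are already mapped.
 */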
int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

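/*
 * Register an in-kernel access with the io_pagetable, raising iova_alignment
 * if the access requires it.
 */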
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

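/* Unregister an in-kernel access and recompute iova_alignment */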
void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}