// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>

#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);
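
/*
 * Locked-memory accounting helpers: charge or release pages against the
 * RLIMIT_MEMLOCK limit of the mm that owns the container. Mapped pages are
 * not accounted one by one; instead a worst case is charged when the
 * container is enabled (see tce_iommu_enable() below).
 */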
static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long ret = 0, locked, lock_limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm || !npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
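
/*
 * A minimal, illustrative sketch of how userspace typically drives this
 * backend (v1 flow; error handling omitted, the fd variable names are
 * assumptions):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	group = open("/dev/vfio/$GROUP", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */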
struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered regions it has
 * referenced so it can do proper cleanup when the userspace process exits.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};
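
/*
 * Binds the container to the mm of the calling process on first use and
 * takes a reference on it (dropped via mmdrop() in tce_iommu_release()).
 * Any later caller with a different mm is rejected with -EPERM.
 */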
static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}

static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;
	long ret;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		ret = -ENOENT;
	else
		ret = tce_iommu_prereg_free(container, tcemem);

	mm_iommu_put(container->mm, mem);

	return ret;
}
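
/*
 * Preregisters (pins) a region of userspace memory for the v2 IOMMU so that
 * later map requests can translate userspace addresses via mm_iommu_lookup()
 * instead of pinning pages one by one. Called from the
 * VFIO_IOMMU_SPAPR_REGISTER_MEMORY ioctl; registering the same region twice
 * fails with -EBUSY.
 */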
static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_get(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem) {
				ret = -EBUSY;
				goto put_exit;
			}
		}
	} else {
		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
		if (ret)
			return ret;
	}

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		ret = -ENOMEM;
		goto put_exit;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;

put_exit:
	mm_iommu_put(container->mm, mem);
	return ret;
}

static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int page_shift)
{
	struct page *page;
	unsigned long size = 0;

	if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
		return size == (1UL << page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}
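
/*
 * Returns the index of the container table whose DMA window contains @ioba
 * and sets *ptbl accordingly, or -1 if no attached window covers that
 * address.
 */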
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult, if not
	 * impossible, to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also, we don't have a nice way to fail an H_PUT_TCE due to ulimits:
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately, at the moment this counts whole tables, no matter how
	 * much memory the guest actually has. For example, for a 4GB guest and
	 * 4 IOMMU groups, each with a 2GB DMA window, 8GB will be counted
	 * here. The reason is that we cannot tell the amount of RAM used by
	 * the guest here, as this information is only available from KVM and
	 * VFIO is KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(container->mm, locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	decrement_locked_vm(container->mm, container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	struct tce_iommu_prereg *tcemem, *tmtmp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
		WARN_ON(tce_iommu_prereg_free(container, tcemem));

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long shift,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
			tbl->it_page_shift, &hpa, &mem);
	if (ret)
		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
				__func__, be64_to_cpu(*pua), entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);
}
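
/*
 * Clears @pages TCE entries starting at @entry, releasing the underlying
 * pages (v1) or dropping the preregistered-memory references (v2).
 */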
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;
	unsigned long lastentry = entry + pages;

	for ( ; entry < lastentry; ++entry) {
		if (tbl->it_indirect_levels && tbl->it_userspace) {
			/*
			 * For multilevel tables, we can take a shortcut here
			 * and skip some TCEs as we know that the userspace
			 * address cache is a mirror of the real TCE table:
			 * if the cache is missing some indirect levels, then
			 * the hardware table does not have them allocated
			 * either and therefore does not require updating.
			 */
			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
					entry);
			if (!pua) {
				/* align to level_size which is power of two */
				entry |= tbl->it_level_size - 1;
				continue;
			}
		}

		cond_resched();

		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
				&direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
			&page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
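
/*
 * v1 map path: pins each userspace page with get_user_pages_fast() and
 * programs the corresponding TCE. On failure, the entries programmed so far
 * are cleared before returning.
 */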
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
				&dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
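
/*
 * v2 map path: translates userspace addresses through memory preregistered
 * with VFIO_IOMMU_SPAPR_REGISTER_MEMORY, so no pages are pinned here; the
 * userspace address is also cached in the table's it_userspace array for
 * later unmapping.
 */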
static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, tbl->it_page_shift, &hpa, &mem);
		if (ret)
			break;

		if (!tce_page_is_contained(container->mm, hpa,
				tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
				&dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = cpu_to_be64(tce);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
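
/*
 * Allocates a hardware TCE table via the platform callbacks and charges its
 * size against the locked memory limit; tce_iommu_free_table() below is the
 * counterpart that releases both.
 */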
static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));

	return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	iommu_tce_table_put(tbl);
	decrement_locked_vm(container->mm, pages);
}
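
/*
 * Creates a dynamic DMA window and programs it into every attached group.
 * A minimal, illustrative userspace sketch (v2 only; the chosen values are
 * assumptions and error handling is omitted):
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,		// 64K IOMMU pages
 *		.window_size = 1ULL << 30,	// 1GB window
 *		.levels = 1,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *	// create.start_addr now holds the bus address of the new window
 */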
static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * userspace to remove this window, some do not, so here
		 * we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}
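
/*
 * Creates the 32-bit default window whose creation tce_iommu_attach_group()
 * deferred via def_window_pending: the window is only built on the first
 * map, unmap or create request, so a userspace that immediately removes the
 * default window never pays for creating it.
 */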
static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}
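
/*
 * The ioctl interface. A minimal, illustrative sketch of a map request
 * (the buffer and size names are assumptions, error handling omitted):
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,	// page-aligned user buffer
 *		.iova = 0,			// bus address within a DMA window
 *		.size = bufsize,		// multiple of the IOMMU page size
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */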
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace's mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		if (container->tables[i])
			table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}
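
/*
 * Attaches an IOMMU group to the container. Groups whose table_group does
 * not provide take_ownership/release_ownership use the non-DDW ownership
 * path and are only allowed with the v1 API; groups that do provide them
 * use the DDW path required by v2, and the first such group defers creation
 * of the default DMA window via def_window_pending.
 */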
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);