/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

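/*
 * Account pages pinned for DMA against the RLIMIT_MEMLOCK limit of the
 * current process: try_increment_locked_vm() fails with -ENOMEM if the new
 * total would exceed the limit (unless the task has CAP_IPC_LOCK);
 * decrement_locked_vm() undoes the accounting.
 */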
static long try_increment_locked_vm(long npages)
{
	long ret = 0, locked, lock_limit;

	if (!current || !current->mm)
		return -ESRCH; /* process exited */

	if (!npages)
		return 0;

	down_write(&current->mm->mmap_sem);
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		current->mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&current->mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(long npages)
{
	if (!current || !current->mm || !npages)
		return; /* process exited */

	down_write(&current->mm->mmap_sem);
	if (WARN_ON_ONCE(npages > current->mm->locked_vm))
		npages = current->mm->locked_vm;
	current->mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&current->mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
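
/*
 * Roughly, a v1 userspace consumer is expected to follow the illustrative
 * sketch below (error handling, group viability checks and the device
 * level ioctls are omitted; see Documentation/vfio.txt for the
 * authoritative flow, "/dev/vfio/26" is just an example group):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *
 *	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 *	map.vaddr = (__u64)(unsigned long)buf;
 *	map.iova = info.dma32_window_start;
 *	map.size = size;
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */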

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * The container descriptor.
 * The API does not supply the container with an IOMMU group at the moment
 * of initialization, so groups are attached later; a container may hold
 * several groups as long as their iommu_table_group ops are compatible.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	unsigned long locked_pages;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
};

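/*
 * Memory pre-registration for the v2 IOMMU: VFIO_IOMMU_SPAPR_REGISTER_MEMORY
 * pins and accounts an area once via mm_iommu_get() so that the map/unmap
 * paths only translate addresses within the registered areas;
 * unregistering drops the reference taken here.
 */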
static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	return mm_iommu_put(mem);
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	ret = mm_iommu_get(vaddr, entries, &mem);
	if (ret)
		return ret;

	container->enabled = true;

	return 0;
}

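/*
 * The userspace view (it_userspace) keeps, for every TCE entry, the
 * userspace address it was mapped from so that tce_iommu_unuse_page_v2()
 * can find the pre-registered region again when the entry is cleared.
 * The array itself is accounted against RLIMIT_MEMLOCK.
 */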
static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *uas;
	long ret;

	BUG_ON(tbl->it_userspace);

	ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
	if (ret)
		return ret;

	uas = vzalloc(cb);
	if (!uas) {
		decrement_locked_vm(cb >> PAGE_SHIFT);
		return -ENOMEM;
	}
	tbl->it_userspace = uas;

	return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);

	if (!tbl->it_userspace)
		return;

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(cb >> PAGE_SHIFT);
}

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

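/* Return the index of the table backing @ioba, or -1 if no table matches */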
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (!current->mm)
		return -ESRCH; /* process exited */

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult or impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately, at the moment this counts whole tables, no matter how
	 * much memory the guest actually has, i.e. for a 4GB guest and 4 IOMMU
	 * groups, each with a 2GB DMA window, 8GB will be counted here. The
	 * reason is that we cannot tell the amount of RAM used by the guest
	 * here as this information is only available from KVM, and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	if (!current->mm)
		return;

	decrement_locked_vm(container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed of
	 * by tce_iommu_detach_group(), so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(tbl);
	}

	tce_iommu_disable(container);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(tce, size);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
		unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

	if (!pua || !current || !current->mm)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
			&hpa, &mem);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *pua, entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = 0;
}

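/*
 * Clear @pages TCE entries starting at @entry and drop the references on
 * the pages which were mapped there: put_page() for v1 or a mapped-counter
 * decrement on the pre-registered region for v2.
 */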
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

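/* Pin a single userspace page and return its host physical address */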
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

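/*
 * The v1 mapping path: pin every userspace page and program the
 * corresponding TCE entry; on failure roll back what has been done so far
 * via tce_iommu_clear().
 */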
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

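/*
 * The v2 mapping path: userspace addresses are translated through the
 * pre-registered regions (mm_iommu) instead of being pinned here; the
 * region's mapped counter is incremented and the userspace address is
 * remembered in the userspace view for later unmapping.
 */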
static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
				entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
				&hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(tbl, entry + i);

		*pua = tce;

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

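/*
 * Allocate a TCE table via the platform callbacks and account its size
 * against RLIMIT_MEMLOCK; for the v2 IOMMU also allocate the userspace
 * view of the table.
 */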
static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	if (!ret && container->v2) {
		ret = tce_iommu_userspace_view_alloc(*ptbl);
		if (ret)
			(*ptbl)->it_ops->free(*ptbl);
	}

	if (ret)
		decrement_locked_vm(table_size >> PAGE_SHIFT);

	return ret;
}

static void tce_iommu_free_table(struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tce_iommu_userspace_view_free(tbl);
	tbl->it_ops->free(tbl);
	decrement_locked_vm(pages);
}

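/*
 * Create a new DMA window (VFIO_IOMMU_SPAPR_TCE_CREATE): allocate a table
 * using the first group's callbacks and program it into every attached
 * group; the groups were checked for compatibility at attach time.
 */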
static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(tbl);

	return ret;
}

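/*
 * Remove a DMA window (VFIO_IOMMU_SPAPR_TCE_REMOVE): unprogram it from
 * every attached group, then clear and free the table.
 */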
static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * userspace to remove this window, some do not, so here
		 * we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;

	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_window(container, create.page_shift,
				create.window_size, create.levels,
				&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_userspace_view_free(tbl);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

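/*
 * Take ownership of a group without the DDW ops: reuse its pre-existing
 * tables, allocate the userspace view for them and exclude the host
 * kernel from using them via iommu_take_ownership().
 */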
static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = tce_iommu_userspace_view_alloc(tbl);
		if (!rc)
			rc = iommu_take_ownership(tbl);

		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

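/*
 * Take ownership of a DDW-capable group: ask the platform to give up its
 * tables, create the default 32bit window if this is the first group in
 * the container (userspace expects it to exist), then program every
 * window of the container into the new group.
 */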
static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;
	struct iommu_table *tbl = NULL;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/*
	 * If this is the first group attached, check if there is
	 * a default DMA window and create one if there is none,
	 * as userspace expects it to exist.
	 */
	if (!tce_groups_attached(container) && !container->tables[0]) {
		ret = tce_iommu_create_table(container,
				table_group,
				0, /* window number */
				IOMMU_PAGE_SHIFT_4K,
				table_group->tce32_size,
				1, /* default levels */
				&tbl);
		if (ret)
			goto release_exit;
		else
			container->tables[0] = tbl;
	}

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		tbl = container->tables[i];

		if (!tbl)
			continue;

		/* Set the default window to a new group */
		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)
		ret = tce_iommu_take_ownership(container, table_group);
	else
		ret = tce_iommu_take_ownership_ddw(container, table_group);

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);