// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

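/*
 * Return the current compat IOAS holding a reference via iommufd_lock_obj(),
 * or ERR_PTR(-ENODEV) if no compat IOAS is installed. The caller must release
 * the reference with iommufd_put_object().
 */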
static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
		goto out_unlock;
	ioas = ictx->vfio_ioas;
out_unlock:
	xa_unlock(&ictx->objects);
	return ioas;
}

/**
 * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use
 * @ictx: Context to operate on
 * @out_ioas_id: The IOAS ID the caller should use
 *
 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
 * on since they do not have an IOAS ID input in their ABI. Only attaching a
 * group should cause a default creation of the internal IOAS; this returns the
 * existing IOAS if one has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
	struct iommufd_ioas *ioas = NULL;
	struct iommufd_ioas *out_ioas;

	ioas = iommufd_ioas_alloc(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

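	/*
	 * The new IOAS is allocated before taking the xarray lock. If a compat
	 * IOAS was installed by someone else in the meantime the speculative
	 * allocation is aborted below and the existing one is returned.
	 */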
	xa_lock(&ictx->objects);
	if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
		out_ioas = ictx->vfio_ioas;
	} else {
		out_ioas = ioas;
		ictx->vfio_ioas = ioas;
	}
	xa_unlock(&ictx->objects);

	*out_ioas_id = out_ioas->obj.id;
	if (out_ioas != ioas) {
		iommufd_put_object(&out_ioas->obj);
		iommufd_object_abort(ictx, &ioas->obj);
		return 0;
	}
	/*
	 * An automatically created compat IOAS is treated as a userspace
	 * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
	 * and if not manually destroyed it will be destroyed automatically
	 * at iommufd release.
	 */
	iommufd_object_finalize(ictx, &ioas->obj);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO);

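/* Handler for the IOMMU_VFIO_IOAS ioctl: get, set or clear the compat IOAS */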
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
	struct iommu_vfio_ioas *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;

	if (cmd->__reserved)
		return -EOPNOTSUPP;
	switch (cmd->op) {
	case IOMMU_VFIO_IOAS_GET:
		ioas = get_compat_ioas(ucmd->ictx);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		cmd->ioas_id = ioas->obj.id;
		iommufd_put_object(&ioas->obj);
		return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

	case IOMMU_VFIO_IOAS_SET:
		ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = ioas;
		xa_unlock(&ucmd->ictx->objects);
		iommufd_put_object(&ioas->obj);
		return 0;

	case IOMMU_VFIO_IOAS_CLEAR:
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = NULL;
		xa_unlock(&ucmd->ictx->objects);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				void __user *arg)
{
	u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
	struct vfio_iommu_type1_dma_map map;
	int iommu_prot = IOMMU_CACHE;
	struct iommufd_ioas *ioas;
	unsigned long iova;
	int rc;

	if (copy_from_user(&map, arg, minsz))
		return -EFAULT;

	if (map.argsz < minsz || map.flags & ~supported_flags)
		return -EINVAL;

	if (map.flags & VFIO_DMA_MAP_FLAG_READ)
		iommu_prot |= IOMMU_READ;
	if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
		iommu_prot |= IOMMU_WRITE;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * Maps created through the legacy interface always use VFIO compatible
	 * rlimit accounting. If the user wishes to use the faster user-based
	 * rlimit accounting then they must use the new interface.
	 */
	iova = map.iova;
	rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
				 map.size, iommu_prot, 0);
	iommufd_put_object(&ioas->obj);
	return rc;
}

static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				  void __user *arg)
{
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
	/*
	 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
	 * dirty tracking direction:
	 *  https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
	 *  https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
	 */
	u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
	struct vfio_iommu_type1_dma_unmap unmap;
	unsigned long unmapped = 0;
	struct iommufd_ioas *ioas;
	int rc;

	if (copy_from_user(&unmap, arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
		return -EINVAL;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
		if (unmap.iova != 0 || unmap.size != 0) {
			rc = -EINVAL;
			goto err_put;
		}
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		if (READ_ONCE(ioas->iopt.disable_large_pages)) {
			/*
			 * Create cuts at the start and the last byte of the
			 * requested range. If the start IOVA is 0 then it
			 * doesn't need to be cut.
			 */
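			/*
			 * For example, iova = 0x1000 with size = 0x2000
			 * produces cut points 0xfff and 0x2fff.
			 */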
			unsigned long iovas[] = { unmap.iova + unmap.size - 1,
						  unmap.iova - 1 };

			rc = iopt_cut_iova(&ioas->iopt, iovas,
					   unmap.iova ? 2 : 1);
			if (rc)
				goto err_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
				     &unmapped);
	}
	unmap.size = unmapped;
	if (copy_to_user(arg, &unmap, minsz))
		rc = -EFAULT;

err_put:
	iommufd_put_object(&ioas->obj);
	return rc;
}

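/*
 * Returns 1 only if every hw_pagetable attached to the compat IOAS enforces
 * cache coherency, matching the VFIO_DMA_CC_IOMMU extension semantics.
 */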
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
	struct iommufd_hw_pagetable *hwpt;
	struct iommufd_ioas *ioas;
	int rc = 1;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	mutex_lock(&ioas->mutex);
	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
		if (!hwpt->enforce_cache_coherency) {
			rc = 0;
			break;
		}
	}
	mutex_unlock(&ioas->mutex);

	iommufd_put_object(&ioas->obj);
	return rc;
}

static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
					unsigned long type)
{
	switch (type) {
	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_UNMAP_ALL:
		return 1;

	case VFIO_DMA_CC_IOMMU:
		return iommufd_vfio_cc_iommu(ictx);

	/*
	 * This is obsolete, and to be removed from VFIO. It was an incomplete
	 * idea that got merged.
	 * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
	 */
	case VFIO_TYPE1_NESTING_IOMMU:
		return 0;

	/*
	 * VFIO_DMA_MAP_FLAG_VADDR
	 * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
	 * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
	 *
	 * It is hard to see how this could be implemented safely.
	 */
	case VFIO_UPDATE_VADDR:
	default:
		return 0;
	}
}

static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
	struct iommufd_ioas *ioas = NULL;
	int rc = 0;

	if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
		return -EINVAL;

	/* VFIO fails the set_iommu if there is no group */
	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
	 * the middle of mapped ranges. This is complicated by huge page support
	 * which creates single large IOPTEs that cannot be split by the iommu
	 * driver. TYPE1 is very old at this point and likely nothing uses it,
	 * however it is simple enough to emulate by simply disabling the
	 * problematic large IOPTEs. Then we can safely unmap within any range.
	 */
	if (type == VFIO_TYPE1_IOMMU)
		rc = iopt_disable_large_pages(&ioas->iopt);
	iommufd_put_object(&ioas->obj);
	return rc;
}

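/*
 * The reported page sizes are the intersection of the pgsize bitmaps of every
 * domain attached to the compat IOAS, following vfio_update_pgsize_bitmap().
 */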
static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
	struct io_pagetable *iopt = &ioas->iopt;
	unsigned long pgsize_bitmap = ULONG_MAX;
	struct iommu_domain *domain;
	unsigned long index;

	down_read(&iopt->domains_rwsem);
	xa_for_each(&iopt->domains, index, domain)
		pgsize_bitmap &= domain->pgsize_bitmap;

	/* See vfio_update_pgsize_bitmap() */
	if (pgsize_bitmap & ~PAGE_MASK) {
		pgsize_bitmap &= PAGE_MASK;
		pgsize_bitmap |= PAGE_SIZE;
	}
	pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
	up_read(&iopt->domains_rwsem);
	return pgsize_bitmap;
}

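/*
 * The usable IOVA ranges reported to userspace are the holes between the
 * reserved regions tracked in the IOAS. Like the other capability fillers the
 * size the capability needs is returned even when nothing could be copied.
 */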
static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
				 struct vfio_info_cap_header __user *cur,
				 size_t avail)
{
	struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
		container_of(cur,
			     struct vfio_iommu_type1_info_cap_iova_range __user,
			     header);
	struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
			.version = 1,
		},
	};
	struct interval_tree_span_iter span;

	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		struct vfio_iova_range range;

		if (!span.is_hole)
			continue;
		range.start = span.start_hole;
		range.end = span.last_hole;
		if (avail >= struct_size(&cap_iovas, iova_ranges,
					 cap_iovas.nr_iovas + 1) &&
		    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
				 &range, sizeof(range)))
			return -EFAULT;
		cap_iovas.nr_iovas++;
	}
	if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
	    copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
		return -EFAULT;
	return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
				      struct vfio_info_cap_header __user *cur,
				      size_t avail)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
			.version = 1,
		},
		/*
		 * iommufd's limit is based on the cgroup's memory limit.
		 * Normally vfio would return U16_MAX here, and provide a module
		 * parameter to adjust it. Since S390 qemu userspace actually
		 * pays attention and needs a value bigger than U16_MAX, return
		 * U32_MAX.
		 */
		.avail = U32_MAX,
	};

	if (avail >= sizeof(cap_dma) &&
	    copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
		return -EFAULT;
	return sizeof(cap_dma);
}

static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
				       void __user *arg)
{
	typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
				   struct vfio_info_cap_header __user *cur,
				   size_t avail);
	static const fill_cap_fn fill_fns[] = {
		iommufd_fill_cap_dma_avail,
		iommufd_fill_cap_iova,
	};
	size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
	struct vfio_info_cap_header __user *last_cap = NULL;
	struct vfio_iommu_type1_info info;
	struct iommufd_ioas *ioas;
	size_t total_cap_size;
	int rc;
	int i;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;
	minsz = min_t(size_t, info.argsz, sizeof(info));

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	info.flags = VFIO_IOMMU_INFO_PGSIZES;
	info.iova_pgsizes = iommufd_get_pagesizes(ioas);
	info.cap_offset = 0;

	down_read(&ioas->iopt.iova_rwsem);
	total_cap_size = sizeof(info);
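	/*
	 * Each fill function returns the size its capability needs. The
	 * capability is only copied out when it fits within argsz, and the
	 * previous capability header's next offset is fixed up afterwards so
	 * the chain stays consistent.
	 */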
	for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
		int cap_size;

		if (info.argsz > total_cap_size)
			cap_size = fill_fns[i](ioas, arg + total_cap_size,
					       info.argsz - total_cap_size);
		else
			cap_size = fill_fns[i](ioas, NULL, 0);
		if (cap_size < 0) {
			rc = cap_size;
			goto out_put;
		}
		if (last_cap && info.argsz >= total_cap_size &&
		    put_user(total_cap_size, &last_cap->next)) {
			rc = -EFAULT;
			goto out_put;
		}
		last_cap = arg + total_cap_size;
		total_cap_size += cap_size;
	}

	/*
	 * If the user did not provide enough space then only some caps are
	 * returned and the argsz will be updated to the correct amount to get
	 * all caps.
	 */
	if (info.argsz >= total_cap_size)
		info.cap_offset = sizeof(info);
	info.argsz = total_cap_size;
	info.flags |= VFIO_IOMMU_INFO_CAPS;
	if (copy_to_user(arg, &info, minsz)) {
		rc = -EFAULT;
		goto out_put;
	}
	rc = 0;

out_put:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(&ioas->obj);
	return rc;
}

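/*
 * Dispatch the legacy VFIO container ioctls that iommufd emulates; anything
 * not handled here returns -ENOIOCTLCMD.
 */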
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
		       unsigned long arg)
{
	void __user *uarg = (void __user *)arg;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		return VFIO_API_VERSION;
	case VFIO_SET_IOMMU:
		return iommufd_vfio_set_iommu(ictx, arg);
	case VFIO_CHECK_EXTENSION:
		return iommufd_vfio_check_extension(ictx, arg);
	case VFIO_IOMMU_GET_INFO:
		return iommufd_vfio_iommu_get_info(ictx, uarg);
	case VFIO_IOMMU_MAP_DMA:
		return iommufd_vfio_map_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_UNMAP_DMA:
		return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_DIRTY_PAGES:
	default:
		return -ENOIOCTLCMD;
	}
	return -ENOIOCTLCMD;
}