// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
        struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

        xa_lock(&ictx->objects);
        if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
                goto out_unlock;
        ioas = ictx->vfio_ioas;
out_unlock:
        xa_unlock(&ictx->objects);
        return ioas;
}

/**
 * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use
 * @ictx: Context to operate on
 * @out_ioas_id: The ioas_id the caller should use
 *
 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
 * on since they do not have an IOAS ID input in their ABI. Only attaching a
 * group should cause a default creation of the internal ioas; this returns the
 * existing ioas if one has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
        struct iommufd_ioas *ioas = NULL;
        struct iommufd_ioas *out_ioas;

        ioas = iommufd_ioas_alloc(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        xa_lock(&ictx->objects);
        if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
                out_ioas = ictx->vfio_ioas;
        } else {
                out_ioas = ioas;
                ictx->vfio_ioas = ioas;
        }
        xa_unlock(&ictx->objects);

        *out_ioas_id = out_ioas->obj.id;
        if (out_ioas != ioas) {
                iommufd_put_object(&out_ioas->obj);
                iommufd_object_abort(ictx, &ioas->obj);
                return 0;
        }
        /*
         * An automatically created compat IOAS is treated as a userspace
         * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
         * and if not manually destroyed it will be destroyed automatically
         * at iommufd release.
         */
        iommufd_object_finalize(ictx, &ioas->obj);
        return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO);
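
/*
 * Illustrative sketch (not part of this driver): how userspace could read the
 * compat IOAS ID back over the native iommufd interface. This assumes the
 * IOMMU_VFIO_IOAS ioctl and struct iommu_vfio_ioas from <linux/iommufd.h>,
 * handled by iommufd_vfio_ioas() below; the exact field layout may differ by
 * kernel version.
 *
 *      struct iommu_vfio_ioas cmd = {
 *              .size = sizeof(cmd),
 *              .op = IOMMU_VFIO_IOAS_GET,
 *      };
 *
 *      if (ioctl(iommufd_fd, IOMMU_VFIO_IOAS, &cmd))
 *              err(1, "IOMMU_VFIO_IOAS_GET");
 *      printf("compat ioas id: %u\n", cmd.ioas_id);
 */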

int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
        struct iommu_vfio_ioas *cmd = ucmd->cmd;
        struct iommufd_ioas *ioas;

        if (cmd->__reserved)
                return -EOPNOTSUPP;
        switch (cmd->op) {
        case IOMMU_VFIO_IOAS_GET:
                ioas = get_compat_ioas(ucmd->ictx);
                if (IS_ERR(ioas))
                        return PTR_ERR(ioas);
                cmd->ioas_id = ioas->obj.id;
                iommufd_put_object(&ioas->obj);
                return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

        case IOMMU_VFIO_IOAS_SET:
                ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
                if (IS_ERR(ioas))
                        return PTR_ERR(ioas);
                xa_lock(&ucmd->ictx->objects);
                ucmd->ictx->vfio_ioas = ioas;
                xa_unlock(&ucmd->ictx->objects);
                iommufd_put_object(&ioas->obj);
                return 0;

        case IOMMU_VFIO_IOAS_CLEAR:
                xa_lock(&ucmd->ictx->objects);
                ucmd->ictx->vfio_ioas = NULL;
                xa_unlock(&ucmd->ictx->objects);
                return 0;
        default:
                return -EOPNOTSUPP;
        }
}

static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
                                void __user *arg)
{
        u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
        size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
        struct vfio_iommu_type1_dma_map map;
        int iommu_prot = IOMMU_CACHE;
        struct iommufd_ioas *ioas;
        unsigned long iova;
        int rc;

        if (copy_from_user(&map, arg, minsz))
                return -EFAULT;

        if (map.argsz < minsz || map.flags & ~supported_flags)
                return -EINVAL;

        if (map.flags & VFIO_DMA_MAP_FLAG_READ)
                iommu_prot |= IOMMU_READ;
        if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
                iommu_prot |= IOMMU_WRITE;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        /*
         * Maps created through the legacy interface always use VFIO-compatible
         * rlimit accounting. If the user wishes to use the faster user-based
         * rlimit accounting then they must use the new interface.
         */
        iova = map.iova;
        rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
                                 map.size, iommu_prot, 0);
        iommufd_put_object(&ioas->obj);
        return rc;
}

static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
                                  void __user *arg)
{
        size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
        /*
         * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
         * dirty tracking direction:
         * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
         * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
         */
        u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
        struct vfio_iommu_type1_dma_unmap unmap;
        unsigned long unmapped = 0;
        struct iommufd_ioas *ioas;
        int rc;

        if (copy_from_user(&unmap, arg, minsz))
                return -EFAULT;

        if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
                return -EINVAL;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
                if (unmap.iova != 0 || unmap.size != 0) {
                        rc = -EINVAL;
                        goto err_put;
                }
                rc = iopt_unmap_all(&ioas->iopt, &unmapped);
        } else {
                if (READ_ONCE(ioas->iopt.disable_large_pages)) {
                        /*
                         * Create cuts at the start and end of the requested
                         * range. If the start IOVA is 0 then it doesn't need
                         * to be cut.
                         */
                        unsigned long iovas[] = { unmap.iova + unmap.size - 1,
                                                  unmap.iova - 1 };

                        rc = iopt_cut_iova(&ioas->iopt, iovas,
                                           unmap.iova ? 2 : 1);
                        if (rc)
                                goto err_put;
                }
                rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
                                     &unmapped);
        }
        unmap.size = unmapped;
        if (copy_to_user(arg, &unmap, minsz))
                rc = -EFAULT;

err_put:
        iommufd_put_object(&ioas->obj);
        return rc;
}
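
/*
 * Illustrative sketch (not part of this driver): the legacy type1 map/unmap
 * flow the two handlers above service. It assumes a file descriptor that
 * accepts the VFIO type1 ioctls (e.g. a VFIO container backed by iommufd),
 * a page-aligned buffer "buf" of "length" bytes, and an arbitrary example
 * IOVA.
 *
 *      struct vfio_iommu_type1_dma_map map = {
 *              .argsz = sizeof(map),
 *              .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *              .vaddr = (__u64)(uintptr_t)buf,
 *              .iova = 0x100000,
 *              .size = length,
 *      };
 *      struct vfio_iommu_type1_dma_unmap unmap = {
 *              .argsz = sizeof(unmap),
 *              .iova = 0x100000,
 *              .size = length,
 *      };
 *
 *      if (ioctl(fd, VFIO_IOMMU_MAP_DMA, &map))
 *              err(1, "VFIO_IOMMU_MAP_DMA");
 *      if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &unmap))
 *              err(1, "VFIO_IOMMU_UNMAP_DMA");
 *      printf("unmapped %llu bytes\n", (unsigned long long)unmap.size);
 */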

static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
        struct iommufd_hw_pagetable *hwpt;
        struct iommufd_ioas *ioas;
        int rc = 1;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        mutex_lock(&ioas->mutex);
        list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
                if (!hwpt->enforce_cache_coherency) {
                        rc = 0;
                        break;
                }
        }
        mutex_unlock(&ioas->mutex);

        iommufd_put_object(&ioas->obj);
        return rc;
}

static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
                                        unsigned long type)
{
        switch (type) {
        case VFIO_TYPE1_IOMMU:
        case VFIO_TYPE1v2_IOMMU:
        case VFIO_UNMAP_ALL:
                return 1;

        case VFIO_DMA_CC_IOMMU:
                return iommufd_vfio_cc_iommu(ictx);

        /*
         * This is obsolete, and to be removed from VFIO. It was an incomplete
         * idea that got merged.
         * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
         */
        case VFIO_TYPE1_NESTING_IOMMU:
                return 0;

        /*
         * VFIO_DMA_MAP_FLAG_VADDR
         * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
         * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
         *
         * It is hard to see how this could be implemented safely.
         */
        case VFIO_UPDATE_VADDR:
        default:
                return 0;
        }
}

static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
        struct iommufd_ioas *ioas = NULL;
        int rc = 0;

        if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
                return -EINVAL;

        /* VFIO fails the set_iommu if there is no group */
        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        /*
         * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
         * the middle of mapped ranges. This is complicated by huge page
         * support, which creates single large IOPTEs that cannot be split by
         * the iommu driver. TYPE1 is very old at this point and likely nothing
         * uses it, however it is simple enough to emulate by disabling the
         * problematic large IOPTEs. Then we can safely unmap within any range.
         */
        if (type == VFIO_TYPE1_IOMMU)
                rc = iopt_disable_large_pages(&ioas->iopt);
        iommufd_put_object(&ioas->obj);
        return rc;
}
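
/*
 * Illustrative sketch (not part of this driver): the container setup sequence
 * that reaches iommufd_vfio_check_extension() and iommufd_vfio_set_iommu().
 * It assumes a group/device has already been attached so the compat IOAS
 * exists. Selecting TYPE1v2 keeps large IOPTEs; selecting TYPE1 makes the
 * handler above call iopt_disable_large_pages() so unmaps can split ranges.
 *
 *      if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU) != 1)
 *              errx(1, "no TYPE1v2 support");
 *      if (ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
 *              err(1, "VFIO_SET_IOMMU");
 */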

static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
        struct io_pagetable *iopt = &ioas->iopt;
        unsigned long pgsize_bitmap = ULONG_MAX;
        struct iommu_domain *domain;
        unsigned long index;

        down_read(&iopt->domains_rwsem);
        xa_for_each(&iopt->domains, index, domain)
                pgsize_bitmap &= domain->pgsize_bitmap;

        /* See vfio_update_pgsize_bitmap() */
        if (pgsize_bitmap & ~PAGE_MASK) {
                pgsize_bitmap &= PAGE_MASK;
                pgsize_bitmap |= PAGE_SIZE;
        }
        pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
        up_read(&iopt->domains_rwsem);
        return pgsize_bitmap;
}

static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
                                 struct vfio_info_cap_header __user *cur,
                                 size_t avail)
{
        struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
                container_of(cur,
                             struct vfio_iommu_type1_info_cap_iova_range __user,
                             header);
        struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
                .header = {
                        .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
                        .version = 1,
                },
        };
        struct interval_tree_span_iter span;

        interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
                                    ULONG_MAX) {
                struct vfio_iova_range range;

                if (!span.is_hole)
                        continue;
                range.start = span.start_hole;
                range.end = span.last_hole;
                if (avail >= struct_size(&cap_iovas, iova_ranges,
                                         cap_iovas.nr_iovas + 1) &&
                    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
                                 &range, sizeof(range)))
                        return -EFAULT;
                cap_iovas.nr_iovas++;
        }
        if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
            copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
                return -EFAULT;
        return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
                                      struct vfio_info_cap_header __user *cur,
                                      size_t avail)
{
        struct vfio_iommu_type1_info_dma_avail cap_dma = {
                .header = {
                        .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
                        .version = 1,
                },
                /*
                 * iommufd's limit is based on the cgroup's memory limit.
                 * Normally vfio would return U16_MAX here, and provide a
                 * module parameter to adjust it. Since S390 qemu userspace
                 * actually pays attention and needs a value bigger than
                 * U16_MAX, return U32_MAX.
                 */
                .avail = U32_MAX,
        };

        if (avail >= sizeof(cap_dma) &&
            copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
                return -EFAULT;
        return sizeof(cap_dma);
}
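
/*
 * Illustrative note (not part of this driver): the iova_pgsizes value that
 * VFIO_IOMMU_GET_INFO reports below is the bitmap computed by
 * iommufd_get_pagesizes() above, so userspace can derive the smallest
 * supported IOVA page size from its lowest set bit:
 *
 *      __u64 pgsizes = info.iova_pgsizes;
 *      __u64 min_pgsize = pgsizes & -pgsizes;
 */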

static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
                                       void __user *arg)
{
        typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
                                   struct vfio_info_cap_header __user *cur,
                                   size_t avail);
        static const fill_cap_fn fill_fns[] = {
                iommufd_fill_cap_dma_avail,
                iommufd_fill_cap_iova,
        };
        size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
        struct vfio_info_cap_header __user *last_cap = NULL;
        struct vfio_iommu_type1_info info;
        struct iommufd_ioas *ioas;
        size_t total_cap_size;
        int rc;
        int i;

        if (copy_from_user(&info, arg, minsz))
                return -EFAULT;

        if (info.argsz < minsz)
                return -EINVAL;
        minsz = min_t(size_t, info.argsz, sizeof(info));

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        info.flags = VFIO_IOMMU_INFO_PGSIZES;
        info.iova_pgsizes = iommufd_get_pagesizes(ioas);
        info.cap_offset = 0;

        down_read(&ioas->iopt.iova_rwsem);
        total_cap_size = sizeof(info);
        for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
                int cap_size;

                if (info.argsz > total_cap_size)
                        cap_size = fill_fns[i](ioas, arg + total_cap_size,
                                               info.argsz - total_cap_size);
                else
                        cap_size = fill_fns[i](ioas, NULL, 0);
                if (cap_size < 0) {
                        rc = cap_size;
                        goto out_put;
                }
                if (last_cap && info.argsz >= total_cap_size &&
                    put_user(total_cap_size, &last_cap->next)) {
                        rc = -EFAULT;
                        goto out_put;
                }
                last_cap = arg + total_cap_size;
                total_cap_size += cap_size;
        }

        /*
         * If the user did not provide enough space then only some caps are
         * returned and the argsz will be updated to the correct amount to get
         * all caps.
         */
        if (info.argsz >= total_cap_size)
                info.cap_offset = sizeof(info);
        info.argsz = total_cap_size;
        info.flags |= VFIO_IOMMU_INFO_CAPS;
        if (copy_to_user(arg, &info, minsz)) {
                rc = -EFAULT;
                goto out_put;
        }
        rc = 0;

out_put:
        up_read(&ioas->iopt.iova_rwsem);
        iommufd_put_object(&ioas->obj);
        return rc;
}

int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
                       unsigned long arg)
{
        void __user *uarg = (void __user *)arg;

        switch (cmd) {
        case VFIO_GET_API_VERSION:
                return VFIO_API_VERSION;
        case VFIO_SET_IOMMU:
                return iommufd_vfio_set_iommu(ictx, arg);
        case VFIO_CHECK_EXTENSION:
                return iommufd_vfio_check_extension(ictx, arg);
        case VFIO_IOMMU_GET_INFO:
                return iommufd_vfio_iommu_get_info(ictx, uarg);
        case VFIO_IOMMU_MAP_DMA:
                return iommufd_vfio_map_dma(ictx, cmd, uarg);
        case VFIO_IOMMU_UNMAP_DMA:
                return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
        case VFIO_IOMMU_DIRTY_PAGES:
        default:
                return -ENOIOCTLCMD;
        }
        return -ENOIOCTLCMD;
}
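
/*
 * Illustrative sketch (not part of this driver): consuming the capability
 * chain that iommufd_vfio_iommu_get_info() builds. The usual userspace
 * pattern is two calls: the first learns the required argsz, the second
 * retrieves the caps, which are then walked via cap_offset and each
 * header's next field.
 *
 *      struct vfio_iommu_type1_info hdr = { .argsz = sizeof(hdr) };
 *      struct vfio_iommu_type1_info *info;
 *      struct vfio_info_cap_header *cap;
 *      __u32 off;
 *
 *      ioctl(fd, VFIO_IOMMU_GET_INFO, &hdr);
 *      info = calloc(1, hdr.argsz);
 *      info->argsz = hdr.argsz;
 *      ioctl(fd, VFIO_IOMMU_GET_INFO, info);
 *
 *      if (info->flags & VFIO_IOMMU_INFO_CAPS) {
 *              for (off = info->cap_offset; off; off = cap->next) {
 *                      cap = (void *)((char *)info + off);
 *                      if (cap->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE)
 *                              break;
 *              }
 *      }
 */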