// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
        struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

        xa_lock(&ictx->objects);
        if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
                goto out_unlock;
        ioas = ictx->vfio_ioas;
out_unlock:
        xa_unlock(&ictx->objects);
        return ioas;
}

/**
 * iommufd_vfio_compat_ioas_get_id - Get the ID of the compatibility IOAS
 * @ictx: Context to operate on
 * @out_ioas_id: The IOAS ID of the compatibility IOAS
 *
 * Return the ID of the current compatibility IOAS. The ID can be passed into
 * other functions that take an ioas_id.
 */
int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
        struct iommufd_ioas *ioas;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);
        *out_ioas_id = ioas->obj.id;
        iommufd_put_object(&ioas->obj);
        return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
 * @ictx: Context to operate on
 *
 * This allows selecting VFIO_NOIOMMU_IOMMU and blocks the normal types.
 */
int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
{
        int ret;

        xa_lock(&ictx->objects);
        if (!ictx->vfio_ioas) {
                ictx->no_iommu_mode = 1;
                ret = 0;
        } else {
                ret = -EINVAL;
        }
        xa_unlock(&ictx->objects);
        return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
 * @ictx: Context to operate on
 *
 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls
 * operate on since they do not have an IOAS ID input in their ABI. Only
 * attaching a group should cause a default creation of the internal ioas;
 * this does nothing if an existing ioas has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
{
        struct iommufd_ioas *ioas = NULL;
        int ret;

        ioas = iommufd_ioas_alloc(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        xa_lock(&ictx->objects);
        /*
         * VFIO won't allow attaching a container to both iommu and no-iommu
         * operation.
         */
        if (ictx->no_iommu_mode) {
                ret = -EINVAL;
                goto out_abort;
        }

        if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
                ret = 0;
                iommufd_put_object(&ictx->vfio_ioas->obj);
                goto out_abort;
        }
        ictx->vfio_ioas = ioas;
        xa_unlock(&ictx->objects);

        /*
         * An automatically created compat IOAS is treated as a userspace
         * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
         * and if not manually destroyed it will be destroyed automatically
         * at iommufd release.
         */
        iommufd_object_finalize(ictx, &ioas->obj);
        return 0;

out_abort:
        xa_unlock(&ictx->objects);
        iommufd_object_abort(ictx, &ioas->obj);
        return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);

/* Handle IOMMU_VFIO_IOAS: get, set or clear the compatibility IOAS */
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
        struct iommu_vfio_ioas *cmd = ucmd->cmd;
        struct iommufd_ioas *ioas;

        if (cmd->__reserved)
                return -EOPNOTSUPP;
        switch (cmd->op) {
        case IOMMU_VFIO_IOAS_GET:
                ioas = get_compat_ioas(ucmd->ictx);
                if (IS_ERR(ioas))
                        return PTR_ERR(ioas);
                cmd->ioas_id = ioas->obj.id;
                iommufd_put_object(&ioas->obj);
                return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

        case IOMMU_VFIO_IOAS_SET:
                ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
                if (IS_ERR(ioas))
                        return PTR_ERR(ioas);
                xa_lock(&ucmd->ictx->objects);
                ucmd->ictx->vfio_ioas = ioas;
                xa_unlock(&ucmd->ictx->objects);
                iommufd_put_object(&ioas->obj);
                return 0;

        case IOMMU_VFIO_IOAS_CLEAR:
                xa_lock(&ucmd->ictx->objects);
                ucmd->ictx->vfio_ioas = NULL;
                xa_unlock(&ucmd->ictx->objects);
                return 0;
        default:
                return -EOPNOTSUPP;
        }
}

static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
                                void __user *arg)
{
        u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
        size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
        struct vfio_iommu_type1_dma_map map;
        int iommu_prot = IOMMU_CACHE;
        struct iommufd_ioas *ioas;
        unsigned long iova;
        int rc;

        if (copy_from_user(&map, arg, minsz))
                return -EFAULT;

        if (map.argsz < minsz || map.flags & ~supported_flags)
                return -EINVAL;

        if (map.flags & VFIO_DMA_MAP_FLAG_READ)
                iommu_prot |= IOMMU_READ;
        if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
                iommu_prot |= IOMMU_WRITE;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        /*
         * Maps created through the legacy interface always use VFIO-compatible
         * rlimit accounting. If the user wishes to use the faster user-based
         * rlimit accounting then they must use the new interface.
         */
        iova = map.iova;
        rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova,
                                 u64_to_user_ptr(map.vaddr), map.size,
                                 iommu_prot, 0);
        iommufd_put_object(&ioas->obj);
        return rc;
}

static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
                                  void __user *arg)
{
        size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
        /*
         * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
         * dirty tracking direction:
         * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
         * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
         */
        u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
        struct vfio_iommu_type1_dma_unmap unmap;
        unsigned long unmapped = 0;
        struct iommufd_ioas *ioas;
        int rc;

        if (copy_from_user(&unmap, arg, minsz))
                return -EFAULT;

        if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
                return -EINVAL;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
                if (unmap.iova != 0 || unmap.size != 0) {
                        rc = -EINVAL;
                        goto err_put;
                }
                rc = iopt_unmap_all(&ioas->iopt, &unmapped);
        } else {
                if (READ_ONCE(ioas->iopt.disable_large_pages)) {
                        /*
                         * Create cuts at the start and last of the requested
                         * range. If the start IOVA is 0 then it doesn't need to
                         * be cut.
                         */
                        unsigned long iovas[] = { unmap.iova + unmap.size - 1,
                                                  unmap.iova - 1 };

                        rc = iopt_cut_iova(&ioas->iopt, iovas,
                                           unmap.iova ? 2 : 1);
                        if (rc)
                                goto err_put;
                }
                rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
                                     &unmapped);
        }
        unmap.size = unmapped;
        if (copy_to_user(arg, &unmap, minsz))
                rc = -EFAULT;

err_put:
        iommufd_put_object(&ioas->obj);
        return rc;
}

/* Returns 1 only if all attached HW pagetables enforce cache coherency */
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
        struct iommufd_hw_pagetable *hwpt;
        struct iommufd_ioas *ioas;
        int rc = 1;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        mutex_lock(&ioas->mutex);
        list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
                if (!hwpt->enforce_cache_coherency) {
                        rc = 0;
                        break;
                }
        }
        mutex_unlock(&ioas->mutex);

        iommufd_put_object(&ioas->obj);
        return rc;
}

static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
                                        unsigned long type)
{
        switch (type) {
        case VFIO_TYPE1_IOMMU:
        case VFIO_TYPE1v2_IOMMU:
        case VFIO_UNMAP_ALL:
                return 1;

        case VFIO_NOIOMMU_IOMMU:
                return IS_ENABLED(CONFIG_VFIO_NOIOMMU);

        case VFIO_DMA_CC_IOMMU:
                return iommufd_vfio_cc_iommu(ictx);

        /*
         * This is obsolete, and to be removed from VFIO. It was an incomplete
         * idea that got merged.
         * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
         */
        case VFIO_TYPE1_NESTING_IOMMU:
                return 0;

        /*
         * VFIO_DMA_MAP_FLAG_VADDR
         * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
         * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
         *
         * It is hard to see how this could be implemented safely.
         */
        case VFIO_UPDATE_VADDR:
        default:
                return 0;
        }
}

static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
        bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
        struct iommufd_ioas *ioas = NULL;
        int rc = 0;

        /*
         * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
         * other ioctls. We let them keep working but they mostly fail since no
         * IOAS should exist.
         */
        if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
            no_iommu_mode) {
                if (!capable(CAP_SYS_RAWIO))
                        return -EPERM;
                return 0;
        }

        if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
            no_iommu_mode)
                return -EINVAL;

        /* VFIO fails the set_iommu if there is no group */
        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        /*
         * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
         * the middle of mapped ranges. This is complicated by huge page
         * support, which creates single large IOPTEs that cannot be split by
         * the iommu driver. TYPE1 is very old at this point and likely nothing
         * uses it, however it is simple enough to emulate by disabling the
         * problematic large IOPTEs. Then we can safely unmap within any range.
         */
        if (type == VFIO_TYPE1_IOMMU)
                rc = iopt_disable_large_pages(&ioas->iopt);
        iommufd_put_object(&ioas->obj);
        return rc;
}

static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
        struct io_pagetable *iopt = &ioas->iopt;
        unsigned long pgsize_bitmap = ULONG_MAX;
        struct iommu_domain *domain;
        unsigned long index;

        down_read(&iopt->domains_rwsem);
        xa_for_each(&iopt->domains, index, domain)
                pgsize_bitmap &= domain->pgsize_bitmap;

        /* See vfio_update_pgsize_bitmap() */
        if (pgsize_bitmap & ~PAGE_MASK) {
                pgsize_bitmap &= PAGE_MASK;
                pgsize_bitmap |= PAGE_SIZE;
        }
        pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
        up_read(&iopt->domains_rwsem);
        return pgsize_bitmap;
}

static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
                                 struct vfio_info_cap_header __user *cur,
                                 size_t avail)
{
        struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
                container_of(cur,
                             struct vfio_iommu_type1_info_cap_iova_range __user,
                             header);
        struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
                .header = {
                        .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
                        .version = 1,
                },
        };
        struct interval_tree_span_iter span;

        interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
                                    ULONG_MAX) {
                struct vfio_iova_range range;

                if (!span.is_hole)
                        continue;
                range.start = span.start_hole;
                range.end = span.last_hole;
                if (avail >= struct_size(&cap_iovas, iova_ranges,
                                         cap_iovas.nr_iovas + 1) &&
                    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
                                 &range, sizeof(range)))
                        return -EFAULT;
                cap_iovas.nr_iovas++;
        }
        if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
            copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
                return -EFAULT;
        return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
                                      struct vfio_info_cap_header __user *cur,
                                      size_t avail)
{
        struct vfio_iommu_type1_info_dma_avail cap_dma = {
                .header = {
                        .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
                        .version = 1,
                },
                /*
                 * iommufd's limit is based on the cgroup's memory limit.
                 * Normally vfio would return U16_MAX here, and provide a
                 * module parameter to adjust it. Since S390 qemu userspace
                 * actually pays attention and needs a value bigger than
                 * U16_MAX, return U32_MAX.
                 */
                .avail = U32_MAX,
        };

        if (avail >= sizeof(cap_dma) &&
            copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
                return -EFAULT;
        return sizeof(cap_dma);
}

static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
                                       void __user *arg)
{
        typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
                                   struct vfio_info_cap_header __user *cur,
                                   size_t avail);
        static const fill_cap_fn fill_fns[] = {
                iommufd_fill_cap_dma_avail,
                iommufd_fill_cap_iova,
        };
        size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
        struct vfio_info_cap_header __user *last_cap = NULL;
        struct vfio_iommu_type1_info info = {};
        struct iommufd_ioas *ioas;
        size_t total_cap_size;
        int rc;
        int i;

        if (copy_from_user(&info, arg, minsz))
                return -EFAULT;

        if (info.argsz < minsz)
                return -EINVAL;
        minsz = min_t(size_t, info.argsz, sizeof(info));

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        info.flags = VFIO_IOMMU_INFO_PGSIZES;
        info.iova_pgsizes = iommufd_get_pagesizes(ioas);
        info.cap_offset = 0;

        down_read(&ioas->iopt.iova_rwsem);
        total_cap_size = sizeof(info);
        for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
                int cap_size;

                if (info.argsz > total_cap_size)
                        cap_size = fill_fns[i](ioas, arg + total_cap_size,
                                               info.argsz - total_cap_size);
                else
                        cap_size = fill_fns[i](ioas, NULL, 0);
                if (cap_size < 0) {
                        rc = cap_size;
                        goto out_put;
                }
                if (last_cap && info.argsz >= total_cap_size &&
                    put_user(total_cap_size, &last_cap->next)) {
                        rc = -EFAULT;
                        goto out_put;
                }
                last_cap = arg + total_cap_size;
                total_cap_size += cap_size;
        }

        /*
         * If the user did not provide enough space then only some caps are
         * returned and the argsz will be updated to the correct amount to get
         * all caps.
         */
        if (info.argsz >= total_cap_size)
                info.cap_offset = sizeof(info);
        info.argsz = total_cap_size;
        info.flags |= VFIO_IOMMU_INFO_CAPS;
        if (copy_to_user(arg, &info, minsz)) {
                rc = -EFAULT;
                goto out_put;
        }
        rc = 0;

out_put:
        up_read(&ioas->iopt.iova_rwsem);
        iommufd_put_object(&ioas->obj);
        return rc;
}

int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
                       unsigned long arg)
{
        void __user *uarg = (void __user *)arg;

        switch (cmd) {
        case VFIO_GET_API_VERSION:
                return VFIO_API_VERSION;
        case VFIO_SET_IOMMU:
                return iommufd_vfio_set_iommu(ictx, arg);
        case VFIO_CHECK_EXTENSION:
                return iommufd_vfio_check_extension(ictx, arg);
        case VFIO_IOMMU_GET_INFO:
                return iommufd_vfio_iommu_get_info(ictx, uarg);
        case VFIO_IOMMU_MAP_DMA:
                return iommufd_vfio_map_dma(ictx, cmd, uarg);
        case VFIO_IOMMU_UNMAP_DMA:
                return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
        case VFIO_IOMMU_DIRTY_PAGES:
        default:
                return -ENOIOCTLCMD;
        }
}