// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2021 Intel Corporation
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 *
 * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
 * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
 * addresses (IOVA) to CPU addresses.
 */
#define pr_fmt(fmt) "iommufd: " fmt

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/bug.h>
#include <uapi/linux/iommufd.h>
#include <linux/iommufd.h>

#include "io_pagetable.h"
#include "iommufd_private.h"
#include "iommufd_test.h"

struct iommufd_object_ops {
	void (*destroy)(struct iommufd_object *obj);
	void (*abort)(struct iommufd_object *obj);
};
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;

struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
					     size_t size,
					     enum iommufd_object_type type)
{
	static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX];
	struct iommufd_object *obj;
	int rc;

	obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!obj)
		return ERR_PTR(-ENOMEM);
	obj->type = type;
	/*
	 * In most cases the destroy_rwsem is obtained with try so it doesn't
	 * interact with lockdep, however on destroy we have to sleep. This
	 * means if we have to destroy an object while holding a get on another
	 * object it triggers lockdep. Using one locking class per object type
	 * is a simple and reasonable way to avoid this.
	 */
	__init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem",
		     &obj_keys[type]);
	refcount_set(&obj->users, 1);

	/*
	 * Reserve an ID in the xarray but do not publish the pointer yet,
	 * since the caller has not finished initializing the object. Once the
	 * pointer is published in the xarray and visible to other threads we
	 * can't reliably destroy it anymore, so the caller must complete all
	 * errorable operations before calling iommufd_object_finalize().
	 */
	rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
		      xa_limit_31b, GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_free;
	return obj;
out_free:
	kfree(obj);
	return ERR_PTR(rc);
}

/*
 * Allow concurrent access to the object.
 *
 * Once another thread can see the object pointer it can prevent object
 * destruction. Except for special kernel-only objects there is no in-kernel way
 * to reliably destroy a single object. Thus all APIs that are creating objects
 * must use iommufd_object_abort() to handle their errors and only call
 * iommufd_object_finalize() once object creation cannot fail.
 */
void iommufd_object_finalize(struct iommufd_ctx *ictx,
			     struct iommufd_object *obj)
{
	void *old;

	old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
	/* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
	WARN_ON(old);
}

/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
	void *old;

	old = xa_erase(&ictx->objects, obj->id);
	WARN_ON(old);
	kfree(obj);
}

/*
 * Abort an object that has been fully initialized and needs destroy, but has
 * not been finalized.
 */
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
				      struct iommufd_object *obj)
{
	if (iommufd_object_ops[obj->type].abort)
		iommufd_object_ops[obj->type].abort(obj);
	else
		iommufd_object_ops[obj->type].destroy(obj);
	iommufd_object_abort(ictx, obj);
}
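/*
 * Illustrative sketch, not part of this driver: the helpers above are used in
 * an alloc -> errorable setup -> finalize-or-abort pattern. The
 * iommufd_object_alloc() macro wrapping _iommufd_object_alloc() lives in
 * iommufd_private.h; IOMMUFD_OBJ_FOO, struct iommufd_foo and foo_setup() are
 * hypothetical names used only for this sketch:
 *
 *	foo = iommufd_object_alloc(ictx, foo, IOMMUFD_OBJ_FOO);
 *	if (IS_ERR(foo))
 *		return PTR_ERR(foo);
 *	rc = foo_setup(foo);                       (any step that may fail)
 *	if (rc) {
 *		iommufd_object_abort(ictx, &foo->obj);
 *		return rc;
 *	}
 *	iommufd_object_finalize(ictx, &foo->obj);  (ID now visible to userspace)
 *	return 0;
 *
 * If foo_setup() had already attached resources that need the destroy() op to
 * be undone, iommufd_object_abort_and_destroy() would replace the plain abort.
 */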
struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
					  enum iommufd_object_type type)
{
	struct iommufd_object *obj;

	if (iommufd_should_fail())
		return ERR_PTR(-ENOENT);

	xa_lock(&ictx->objects);
	obj = xa_load(&ictx->objects, id);
	if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
	    !iommufd_lock_obj(obj))
		obj = ERR_PTR(-ENOENT);
	xa_unlock(&ictx->objects);
	return obj;
}

/*
 * Remove the given object id from the xarray if the only reference to the
 * object is held by the xarray. On success the caller must call the type's
 * destroy() op and free the object.
 */
static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx,
						    u32 id, bool extra_put)
{
	struct iommufd_object *obj;
	XA_STATE(xas, &ictx->objects, id);

	xa_lock(&ictx->objects);
	obj = xas_load(&xas);
	if (xa_is_zero(obj) || !obj) {
		obj = ERR_PTR(-ENOENT);
		goto out_xa;
	}

	/*
	 * If the caller is holding a ref on obj we put it here under the
	 * spinlock.
	 */
	if (extra_put)
		refcount_dec(&obj->users);

	if (!refcount_dec_if_one(&obj->users)) {
		obj = ERR_PTR(-EBUSY);
		goto out_xa;
	}

	xas_store(&xas, NULL);
	if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
		ictx->vfio_ioas = NULL;

out_xa:
	xa_unlock(&ictx->objects);

	/* The returned object reference count is zero */
	return obj;
}

/*
 * The caller holds a users refcount and wants to destroy the object. In all
 * cases the caller no longer has a reference on obj. If allow_fail is true a
 * failure to remove the object is tolerated, otherwise it triggers a WARN;
 * either way the object will be freed later when the fd is closed.
 */
void __iommufd_object_destroy_user(struct iommufd_ctx *ictx,
				   struct iommufd_object *obj, bool allow_fail)
{
	struct iommufd_object *ret;

	/*
	 * The purpose of the destroy_rwsem is to ensure deterministic
	 * destruction of objects used by external drivers and destroyed by this
	 * function. Any temporary increment of the refcount must hold the read
	 * side of this, such as during ioctl execution.
	 */
	down_write(&obj->destroy_rwsem);
	ret = iommufd_object_remove(ictx, obj->id, true);
	up_write(&obj->destroy_rwsem);

	if (allow_fail && IS_ERR(ret))
		return;

	/*
	 * If there is a bug and we couldn't destroy the object then we did put
	 * back the caller's refcount and will eventually try to free it again
	 * during close.
	 */
	if (WARN_ON(IS_ERR(ret)))
		return;

	iommufd_object_ops[obj->type].destroy(obj);
	kfree(obj);
}
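/*
 * Illustrative sketch, not part of this driver: a temporary reference taken
 * during an ioctl holds the read side of destroy_rwsem via iommufd_lock_obj(),
 * so __iommufd_object_destroy_user() cannot complete while the ioctl still
 * uses the object. Assuming the iommufd_put_object() helper from
 * iommufd_private.h, and with cmd->ioas_id standing for whatever ID field the
 * specific ioctl carries, a handler looks roughly like:
 *
 *	obj = iommufd_get_object(ucmd->ictx, cmd->ioas_id, IOMMUFD_OBJ_IOAS);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *	ioas = container_of(obj, struct iommufd_ioas, obj);
 *	... use ioas; it cannot be destroyed underneath us ...
 *	iommufd_put_object(obj);      (drops users, up_read(destroy_rwsem))
 */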
static int iommufd_destroy(struct iommufd_ucmd *ucmd)
{
	struct iommu_destroy *cmd = ucmd->cmd;
	struct iommufd_object *obj;

	obj = iommufd_object_remove(ucmd->ictx, cmd->id, false);
	if (IS_ERR(obj))
		return PTR_ERR(obj);
	iommufd_object_ops[obj->type].destroy(obj);
	kfree(obj);
	return 0;
}

static int iommufd_fops_open(struct inode *inode, struct file *filp)
{
	struct iommufd_ctx *ictx;

	ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT);
	if (!ictx)
		return -ENOMEM;

	/*
	 * For compatibility with VFIO when /dev/vfio/vfio is opened we default
	 * to the same rlimit accounting as vfio uses.
	 */
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) &&
	    filp->private_data == &vfio_misc_dev) {
		ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
		pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
	}

	xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
	xa_init(&ictx->groups);
	ictx->file = filp;
	filp->private_data = ictx;
	return 0;
}

static int iommufd_fops_release(struct inode *inode, struct file *filp)
{
	struct iommufd_ctx *ictx = filp->private_data;
	struct iommufd_object *obj;

	/*
	 * The objects in the xarray form a graph of "users" counts, and we have
	 * to destroy them in a depth first manner. Leaf objects will reduce the
	 * users count of interior objects when they are destroyed.
	 *
	 * Repeatedly destroying all the "1 users" leaf objects will progress
	 * until the entire xarray is empty. If this can't progress then there
	 * is some bug related to object refcounting.
	 */
	while (!xa_empty(&ictx->objects)) {
		unsigned int destroyed = 0;
		unsigned long index;

		xa_for_each(&ictx->objects, index, obj) {
			if (!refcount_dec_if_one(&obj->users))
				continue;
			destroyed++;
			xa_erase(&ictx->objects, index);
			iommufd_object_ops[obj->type].destroy(obj);
			kfree(obj);
		}
		/* Bug related to users refcount */
		if (WARN_ON(!destroyed))
			break;
	}
	WARN_ON(!xa_empty(&ictx->groups));
	kfree(ictx);
	return 0;
}
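/*
 * Illustrative sketch, not part of this driver, of why the release loop above
 * converges: child objects pin their parents through obj.users. For example a
 * HW_PAGETABLE built on top of an IOAS roughly does:
 *
 *	refcount_inc(&ioas->obj.users);            (at creation time)
 *	...
 *	refcount_dec(&hwpt->ioas->obj.users);      (in its destroy() op)
 *
 * On the first pass only the HW_PAGETABLE is at "1 users"; destroying it drops
 * the IOAS to "1 users" so the next pass can destroy the IOAS.
 */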
static int iommufd_option(struct iommufd_ucmd *ucmd)
{
	struct iommu_option *cmd = ucmd->cmd;
	int rc;

	if (cmd->__reserved)
		return -EOPNOTSUPP;

	switch (cmd->option_id) {
	case IOMMU_OPTION_RLIMIT_MODE:
		rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
		break;
	case IOMMU_OPTION_HUGE_PAGES:
		rc = iommufd_ioas_option(ucmd);
		break;
	default:
		return -EOPNOTSUPP;
	}
	if (rc)
		return rc;
	if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
			 &cmd->val64, sizeof(cmd->val64)))
		return -EFAULT;
	return 0;
}

union ucmd_buffer {
	struct iommu_destroy destroy;
	struct iommu_hw_info info;
	struct iommu_hwpt_alloc hwpt;
	struct iommu_ioas_alloc alloc;
	struct iommu_ioas_allow_iovas allow_iovas;
	struct iommu_ioas_copy ioas_copy;
	struct iommu_ioas_iova_ranges iova_ranges;
	struct iommu_ioas_map map;
	struct iommu_ioas_unmap unmap;
	struct iommu_option option;
	struct iommu_vfio_ioas vfio_ioas;
#ifdef CONFIG_IOMMUFD_TEST
	struct iommu_test_cmd test;
#endif
};

struct iommufd_ioctl_op {
	unsigned int size;
	unsigned int min_size;
	unsigned int ioctl_num;
	int (*execute)(struct iommufd_ucmd *ucmd);
};

#define IOCTL_OP(_ioctl, _fn, _struct, _last)                                  \
	[_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = {                               \
		.size = sizeof(_struct) +                                      \
			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) <          \
					  sizeof(_struct)),                    \
		.min_size = offsetofend(_struct, _last),                       \
		.ioctl_num = _ioctl,                                           \
		.execute = _fn,                                                \
	}
static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
	IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
		 __reserved),
	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
		 __reserved),
	IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
		 struct iommu_ioas_alloc, out_ioas_id),
	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
		 struct iommu_ioas_allow_iovas, allowed_iovas),
	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
		 src_iova),
	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
		 struct iommu_ioas_iova_ranges, out_iova_alignment),
	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
		 iova),
	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
		 length),
	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
		 val64),
	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
		 __reserved),
#ifdef CONFIG_IOMMUFD_TEST
	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
};

static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
			       unsigned long arg)
{
	struct iommufd_ctx *ictx = filp->private_data;
	const struct iommufd_ioctl_op *op;
	struct iommufd_ucmd ucmd = {};
	union ucmd_buffer buf;
	unsigned int nr;
	int ret;

	nr = _IOC_NR(cmd);
	if (nr < IOMMUFD_CMD_BASE ||
	    (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
		return iommufd_vfio_ioctl(ictx, cmd, arg);

	ucmd.ictx = ictx;
	ucmd.ubuffer = (void __user *)arg;
	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
	if (ret)
		return ret;

	op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
	if (op->ioctl_num != cmd)
		return -ENOIOCTLCMD;
	if (ucmd.user_size < op->min_size)
		return -EINVAL;

	ucmd.cmd = &buf;
	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
				    ucmd.user_size);
	if (ret)
		return ret;
	ret = op->execute(&ucmd);
	return ret;
}
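/*
 * Illustrative sketch, not part of this driver, of the uAPI convention the
 * dispatcher above relies on: every iommufd ioctl struct begins with a __u32
 * size filled in by userspace, and copy_struct_from_user() zero-fills any
 * trailing fields an older caller did not supply. From userspace:
 *
 *	struct iommu_ioas_alloc cmd = {
 *		.size = sizeof(cmd),
 *		.flags = 0,
 *	};
 *
 *	if (ioctl(iommufd, IOMMU_IOAS_ALLOC, &cmd))
 *		return -1;
 *	ioas_id = cmd.out_ioas_id;
 */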
static const struct file_operations iommufd_fops = {
	.owner = THIS_MODULE,
	.open = iommufd_fops_open,
	.release = iommufd_fops_release,
	.unlocked_ioctl = iommufd_fops_ioctl,
};

/**
 * iommufd_ctx_get - Get a context reference
 * @ictx: Context to get
 *
 * The caller must already hold a valid reference to ictx.
 */
void iommufd_ctx_get(struct iommufd_ctx *ictx)
{
	get_file(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);

/**
 * iommufd_ctx_from_file - Acquires a reference to the iommufd context
 * @file: File to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise an ERR_PTR. The struct file
 * remains owned by the caller and the caller must still do fput. On success
 * the caller is responsible for calling iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
{
	struct iommufd_ctx *ictx;

	if (file->f_op != &iommufd_fops)
		return ERR_PTR(-EBADFD);
	ictx = file->private_data;
	iommufd_ctx_get(ictx);
	return ictx;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);

/**
 * iommufd_ctx_from_fd - Acquires a reference to the iommufd context
 * @fd: File descriptor to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise an ERR_PTR. On success
 * the caller is responsible for calling iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_fd(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_op != &iommufd_fops) {
		fput(file);
		return ERR_PTR(-EBADFD);
	}
	/* fget() already took the reference iommufd_ctx_get() would take */
	return file->private_data;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD);
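/*
 * Illustrative sketch, not part of this driver: an external kernel driver
 * (VFIO is the in-tree user) that is handed an iommufd file descriptor by
 * userspace takes a context reference like this and pairs it with
 * iommufd_ctx_put() when it is done:
 *
 *	ictx = iommufd_ctx_from_fd(fd);
 *	if (IS_ERR(ictx))
 *		return PTR_ERR(ictx);
 *	... bind devices, attach an IOAS, etc. ...
 *	iommufd_ctx_put(ictx);
 */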
/**
 * iommufd_ctx_put - Put back a reference
 * @ictx: Context to put back
 */
void iommufd_ctx_put(struct iommufd_ctx *ictx)
{
	fput(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);

static const struct iommufd_object_ops iommufd_object_ops[] = {
	[IOMMUFD_OBJ_ACCESS] = {
		.destroy = iommufd_access_destroy_object,
	},
	[IOMMUFD_OBJ_DEVICE] = {
		.destroy = iommufd_device_destroy,
	},
	[IOMMUFD_OBJ_IOAS] = {
		.destroy = iommufd_ioas_destroy,
	},
	[IOMMUFD_OBJ_HW_PAGETABLE] = {
		.destroy = iommufd_hw_pagetable_destroy,
		.abort = iommufd_hw_pagetable_abort,
	},
#ifdef CONFIG_IOMMUFD_TEST
	[IOMMUFD_OBJ_SELFTEST] = {
		.destroy = iommufd_selftest_destroy,
	},
#endif
};

static struct miscdevice iommu_misc_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "iommu",
	.fops = &iommufd_fops,
	.nodename = "iommu",
	.mode = 0660,
};

static struct miscdevice vfio_misc_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &iommufd_fops,
	.nodename = "vfio/vfio",
	.mode = 0666,
};

static int __init iommufd_init(void)
{
	int ret;

	ret = misc_register(&iommu_misc_dev);
	if (ret)
		return ret;

	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) {
		ret = misc_register(&vfio_misc_dev);
		if (ret)
			goto err_misc;
	}
	ret = iommufd_test_init();
	if (ret)
		goto err_vfio_misc;
	return 0;

err_vfio_misc:
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
		misc_deregister(&vfio_misc_dev);
err_misc:
	misc_deregister(&iommu_misc_dev);
	return ret;
}

static void __exit iommufd_exit(void)
{
	iommufd_test_exit();
	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
		misc_deregister(&vfio_misc_dev);
	misc_deregister(&iommu_misc_dev);
}

module_init(iommufd_init);
module_exit(iommufd_exit);

#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");