// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2021 Intel Corporation
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 *
 * iommufd provides control over the IOMMU HW objects created by IOMMU kernel
 * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA
 * addresses (IOVA) to CPU addresses.
 */
#define pr_fmt(fmt) "iommufd: " fmt

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/bug.h>
#include <uapi/linux/iommufd.h>
#include <linux/iommufd.h>

#include "io_pagetable.h"
#include "iommufd_private.h"
#include "iommufd_test.h"

struct iommufd_object_ops {
        void (*destroy)(struct iommufd_object *obj);
};
static const struct iommufd_object_ops iommufd_object_ops[];
static struct miscdevice vfio_misc_dev;

struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
                                             size_t size,
                                             enum iommufd_object_type type)
{
        struct iommufd_object *obj;
        int rc;

        obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
        if (!obj)
                return ERR_PTR(-ENOMEM);
        obj->type = type;
        init_rwsem(&obj->destroy_rwsem);
        refcount_set(&obj->users, 1);

        /*
         * Reserve an ID in the xarray but do not publish the pointer yet since
         * the caller has not initialized it. Once the pointer is published in
         * the xarray and visible to other threads we can't reliably destroy it
         * anymore, so the caller must complete all errorable operations before
         * calling iommufd_object_finalize().
         */
        rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
                      xa_limit_32b, GFP_KERNEL_ACCOUNT);
        if (rc)
                goto out_free;
        return obj;
out_free:
        kfree(obj);
        return ERR_PTR(rc);
}

/*
 * Allow concurrent access to the object.
 *
 * Once another thread can see the object pointer it can prevent object
 * destruction. Except for special kernel-only objects there is no in-kernel
 * way to reliably destroy a single object. Thus all APIs that are creating
 * objects must use iommufd_object_abort() to handle their errors and only call
 * iommufd_object_finalize() once object creation cannot fail.
 */
void iommufd_object_finalize(struct iommufd_ctx *ictx,
                             struct iommufd_object *obj)
{
        void *old;

        old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
        /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
        WARN_ON(old);
}

/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
        void *old;

        old = xa_erase(&ictx->objects, obj->id);
        WARN_ON(old);
        kfree(obj);
}
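
/*
 * Illustrative sketch (kept out of the build) of how a creation path is
 * expected to use the three helpers above: every errorable step happens
 * between _iommufd_object_alloc() and either iommufd_object_finalize() on
 * success or iommufd_object_abort() on error. "struct iommufd_example",
 * IOMMUFD_OBJ_EXAMPLE and iommufd_example_setup() are invented names for
 * illustration only.
 */
#if 0
struct iommufd_example {
        struct iommufd_object obj;
        /* private state initialized by iommufd_example_setup() */
        unsigned long cookie;
};

static int iommufd_example_setup(struct iommufd_example *ex)
{
        ex->cookie = 1;
        return 0;
}

static int iommufd_example_create(struct iommufd_ctx *ictx)
{
        struct iommufd_example *ex;
        struct iommufd_object *obj;
        int rc;

        obj = _iommufd_object_alloc(ictx, sizeof(*ex), IOMMUFD_OBJ_EXAMPLE);
        if (IS_ERR(obj))
                return PTR_ERR(obj);
        ex = container_of(obj, struct iommufd_example, obj);

        /* Any step that can fail must come before finalize */
        rc = iommufd_example_setup(ex);
        if (rc)
                goto out_abort;

        /* Publishing the ID makes the object visible to other threads */
        iommufd_object_finalize(ictx, &ex->obj);
        return 0;

out_abort:
        iommufd_object_abort(ictx, &ex->obj);
        return rc;
}
#endif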

/*
 * Abort an object that has been fully initialized and needs destroy, but has
 * not been finalized.
 */
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
                                      struct iommufd_object *obj)
{
        iommufd_object_ops[obj->type].destroy(obj);
        iommufd_object_abort(ictx, obj);
}

struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
                                          enum iommufd_object_type type)
{
        struct iommufd_object *obj;

        if (iommufd_should_fail())
                return ERR_PTR(-ENOENT);

        xa_lock(&ictx->objects);
        obj = xa_load(&ictx->objects, id);
        if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) ||
            !iommufd_lock_obj(obj))
                obj = ERR_PTR(-ENOENT);
        xa_unlock(&ictx->objects);
        return obj;
}

/*
 * Remove the given object id from the xarray if the only reference to the
 * object is held by the xarray. The caller must call the object's destroy()
 * op and free it.
 */
static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx,
                                                    u32 id, bool extra_put)
{
        struct iommufd_object *obj;
        XA_STATE(xas, &ictx->objects, id);

        xa_lock(&ictx->objects);
        obj = xas_load(&xas);
        if (xa_is_zero(obj) || !obj) {
                obj = ERR_PTR(-ENOENT);
                goto out_xa;
        }

        /*
         * If the caller is holding a ref on obj we put it here under the
         * spinlock.
         */
        if (extra_put)
                refcount_dec(&obj->users);

        if (!refcount_dec_if_one(&obj->users)) {
                obj = ERR_PTR(-EBUSY);
                goto out_xa;
        }

        xas_store(&xas, NULL);
        if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
                ictx->vfio_ioas = NULL;

out_xa:
        xa_unlock(&ictx->objects);

        /* The returned object reference count is zero */
        return obj;
}

/*
 * The caller holds a users refcount and wants to destroy the object. On
 * success the object is destroyed; in all cases the caller no longer has a
 * reference on obj.
 */
void __iommufd_object_destroy_user(struct iommufd_ctx *ictx,
                                   struct iommufd_object *obj, bool allow_fail)
{
        struct iommufd_object *ret;

        /*
         * The purpose of the destroy_rwsem is to ensure deterministic
         * destruction of objects used by external drivers and destroyed by
         * this function. Any temporary increment of the refcount must hold the
         * read side of this, such as during ioctl execution.
         */
        down_write(&obj->destroy_rwsem);
        ret = iommufd_object_remove(ictx, obj->id, true);
        up_write(&obj->destroy_rwsem);

        if (allow_fail && IS_ERR(ret))
                return;

        /*
         * If there is a bug and we couldn't destroy the object then we did put
         * back the caller's refcount and will eventually try to free it again
         * during close.
         */
        if (WARN_ON(IS_ERR(ret)))
                return;

        iommufd_object_ops[obj->type].destroy(obj);
        kfree(obj);
}

static int iommufd_destroy(struct iommufd_ucmd *ucmd)
{
        struct iommu_destroy *cmd = ucmd->cmd;
        struct iommufd_object *obj;

        obj = iommufd_object_remove(ucmd->ictx, cmd->id, false);
        if (IS_ERR(obj))
                return PTR_ERR(obj);
        iommufd_object_ops[obj->type].destroy(obj);
        kfree(obj);
        return 0;
}
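
/*
 * Illustrative sketch (kept out of the build) of the lookup side: an ioctl or
 * driver-facing path takes a temporary users reference and the destroy_rwsem
 * read side through iommufd_get_object(), which is what makes
 * __iommufd_object_destroy_user() above deterministic. iommufd_put_object()
 * is assumed to be the companion helper from iommufd_private.h that drops
 * both again.
 */
#if 0
static int iommufd_example_use_ioas(struct iommufd_ctx *ictx, u32 ioas_id)
{
        struct iommufd_object *obj;
        struct iommufd_ioas *ioas;

        obj = iommufd_get_object(ictx, ioas_id, IOMMUFD_OBJ_IOAS);
        if (IS_ERR(obj))
                return PTR_ERR(obj);
        ioas = container_of(obj, struct iommufd_ioas, obj);

        /* ... use ioas; a concurrent IOMMU_DESTROY will see -EBUSY ... */

        iommufd_put_object(obj);
        return 0;
}
#endif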

static int iommufd_fops_open(struct inode *inode, struct file *filp)
{
        struct iommufd_ctx *ictx;

        ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT);
        if (!ictx)
                return -ENOMEM;

        /*
         * For compatibility with VFIO, when /dev/vfio/vfio is opened we
         * default to the same rlimit accounting as vfio uses.
         */
        if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) &&
            filp->private_data == &vfio_misc_dev) {
                ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
                pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
        }

        xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
        ictx->file = filp;
        filp->private_data = ictx;
        return 0;
}

static int iommufd_fops_release(struct inode *inode, struct file *filp)
{
        struct iommufd_ctx *ictx = filp->private_data;
        struct iommufd_object *obj;

        /*
         * The objects in the xarray form a graph of "users" counts, and we
         * have to destroy them in a depth first manner. Leaf objects will
         * reduce the users count of interior objects when they are destroyed.
         *
         * Repeatedly destroying all the "1 users" leaf objects will progress
         * until the entire list is destroyed. If this can't progress then
         * there is some bug related to object refcounting.
         */
        while (!xa_empty(&ictx->objects)) {
                unsigned int destroyed = 0;
                unsigned long index;

                xa_for_each(&ictx->objects, index, obj) {
                        if (!refcount_dec_if_one(&obj->users))
                                continue;
                        destroyed++;
                        xa_erase(&ictx->objects, index);
                        iommufd_object_ops[obj->type].destroy(obj);
                        kfree(obj);
                }
                /* Bug related to users refcount */
                if (WARN_ON(!destroyed))
                        break;
        }
        kfree(ictx);
        return 0;
}

static int iommufd_option(struct iommufd_ucmd *ucmd)
{
        struct iommu_option *cmd = ucmd->cmd;
        int rc;

        if (cmd->__reserved)
                return -EOPNOTSUPP;

        switch (cmd->option_id) {
        case IOMMU_OPTION_RLIMIT_MODE:
                rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
                break;
        case IOMMU_OPTION_HUGE_PAGES:
                rc = iommufd_ioas_option(ucmd);
                break;
        default:
                return -EOPNOTSUPP;
        }
        if (rc)
                return rc;
        if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
                         &cmd->val64, sizeof(cmd->val64)))
                return -EFAULT;
        return 0;
}
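
/*
 * Userspace-side sketch (kept out of the build) of the contract implemented
 * by iommufd_option() above: the kernel writes the result back into val64 on
 * a GET. Field and constant names follow the uapi header <linux/iommufd.h>;
 * see that header for the meaning of the IOMMU_OPTION_RLIMIT_MODE value.
 */
#if 0
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int example_get_rlimit_mode(int iommufd, __u64 *mode)
{
        struct iommu_option opt = {
                .size = sizeof(opt),
                .option_id = IOMMU_OPTION_RLIMIT_MODE,
                .op = IOMMU_OPTION_OP_GET,
        };

        if (ioctl(iommufd, IOMMU_OPTION, &opt))
                return -errno;
        *mode = opt.val64;
        return 0;
}
#endif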

union ucmd_buffer {
        struct iommu_destroy destroy;
        struct iommu_ioas_alloc alloc;
        struct iommu_ioas_allow_iovas allow_iovas;
        struct iommu_ioas_copy ioas_copy;
        struct iommu_ioas_iova_ranges iova_ranges;
        struct iommu_ioas_map map;
        struct iommu_ioas_unmap unmap;
        struct iommu_option option;
        struct iommu_vfio_ioas vfio_ioas;
#ifdef CONFIG_IOMMUFD_TEST
        struct iommu_test_cmd test;
#endif
};

struct iommufd_ioctl_op {
        unsigned int size;
        unsigned int min_size;
        unsigned int ioctl_num;
        int (*execute)(struct iommufd_ucmd *ucmd);
};

#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
        [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = {                      \
                .size = sizeof(_struct) +                             \
                        BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
                                          sizeof(_struct)),           \
                .min_size = offsetofend(_struct, _last),              \
                .ioctl_num = _ioctl,                                  \
                .execute = _fn,                                       \
        }
static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
        IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
        IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
                 struct iommu_ioas_alloc, out_ioas_id),
        IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
                 struct iommu_ioas_allow_iovas, allowed_iovas),
        IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
                 src_iova),
        IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
                 struct iommu_ioas_iova_ranges, out_iova_alignment),
        IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
                 iova),
        IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
                 length),
        IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
                 val64),
        IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
                 __reserved),
#ifdef CONFIG_IOMMUFD_TEST
        IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
};

static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
                               unsigned long arg)
{
        struct iommufd_ctx *ictx = filp->private_data;
        const struct iommufd_ioctl_op *op;
        struct iommufd_ucmd ucmd = {};
        union ucmd_buffer buf;
        unsigned int nr;
        int ret;

        nr = _IOC_NR(cmd);
        if (nr < IOMMUFD_CMD_BASE ||
            (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops))
                return iommufd_vfio_ioctl(ictx, cmd, arg);

        ucmd.ictx = ictx;
        ucmd.ubuffer = (void __user *)arg;
        ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
        if (ret)
                return ret;

        op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE];
        if (op->ioctl_num != cmd)
                return -ENOIOCTLCMD;
        if (ucmd.user_size < op->min_size)
                return -EINVAL;

        ucmd.cmd = &buf;
        ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
                                    ucmd.user_size);
        if (ret)
                return ret;
        ret = op->execute(&ucmd);
        return ret;
}

static const struct file_operations iommufd_fops = {
        .owner = THIS_MODULE,
        .open = iommufd_fops_open,
        .release = iommufd_fops_release,
        .unlocked_ioctl = iommufd_fops_ioctl,
};
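
/*
 * Illustrative sketch (kept out of the build) of what wiring a hypothetical
 * new command into the dispatch above involves: the handler sees the command
 * struct already copied and zero-extended into ucmd->cmd by
 * iommufd_fops_ioctl(). "struct iommu_example_cmd", IOMMU_EXAMPLE and the
 * handler are invented; iommufd_ucmd_respond() is assumed to be the helper
 * from iommufd_private.h that copies the result back capped at
 * ucmd->user_size.
 */
#if 0
static int iommufd_example_ioctl(struct iommufd_ucmd *ucmd)
{
        struct iommu_example_cmd *cmd = ucmd->cmd;

        if (cmd->flags)
                return -EOPNOTSUPP;
        cmd->out_val = 0;
        return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
}

/*
 * The struct would also be added to union ucmd_buffer, and the table above
 * would gain:
 *      IOCTL_OP(IOMMU_EXAMPLE, iommufd_example_ioctl,
 *               struct iommu_example_cmd, out_val),
 */
#endif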

/**
 * iommufd_ctx_get - Get a context reference
 * @ictx: Context to get
 *
 * The caller must already hold a valid reference to ictx.
 */
void iommufd_ctx_get(struct iommufd_ctx *ictx)
{
        get_file(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);

/**
 * iommufd_ctx_from_file - Acquires a reference to the iommufd context
 * @file: File to obtain the reference from
 *
 * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file
 * remains owned by the caller and the caller must still do fput. On success
 * the caller is responsible for calling iommufd_ctx_put().
 */
struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
{
        struct iommufd_ctx *ictx;

        if (file->f_op != &iommufd_fops)
                return ERR_PTR(-EBADFD);
        ictx = file->private_data;
        iommufd_ctx_get(ictx);
        return ictx;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);

/**
 * iommufd_ctx_put - Put back a reference
 * @ictx: Context to put back
 */
void iommufd_ctx_put(struct iommufd_ctx *ictx)
{
        fput(ictx->file);
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);

static const struct iommufd_object_ops iommufd_object_ops[] = {
        [IOMMUFD_OBJ_ACCESS] = {
                .destroy = iommufd_access_destroy_object,
        },
        [IOMMUFD_OBJ_DEVICE] = {
                .destroy = iommufd_device_destroy,
        },
        [IOMMUFD_OBJ_IOAS] = {
                .destroy = iommufd_ioas_destroy,
        },
        [IOMMUFD_OBJ_HW_PAGETABLE] = {
                .destroy = iommufd_hw_pagetable_destroy,
        },
#ifdef CONFIG_IOMMUFD_TEST
        [IOMMUFD_OBJ_SELFTEST] = {
                .destroy = iommufd_selftest_destroy,
        },
#endif
};

static struct miscdevice iommu_misc_dev = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = "iommu",
        .fops = &iommufd_fops,
        .nodename = "iommu",
        .mode = 0660,
};

static struct miscdevice vfio_misc_dev = {
        .minor = VFIO_MINOR,
        .name = "vfio",
        .fops = &iommufd_fops,
        .nodename = "vfio/vfio",
        .mode = 0666,
};

static int __init iommufd_init(void)
{
        int ret;

        ret = misc_register(&iommu_misc_dev);
        if (ret)
                return ret;

        if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) {
                ret = misc_register(&vfio_misc_dev);
                if (ret)
                        goto err_misc;
        }
        iommufd_test_init();
        return 0;
err_misc:
        misc_deregister(&iommu_misc_dev);
        return ret;
}

static void __exit iommufd_exit(void)
{
        iommufd_test_exit();
        if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
                misc_deregister(&vfio_misc_dev);
        misc_deregister(&iommu_misc_dev);
}

module_init(iommufd_init);
module_exit(iommufd_exit);

#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
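
/*
 * Illustrative sketch (kept out of the build) of how an external module, the
 * intended consumer of the exported symbols above, borrows a context from a
 * userspace-supplied iommufd file descriptor. The reference rules follow the
 * kernel-doc of iommufd_ctx_from_file(): the fd's file reference stays with
 * the caller, while the returned ictx holds its own reference until
 * iommufd_ctx_put(). The fd handling and the "use ictx" step are invented for
 * illustration.
 */
#if 0
static int example_borrow_ictx(int fd)
{
        struct iommufd_ctx *ictx;
        struct file *file;

        file = fget(fd);
        if (!file)
                return -EBADF;

        ictx = iommufd_ctx_from_file(file);
        fput(file); /* ictx keeps its own file reference on success */
        if (IS_ERR(ictx))
                return PTR_ERR(ictx);

        /* ... bind a device, store ictx, etc ... */

        iommufd_ctx_put(ictx);
        return 0;
}
#endif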