1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Device driver to expose SGX enclave memory to KVM guests. 4 * 5 * Copyright(c) 2021 Intel Corporation. 6 */ 7 8 #include <linux/miscdevice.h> 9 #include <linux/mm.h> 10 #include <linux/mman.h> 11 #include <linux/sched/mm.h> 12 #include <linux/sched/signal.h> 13 #include <linux/slab.h> 14 #include <linux/xarray.h> 15 #include <asm/sgx.h> 16 #include <uapi/asm/sgx.h> 17 18 #include "encls.h" 19 #include "sgx.h" 20 21 struct sgx_vepc { 22 struct xarray page_array; 23 struct mutex lock; 24 }; 25 26 /* 27 * Temporary SECS pages that cannot be EREMOVE'd due to having child in other 28 * virtual EPC instances, and the lock to protect it. 29 */ 30 static struct mutex zombie_secs_pages_lock; 31 static struct list_head zombie_secs_pages; 32 33 static int __sgx_vepc_fault(struct sgx_vepc *vepc, 34 struct vm_area_struct *vma, unsigned long addr) 35 { 36 struct sgx_epc_page *epc_page; 37 unsigned long index, pfn; 38 int ret; 39 40 WARN_ON(!mutex_is_locked(&vepc->lock)); 41 42 /* Calculate index of EPC page in virtual EPC's page_array */ 43 index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); 44 45 epc_page = xa_load(&vepc->page_array, index); 46 if (epc_page) 47 return 0; 48 49 epc_page = sgx_alloc_epc_page(vepc, false); 50 if (IS_ERR(epc_page)) 51 return PTR_ERR(epc_page); 52 53 ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); 54 if (ret) 55 goto err_free; 56 57 pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); 58 59 ret = vmf_insert_pfn(vma, addr, pfn); 60 if (ret != VM_FAULT_NOPAGE) { 61 ret = -EFAULT; 62 goto err_delete; 63 } 64 65 return 0; 66 67 err_delete: 68 xa_erase(&vepc->page_array, index); 69 err_free: 70 sgx_free_epc_page(epc_page); 71 return ret; 72 } 73 74 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) 75 { 76 struct vm_area_struct *vma = vmf->vma; 77 struct sgx_vepc *vepc = vma->vm_private_data; 78 int ret; 79 80 mutex_lock(&vepc->lock); 81 ret = __sgx_vepc_fault(vepc, vma, vmf->address); 82 mutex_unlock(&vepc->lock); 83 84 if (!ret) 85 return VM_FAULT_NOPAGE; 86 87 if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { 88 mmap_read_unlock(vma->vm_mm); 89 return VM_FAULT_RETRY; 90 } 91 92 return VM_FAULT_SIGBUS; 93 } 94 95 static const struct vm_operations_struct sgx_vepc_vm_ops = { 96 .fault = sgx_vepc_fault, 97 }; 98 99 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) 100 { 101 struct sgx_vepc *vepc = file->private_data; 102 103 if (!(vma->vm_flags & VM_SHARED)) 104 return -EINVAL; 105 106 vma->vm_ops = &sgx_vepc_vm_ops; 107 /* Don't copy VMA in fork() */ 108 vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); 109 vma->vm_private_data = vepc; 110 111 return 0; 112 } 113 114 static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) 115 { 116 /* 117 * Take a previously guest-owned EPC page and return it to the 118 * general EPC page pool. 119 * 120 * Guests can not be trusted to have left this page in a good 121 * state, so run EREMOVE on the page unconditionally. In the 122 * case that a guest properly EREMOVE'd this page, a superfluous 123 * EREMOVE is harmless. 124 */ 125 return __eremove(sgx_get_epc_virt_addr(epc_page)); 126 } 127 128 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) 129 { 130 int ret = sgx_vepc_remove_page(epc_page); 131 if (ret) { 132 /* 133 * Only SGX_CHILD_PRESENT is expected, which is because of 134 * EREMOVE'ing an SECS still with child, in which case it can 135 * be handled by EREMOVE'ing the SECS again after all pages in 136 * virtual EPC have been EREMOVE'd. See comments in below in 137 * sgx_vepc_release(). 138 * 139 * The user of virtual EPC (KVM) needs to guarantee there's no 140 * logical processor is still running in the enclave in guest, 141 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be 142 * handled here. 143 */ 144 WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, 145 ret, ret); 146 return ret; 147 } 148 149 sgx_free_epc_page(epc_page); 150 return 0; 151 } 152 153 static long sgx_vepc_remove_all(struct sgx_vepc *vepc) 154 { 155 struct sgx_epc_page *entry; 156 unsigned long index; 157 long failures = 0; 158 159 xa_for_each(&vepc->page_array, index, entry) { 160 int ret = sgx_vepc_remove_page(entry); 161 if (ret) { 162 if (ret == SGX_CHILD_PRESENT) { 163 /* The page is a SECS, userspace will retry. */ 164 failures++; 165 } else { 166 /* 167 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not 168 * WARN, as userspace can induce said failures by 169 * calling the ioctl concurrently on multiple vEPCs or 170 * while one or more CPUs is running the enclave. Only 171 * a #PF on EREMOVE indicates a kernel/hardware issue. 172 */ 173 WARN_ON_ONCE(encls_faulted(ret) && 174 ENCLS_TRAPNR(ret) != X86_TRAP_GP); 175 return -EBUSY; 176 } 177 } 178 cond_resched(); 179 } 180 181 /* 182 * Return the number of SECS pages that failed to be removed, so 183 * userspace knows that it has to retry. 184 */ 185 return failures; 186 } 187 188 static int sgx_vepc_release(struct inode *inode, struct file *file) 189 { 190 struct sgx_vepc *vepc = file->private_data; 191 struct sgx_epc_page *epc_page, *tmp, *entry; 192 unsigned long index; 193 194 LIST_HEAD(secs_pages); 195 196 xa_for_each(&vepc->page_array, index, entry) { 197 /* 198 * Remove all normal, child pages. sgx_vepc_free_page() 199 * will fail if EREMOVE fails, but this is OK and expected on 200 * SECS pages. Those can only be EREMOVE'd *after* all their 201 * child pages. Retries below will clean them up. 202 */ 203 if (sgx_vepc_free_page(entry)) 204 continue; 205 206 xa_erase(&vepc->page_array, index); 207 cond_resched(); 208 } 209 210 /* 211 * Retry EREMOVE'ing pages. This will clean up any SECS pages that 212 * only had children in this 'epc' area. 213 */ 214 xa_for_each(&vepc->page_array, index, entry) { 215 epc_page = entry; 216 /* 217 * An EREMOVE failure here means that the SECS page still 218 * has children. But, since all children in this 'sgx_vepc' 219 * have been removed, the SECS page must have a child on 220 * another instance. 221 */ 222 if (sgx_vepc_free_page(epc_page)) 223 list_add_tail(&epc_page->list, &secs_pages); 224 225 xa_erase(&vepc->page_array, index); 226 cond_resched(); 227 } 228 229 /* 230 * SECS pages are "pinned" by child pages, and "unpinned" once all 231 * children have been EREMOVE'd. A child page in this instance 232 * may have pinned an SECS page encountered in an earlier release(), 233 * creating a zombie. Since some children were EREMOVE'd above, 234 * try to EREMOVE all zombies in the hopes that one was unpinned. 235 */ 236 mutex_lock(&zombie_secs_pages_lock); 237 list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { 238 /* 239 * Speculatively remove the page from the list of zombies, 240 * if the page is successfully EREMOVE'd it will be added to 241 * the list of free pages. If EREMOVE fails, throw the page 242 * on the local list, which will be spliced on at the end. 243 */ 244 list_del(&epc_page->list); 245 246 if (sgx_vepc_free_page(epc_page)) 247 list_add_tail(&epc_page->list, &secs_pages); 248 cond_resched(); 249 } 250 251 if (!list_empty(&secs_pages)) 252 list_splice_tail(&secs_pages, &zombie_secs_pages); 253 mutex_unlock(&zombie_secs_pages_lock); 254 255 xa_destroy(&vepc->page_array); 256 kfree(vepc); 257 258 return 0; 259 } 260 261 static int sgx_vepc_open(struct inode *inode, struct file *file) 262 { 263 struct sgx_vepc *vepc; 264 265 vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); 266 if (!vepc) 267 return -ENOMEM; 268 mutex_init(&vepc->lock); 269 xa_init(&vepc->page_array); 270 271 file->private_data = vepc; 272 273 return 0; 274 } 275 276 static long sgx_vepc_ioctl(struct file *file, 277 unsigned int cmd, unsigned long arg) 278 { 279 struct sgx_vepc *vepc = file->private_data; 280 281 switch (cmd) { 282 case SGX_IOC_VEPC_REMOVE_ALL: 283 if (arg) 284 return -EINVAL; 285 return sgx_vepc_remove_all(vepc); 286 287 default: 288 return -ENOTTY; 289 } 290 } 291 292 static const struct file_operations sgx_vepc_fops = { 293 .owner = THIS_MODULE, 294 .open = sgx_vepc_open, 295 .unlocked_ioctl = sgx_vepc_ioctl, 296 .compat_ioctl = sgx_vepc_ioctl, 297 .release = sgx_vepc_release, 298 .mmap = sgx_vepc_mmap, 299 }; 300 301 static struct miscdevice sgx_vepc_dev = { 302 .minor = MISC_DYNAMIC_MINOR, 303 .name = "sgx_vepc", 304 .nodename = "sgx_vepc", 305 .fops = &sgx_vepc_fops, 306 }; 307 308 int __init sgx_vepc_init(void) 309 { 310 /* SGX virtualization requires KVM to work */ 311 if (!cpu_feature_enabled(X86_FEATURE_VMX)) 312 return -ENODEV; 313 314 INIT_LIST_HEAD(&zombie_secs_pages); 315 mutex_init(&zombie_secs_pages_lock); 316 317 return misc_register(&sgx_vepc_dev); 318 } 319 320 /** 321 * sgx_virt_ecreate() - Run ECREATE on behalf of guest 322 * @pageinfo: Pointer to PAGEINFO structure 323 * @secs: Userspace pointer to SECS page 324 * @trapnr: trap number injected to guest in case of ECREATE error 325 * 326 * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose 327 * of enforcing policies of guest's enclaves, and return the trap number 328 * which should be injected to guest in case of any ECREATE error. 329 * 330 * Return: 331 * - 0: ECREATE was successful. 332 * - <0: on error. 333 */ 334 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, 335 int *trapnr) 336 { 337 int ret; 338 339 /* 340 * @secs is an untrusted, userspace-provided address. It comes from 341 * KVM and is assumed to be a valid pointer which points somewhere in 342 * userspace. This can fault and call SGX or other fault handlers when 343 * userspace mapping @secs doesn't exist. 344 * 345 * Add a WARN() to make sure @secs is already valid userspace pointer 346 * from caller (KVM), who should already have handled invalid pointer 347 * case (for instance, made by malicious guest). All other checks, 348 * such as alignment of @secs, are deferred to ENCLS itself. 349 */ 350 if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) 351 return -EINVAL; 352 353 __uaccess_begin(); 354 ret = __ecreate(pageinfo, (void *)secs); 355 __uaccess_end(); 356 357 if (encls_faulted(ret)) { 358 *trapnr = ENCLS_TRAPNR(ret); 359 return -EFAULT; 360 } 361 362 /* ECREATE doesn't return an error code, it faults or succeeds. */ 363 WARN_ON_ONCE(ret); 364 return 0; 365 } 366 EXPORT_SYMBOL_GPL(sgx_virt_ecreate); 367 368 static int __sgx_virt_einit(void __user *sigstruct, void __user *token, 369 void __user *secs) 370 { 371 int ret; 372 373 /* 374 * Make sure all userspace pointers from caller (KVM) are valid. 375 * All other checks deferred to ENCLS itself. Also see comment 376 * for @secs in sgx_virt_ecreate(). 377 */ 378 #define SGX_EINITTOKEN_SIZE 304 379 if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || 380 !access_ok(token, SGX_EINITTOKEN_SIZE) || 381 !access_ok(secs, PAGE_SIZE))) 382 return -EINVAL; 383 384 __uaccess_begin(); 385 ret = __einit((void *)sigstruct, (void *)token, (void *)secs); 386 __uaccess_end(); 387 388 return ret; 389 } 390 391 /** 392 * sgx_virt_einit() - Run EINIT on behalf of guest 393 * @sigstruct: Userspace pointer to SIGSTRUCT structure 394 * @token: Userspace pointer to EINITTOKEN structure 395 * @secs: Userspace pointer to SECS page 396 * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values 397 * @trapnr: trap number injected to guest in case of EINIT error 398 * 399 * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available 400 * in host, SGX driver may rewrite the hardware values at wish, therefore KVM 401 * needs to update hardware values to guest's virtual MSR values in order to 402 * ensure EINIT is executed with expected hardware values. 403 * 404 * Return: 405 * - 0: EINIT was successful. 406 * - <0: on error. 407 */ 408 int sgx_virt_einit(void __user *sigstruct, void __user *token, 409 void __user *secs, u64 *lepubkeyhash, int *trapnr) 410 { 411 int ret; 412 413 if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { 414 ret = __sgx_virt_einit(sigstruct, token, secs); 415 } else { 416 preempt_disable(); 417 418 sgx_update_lepubkeyhash(lepubkeyhash); 419 420 ret = __sgx_virt_einit(sigstruct, token, secs); 421 preempt_enable(); 422 } 423 424 /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ 425 if (ret == -EINVAL) 426 return ret; 427 428 if (encls_faulted(ret)) { 429 *trapnr = ENCLS_TRAPNR(ret); 430 return -EFAULT; 431 } 432 433 return ret; 434 } 435 EXPORT_SYMBOL_GPL(sgx_virt_einit); 436