1 /* 2 * QEMU TDX support 3 * 4 * Copyright (c) 2025 Intel Corporation 5 * 6 * Author: 7 * Xiaoyao Li <xiaoyao.li@intel.com> 8 * 9 * SPDX-License-Identifier: GPL-2.0-or-later 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/error-report.h" 14 #include "qemu/base64.h" 15 #include "qemu/mmap-alloc.h" 16 #include "qapi/error.h" 17 #include "qom/object_interfaces.h" 18 #include "crypto/hash.h" 19 #include "system/system.h" 20 21 #include "hw/i386/e820_memory_layout.h" 22 #include "hw/i386/tdvf.h" 23 #include "hw/i386/x86.h" 24 #include "kvm_i386.h" 25 #include "tdx.h" 26 27 #define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) 28 #define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) 29 30 #define TDX_TD_ATTRIBUTES_DEBUG BIT_ULL(0) 31 #define TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE BIT_ULL(28) 32 #define TDX_TD_ATTRIBUTES_PKS BIT_ULL(30) 33 #define TDX_TD_ATTRIBUTES_PERFMON BIT_ULL(63) 34 35 #define TDX_SUPPORTED_TD_ATTRS (TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE |\ 36 TDX_TD_ATTRIBUTES_PKS | \ 37 TDX_TD_ATTRIBUTES_PERFMON) 38 39 static TdxGuest *tdx_guest; 40 41 static struct kvm_tdx_capabilities *tdx_caps; 42 43 /* Valid after kvm_arch_init()->confidential_guest_kvm_init()->tdx_kvm_init() */ 44 bool is_tdx_vm(void) 45 { 46 return !!tdx_guest; 47 } 48 49 enum tdx_ioctl_level { 50 TDX_VM_IOCTL, 51 TDX_VCPU_IOCTL, 52 }; 53 54 static int tdx_ioctl_internal(enum tdx_ioctl_level level, void *state, 55 int cmd_id, __u32 flags, void *data, 56 Error **errp) 57 { 58 struct kvm_tdx_cmd tdx_cmd = {}; 59 int r; 60 61 const char *tdx_ioctl_name[] = { 62 [KVM_TDX_CAPABILITIES] = "KVM_TDX_CAPABILITIES", 63 [KVM_TDX_INIT_VM] = "KVM_TDX_INIT_VM", 64 [KVM_TDX_INIT_VCPU] = "KVM_TDX_INIT_VCPU", 65 [KVM_TDX_INIT_MEM_REGION] = "KVM_TDX_INIT_MEM_REGION", 66 [KVM_TDX_FINALIZE_VM] = "KVM_TDX_FINALIZE_VM", 67 [KVM_TDX_GET_CPUID] = "KVM_TDX_GET_CPUID", 68 }; 69 70 tdx_cmd.id = cmd_id; 71 tdx_cmd.flags = flags; 72 tdx_cmd.data = (__u64)(unsigned long)data; 73 74 switch (level) { 75 case TDX_VM_IOCTL: 76 r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd); 77 break; 78 case TDX_VCPU_IOCTL: 79 r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd); 80 break; 81 default: 82 error_setg(errp, "Invalid tdx_ioctl_level %d", level); 83 return -EINVAL; 84 } 85 86 if (r < 0) { 87 error_setg_errno(errp, -r, "TDX ioctl %s failed, hw_errors: 0x%llx", 88 tdx_ioctl_name[cmd_id], tdx_cmd.hw_error); 89 } 90 return r; 91 } 92 93 static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data, 94 Error **errp) 95 { 96 return tdx_ioctl_internal(TDX_VM_IOCTL, NULL, cmd_id, flags, data, errp); 97 } 98 99 static inline int tdx_vcpu_ioctl(CPUState *cpu, int cmd_id, __u32 flags, 100 void *data, Error **errp) 101 { 102 return tdx_ioctl_internal(TDX_VCPU_IOCTL, cpu, cmd_id, flags, data, errp); 103 } 104 105 static int get_tdx_capabilities(Error **errp) 106 { 107 struct kvm_tdx_capabilities *caps; 108 /* 1st generation of TDX reports 6 cpuid configs */ 109 int nr_cpuid_configs = 6; 110 size_t size; 111 int r; 112 113 do { 114 Error *local_err = NULL; 115 size = sizeof(struct kvm_tdx_capabilities) + 116 nr_cpuid_configs * sizeof(struct kvm_cpuid_entry2); 117 caps = g_malloc0(size); 118 caps->cpuid.nent = nr_cpuid_configs; 119 120 r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps, &local_err); 121 if (r == -E2BIG) { 122 g_free(caps); 123 nr_cpuid_configs *= 2; 124 if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) { 125 error_report("KVM TDX seems broken that number of CPUID entries" 126 " in kvm_tdx_capabilities exceeds limit: %d", 127 KVM_MAX_CPUID_ENTRIES); 128 error_propagate(errp, local_err); 129 return r; 130 } 131 error_free(local_err); 132 } else if (r < 0) { 133 g_free(caps); 134 error_propagate(errp, local_err); 135 return r; 136 } 137 } while (r == -E2BIG); 138 139 tdx_caps = caps; 140 141 return 0; 142 } 143 144 void tdx_set_tdvf_region(MemoryRegion *tdvf_mr) 145 { 146 assert(!tdx_guest->tdvf_mr); 147 tdx_guest->tdvf_mr = tdvf_mr; 148 } 149 150 static void tdx_add_ram_entry(uint64_t address, uint64_t length, 151 enum TdxRamType type) 152 { 153 uint32_t nr_entries = tdx_guest->nr_ram_entries; 154 tdx_guest->ram_entries = g_renew(TdxRamEntry, tdx_guest->ram_entries, 155 nr_entries + 1); 156 157 tdx_guest->ram_entries[nr_entries].address = address; 158 tdx_guest->ram_entries[nr_entries].length = length; 159 tdx_guest->ram_entries[nr_entries].type = type; 160 tdx_guest->nr_ram_entries++; 161 } 162 163 static int tdx_accept_ram_range(uint64_t address, uint64_t length) 164 { 165 uint64_t head_start, tail_start, head_length, tail_length; 166 uint64_t tmp_address, tmp_length; 167 TdxRamEntry *e; 168 int i = 0; 169 170 do { 171 if (i == tdx_guest->nr_ram_entries) { 172 return -1; 173 } 174 175 e = &tdx_guest->ram_entries[i++]; 176 } while (address + length <= e->address || address >= e->address + e->length); 177 178 /* 179 * The to-be-accepted ram range must be fully contained by one 180 * RAM entry. 181 */ 182 if (e->address > address || 183 e->address + e->length < address + length) { 184 return -1; 185 } 186 187 if (e->type == TDX_RAM_ADDED) { 188 return 0; 189 } 190 191 tmp_address = e->address; 192 tmp_length = e->length; 193 194 e->address = address; 195 e->length = length; 196 e->type = TDX_RAM_ADDED; 197 198 head_length = address - tmp_address; 199 if (head_length > 0) { 200 head_start = tmp_address; 201 tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED); 202 } 203 204 tail_start = address + length; 205 if (tail_start < tmp_address + tmp_length) { 206 tail_length = tmp_address + tmp_length - tail_start; 207 tdx_add_ram_entry(tail_start, tail_length, TDX_RAM_UNACCEPTED); 208 } 209 210 return 0; 211 } 212 213 static int tdx_ram_entry_compare(const void *lhs_, const void* rhs_) 214 { 215 const TdxRamEntry *lhs = lhs_; 216 const TdxRamEntry *rhs = rhs_; 217 218 if (lhs->address == rhs->address) { 219 return 0; 220 } 221 if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) { 222 return 1; 223 } 224 return -1; 225 } 226 227 static void tdx_init_ram_entries(void) 228 { 229 unsigned i, j, nr_e820_entries; 230 231 nr_e820_entries = e820_get_table(NULL); 232 tdx_guest->ram_entries = g_new(TdxRamEntry, nr_e820_entries); 233 234 for (i = 0, j = 0; i < nr_e820_entries; i++) { 235 uint64_t addr, len; 236 237 if (e820_get_entry(i, E820_RAM, &addr, &len)) { 238 tdx_guest->ram_entries[j].address = addr; 239 tdx_guest->ram_entries[j].length = len; 240 tdx_guest->ram_entries[j].type = TDX_RAM_UNACCEPTED; 241 j++; 242 } 243 } 244 tdx_guest->nr_ram_entries = j; 245 } 246 247 static void tdx_finalize_vm(Notifier *notifier, void *unused) 248 { 249 TdxFirmware *tdvf = &tdx_guest->tdvf; 250 TdxFirmwareEntry *entry; 251 252 tdx_init_ram_entries(); 253 254 for_each_tdx_fw_entry(tdvf, entry) { 255 switch (entry->type) { 256 case TDVF_SECTION_TYPE_BFV: 257 case TDVF_SECTION_TYPE_CFV: 258 entry->mem_ptr = tdvf->mem_ptr + entry->data_offset; 259 break; 260 case TDVF_SECTION_TYPE_TD_HOB: 261 case TDVF_SECTION_TYPE_TEMP_MEM: 262 entry->mem_ptr = qemu_ram_mmap(-1, entry->size, 263 qemu_real_host_page_size(), 0, 0); 264 if (entry->mem_ptr == MAP_FAILED) { 265 error_report("Failed to mmap memory for TDVF section %d", 266 entry->type); 267 exit(1); 268 } 269 if (tdx_accept_ram_range(entry->address, entry->size)) { 270 error_report("Failed to accept memory for TDVF section %d", 271 entry->type); 272 qemu_ram_munmap(-1, entry->mem_ptr, entry->size); 273 exit(1); 274 } 275 break; 276 default: 277 error_report("Unsupported TDVF section %d", entry->type); 278 exit(1); 279 } 280 } 281 282 qsort(tdx_guest->ram_entries, tdx_guest->nr_ram_entries, 283 sizeof(TdxRamEntry), &tdx_ram_entry_compare); 284 } 285 286 static Notifier tdx_machine_done_notify = { 287 .notify = tdx_finalize_vm, 288 }; 289 290 static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) 291 { 292 TdxGuest *tdx = TDX_GUEST(cgs); 293 int r = 0; 294 295 kvm_mark_guest_state_protected(); 296 297 if (!tdx_caps) { 298 r = get_tdx_capabilities(errp); 299 if (r) { 300 return r; 301 } 302 } 303 304 qemu_add_machine_init_done_notifier(&tdx_machine_done_notify); 305 306 tdx_guest = tdx; 307 return 0; 308 } 309 310 static int tdx_kvm_type(X86ConfidentialGuest *cg) 311 { 312 /* Do the object check */ 313 TDX_GUEST(cg); 314 315 return KVM_X86_TDX_VM; 316 } 317 318 static int tdx_validate_attributes(TdxGuest *tdx, Error **errp) 319 { 320 if ((tdx->attributes & ~tdx_caps->supported_attrs)) { 321 error_setg(errp, "Invalid attributes 0x%lx for TDX VM " 322 "(KVM supported: 0x%llx)", tdx->attributes, 323 tdx_caps->supported_attrs); 324 return -1; 325 } 326 327 if (tdx->attributes & ~TDX_SUPPORTED_TD_ATTRS) { 328 error_setg(errp, "Some QEMU unsupported TD attribute bits being " 329 "requested: 0x%lx (QEMU supported: 0x%llx)", 330 tdx->attributes, TDX_SUPPORTED_TD_ATTRS); 331 return -1; 332 } 333 334 return 0; 335 } 336 337 static int setup_td_guest_attributes(X86CPU *x86cpu, Error **errp) 338 { 339 CPUX86State *env = &x86cpu->env; 340 341 tdx_guest->attributes |= (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS) ? 342 TDX_TD_ATTRIBUTES_PKS : 0; 343 tdx_guest->attributes |= x86cpu->enable_pmu ? TDX_TD_ATTRIBUTES_PERFMON : 0; 344 345 return tdx_validate_attributes(tdx_guest, errp); 346 } 347 348 static int setup_td_xfam(X86CPU *x86cpu, Error **errp) 349 { 350 CPUX86State *env = &x86cpu->env; 351 uint64_t xfam; 352 353 xfam = env->features[FEAT_XSAVE_XCR0_LO] | 354 env->features[FEAT_XSAVE_XCR0_HI] | 355 env->features[FEAT_XSAVE_XSS_LO] | 356 env->features[FEAT_XSAVE_XSS_HI]; 357 358 if (xfam & ~tdx_caps->supported_xfam) { 359 error_setg(errp, "Invalid XFAM 0x%lx for TDX VM (supported: 0x%llx))", 360 xfam, tdx_caps->supported_xfam); 361 return -1; 362 } 363 364 tdx_guest->xfam = xfam; 365 return 0; 366 } 367 368 static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids) 369 { 370 int i, dest_cnt = 0; 371 struct kvm_cpuid_entry2 *src, *dest, *conf; 372 373 for (i = 0; i < cpuids->nent; i++) { 374 src = cpuids->entries + i; 375 conf = cpuid_find_entry(&tdx_caps->cpuid, src->function, src->index); 376 if (!conf) { 377 continue; 378 } 379 dest = cpuids->entries + dest_cnt; 380 381 dest->function = src->function; 382 dest->index = src->index; 383 dest->flags = src->flags; 384 dest->eax = src->eax & conf->eax; 385 dest->ebx = src->ebx & conf->ebx; 386 dest->ecx = src->ecx & conf->ecx; 387 dest->edx = src->edx & conf->edx; 388 389 dest_cnt++; 390 } 391 cpuids->nent = dest_cnt++; 392 } 393 394 int tdx_pre_create_vcpu(CPUState *cpu, Error **errp) 395 { 396 X86CPU *x86cpu = X86_CPU(cpu); 397 CPUX86State *env = &x86cpu->env; 398 g_autofree struct kvm_tdx_init_vm *init_vm = NULL; 399 Error *local_err = NULL; 400 size_t data_len; 401 int retry = 10000; 402 int r = 0; 403 404 QEMU_LOCK_GUARD(&tdx_guest->lock); 405 if (tdx_guest->initialized) { 406 return r; 407 } 408 409 init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) + 410 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES); 411 412 if (!kvm_check_extension(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS)) { 413 error_setg(errp, "KVM doesn't support KVM_CAP_X86_APIC_BUS_CYCLES_NS"); 414 return -EOPNOTSUPP; 415 } 416 417 r = kvm_vm_enable_cap(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS, 418 0, TDX_APIC_BUS_CYCLES_NS); 419 if (r < 0) { 420 error_setg_errno(errp, -r, 421 "Unable to set core crystal clock frequency to 25MHz"); 422 return r; 423 } 424 425 if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ || 426 env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) { 427 error_setg(errp, "Invalid TSC %ld KHz, must specify cpu_frequency " 428 "between [%d, %d] kHz", env->tsc_khz, 429 TDX_MIN_TSC_FREQUENCY_KHZ, TDX_MAX_TSC_FREQUENCY_KHZ); 430 return -EINVAL; 431 } 432 433 if (env->tsc_khz % (25 * 1000)) { 434 error_setg(errp, "Invalid TSC %ld KHz, it must be multiple of 25MHz", 435 env->tsc_khz); 436 return -EINVAL; 437 } 438 439 /* it's safe even env->tsc_khz is 0. KVM uses host's tsc_khz in this case */ 440 r = kvm_vm_ioctl(kvm_state, KVM_SET_TSC_KHZ, env->tsc_khz); 441 if (r < 0) { 442 error_setg_errno(errp, -r, "Unable to set TSC frequency to %ld kHz", 443 env->tsc_khz); 444 return r; 445 } 446 447 if (tdx_guest->mrconfigid) { 448 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid, 449 strlen(tdx_guest->mrconfigid), &data_len, errp); 450 if (!data) { 451 return -1; 452 } 453 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 454 error_setg(errp, "TDX: failed to decode mrconfigid"); 455 return -1; 456 } 457 memcpy(init_vm->mrconfigid, data, data_len); 458 } 459 460 if (tdx_guest->mrowner) { 461 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner, 462 strlen(tdx_guest->mrowner), &data_len, errp); 463 if (!data) { 464 return -1; 465 } 466 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 467 error_setg(errp, "TDX: failed to decode mrowner"); 468 return -1; 469 } 470 memcpy(init_vm->mrowner, data, data_len); 471 } 472 473 if (tdx_guest->mrownerconfig) { 474 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrownerconfig, 475 strlen(tdx_guest->mrownerconfig), &data_len, errp); 476 if (!data) { 477 return -1; 478 } 479 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 480 error_setg(errp, "TDX: failed to decode mrownerconfig"); 481 return -1; 482 } 483 memcpy(init_vm->mrownerconfig, data, data_len); 484 } 485 486 r = setup_td_guest_attributes(x86cpu, errp); 487 if (r) { 488 return r; 489 } 490 491 r = setup_td_xfam(x86cpu, errp); 492 if (r) { 493 return r; 494 } 495 496 init_vm->cpuid.nent = kvm_x86_build_cpuid(env, init_vm->cpuid.entries, 0); 497 tdx_filter_cpuid(&init_vm->cpuid); 498 499 init_vm->attributes = tdx_guest->attributes; 500 init_vm->xfam = tdx_guest->xfam; 501 502 /* 503 * KVM_TDX_INIT_VM gets -EAGAIN when KVM side SEAMCALL(TDH_MNG_CREATE) 504 * gets TDX_RND_NO_ENTROPY due to Random number generation (e.g., RDRAND or 505 * RDSEED) is busy. 506 * 507 * Retry for the case. 508 */ 509 do { 510 error_free(local_err); 511 local_err = NULL; 512 r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, init_vm, &local_err); 513 } while (r == -EAGAIN && --retry); 514 515 if (r < 0) { 516 if (!retry) { 517 error_append_hint(&local_err, "Hardware RNG (Random Number " 518 "Generator) is busy occupied by someone (via RDRAND/RDSEED) " 519 "maliciously, which leads to KVM_TDX_INIT_VM keeping failure " 520 "due to lack of entropy.\n"); 521 } 522 error_propagate(errp, local_err); 523 return r; 524 } 525 526 tdx_guest->initialized = true; 527 528 return 0; 529 } 530 531 int tdx_parse_tdvf(void *flash_ptr, int size) 532 { 533 return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size); 534 } 535 536 static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp) 537 { 538 TdxGuest *tdx = TDX_GUEST(obj); 539 540 return !!(tdx->attributes & TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE); 541 } 542 543 static void tdx_guest_set_sept_ve_disable(Object *obj, bool value, Error **errp) 544 { 545 TdxGuest *tdx = TDX_GUEST(obj); 546 547 if (value) { 548 tdx->attributes |= TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 549 } else { 550 tdx->attributes &= ~TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 551 } 552 } 553 554 static char *tdx_guest_get_mrconfigid(Object *obj, Error **errp) 555 { 556 TdxGuest *tdx = TDX_GUEST(obj); 557 558 return g_strdup(tdx->mrconfigid); 559 } 560 561 static void tdx_guest_set_mrconfigid(Object *obj, const char *value, Error **errp) 562 { 563 TdxGuest *tdx = TDX_GUEST(obj); 564 565 g_free(tdx->mrconfigid); 566 tdx->mrconfigid = g_strdup(value); 567 } 568 569 static char *tdx_guest_get_mrowner(Object *obj, Error **errp) 570 { 571 TdxGuest *tdx = TDX_GUEST(obj); 572 573 return g_strdup(tdx->mrowner); 574 } 575 576 static void tdx_guest_set_mrowner(Object *obj, const char *value, Error **errp) 577 { 578 TdxGuest *tdx = TDX_GUEST(obj); 579 580 g_free(tdx->mrowner); 581 tdx->mrowner = g_strdup(value); 582 } 583 584 static char *tdx_guest_get_mrownerconfig(Object *obj, Error **errp) 585 { 586 TdxGuest *tdx = TDX_GUEST(obj); 587 588 return g_strdup(tdx->mrownerconfig); 589 } 590 591 static void tdx_guest_set_mrownerconfig(Object *obj, const char *value, Error **errp) 592 { 593 TdxGuest *tdx = TDX_GUEST(obj); 594 595 g_free(tdx->mrownerconfig); 596 tdx->mrownerconfig = g_strdup(value); 597 } 598 599 /* tdx guest */ 600 OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest, 601 tdx_guest, 602 TDX_GUEST, 603 X86_CONFIDENTIAL_GUEST, 604 { TYPE_USER_CREATABLE }, 605 { NULL }) 606 607 static void tdx_guest_init(Object *obj) 608 { 609 ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj); 610 TdxGuest *tdx = TDX_GUEST(obj); 611 612 qemu_mutex_init(&tdx->lock); 613 614 cgs->require_guest_memfd = true; 615 tdx->attributes = TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 616 617 object_property_add_uint64_ptr(obj, "attributes", &tdx->attributes, 618 OBJ_PROP_FLAG_READWRITE); 619 object_property_add_bool(obj, "sept-ve-disable", 620 tdx_guest_get_sept_ve_disable, 621 tdx_guest_set_sept_ve_disable); 622 object_property_add_str(obj, "mrconfigid", 623 tdx_guest_get_mrconfigid, 624 tdx_guest_set_mrconfigid); 625 object_property_add_str(obj, "mrowner", 626 tdx_guest_get_mrowner, tdx_guest_set_mrowner); 627 object_property_add_str(obj, "mrownerconfig", 628 tdx_guest_get_mrownerconfig, 629 tdx_guest_set_mrownerconfig); 630 } 631 632 static void tdx_guest_finalize(Object *obj) 633 { 634 } 635 636 static void tdx_guest_class_init(ObjectClass *oc, const void *data) 637 { 638 ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc); 639 X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc); 640 641 klass->kvm_init = tdx_kvm_init; 642 x86_klass->kvm_type = tdx_kvm_type; 643 } 644