/*
 * QEMU TDX support
 *
 * Copyright (c) 2025 Intel Corporation
 *
 * Author:
 *      Xiaoyao Li <xiaoyao.li@intel.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qemu/base64.h"
#include "qemu/mmap-alloc.h"
#include "qapi/error.h"
#include "qapi/qapi-visit-sockets.h"
#include "qom/object_interfaces.h"
#include "crypto/hash.h"
#include "system/kvm_int.h"
#include "system/runstate.h"
#include "system/system.h"
#include "system/ramblock.h"
#include "system/address-spaces.h"

#include <linux/kvm_para.h>

#include "cpu.h"
#include "cpu-internal.h"
#include "host-cpu.h"
#include "hw/i386/apic_internal.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/e820_memory_layout.h"
#include "hw/i386/tdvf.h"
#include "hw/i386/x86.h"
#include "hw/i386/tdvf-hob.h"
#include "hw/pci/msi.h"
#include "kvm_i386.h"
#include "tdx.h"
#include "tdx-quote-generator.h"

#include "standard-headers/asm-x86/kvm_para.h"

#define TDX_MIN_TSC_FREQUENCY_KHZ   (100 * 1000)
#define TDX_MAX_TSC_FREQUENCY_KHZ   (10 * 1000 * 1000)

#define TDX_TD_ATTRIBUTES_DEBUG             BIT_ULL(0)
#define TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE   BIT_ULL(28)
#define TDX_TD_ATTRIBUTES_PKS               BIT_ULL(30)
#define TDX_TD_ATTRIBUTES_PERFMON           BIT_ULL(63)

#define TDX_SUPPORTED_TD_ATTRS  (TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE |\
                                 TDX_TD_ATTRIBUTES_PKS | \
                                 TDX_TD_ATTRIBUTES_PERFMON)

#define TDX_SUPPORTED_KVM_FEATURES  ((1U << KVM_FEATURE_NOP_IO_DELAY) | \
                                     (1U << KVM_FEATURE_PV_UNHALT) | \
                                     (1U << KVM_FEATURE_PV_TLB_FLUSH) | \
                                     (1U << KVM_FEATURE_PV_SEND_IPI) | \
                                     (1U << KVM_FEATURE_POLL_CONTROL) | \
                                     (1U << KVM_FEATURE_PV_SCHED_YIELD) | \
                                     (1U << KVM_FEATURE_MSI_EXT_DEST_ID))

static TdxGuest *tdx_guest;

static struct kvm_tdx_capabilities *tdx_caps;
static struct kvm_cpuid2 *tdx_supported_cpuid;

/* Valid after kvm_arch_init()->confidential_guest_kvm_init()->tdx_kvm_init() */
bool is_tdx_vm(void)
{
    return !!tdx_guest;
}

enum tdx_ioctl_level {
    TDX_VM_IOCTL,
    TDX_VCPU_IOCTL,
};

static int tdx_ioctl_internal(enum tdx_ioctl_level level, void *state,
                              int cmd_id, __u32 flags, void *data,
                              Error **errp)
{
    struct kvm_tdx_cmd tdx_cmd = {};
    int r;

    const char *tdx_ioctl_name[] = {
        [KVM_TDX_CAPABILITIES] = "KVM_TDX_CAPABILITIES",
        [KVM_TDX_INIT_VM] = "KVM_TDX_INIT_VM",
        [KVM_TDX_INIT_VCPU] = "KVM_TDX_INIT_VCPU",
        [KVM_TDX_INIT_MEM_REGION] = "KVM_TDX_INIT_MEM_REGION",
        [KVM_TDX_FINALIZE_VM] = "KVM_TDX_FINALIZE_VM",
        [KVM_TDX_GET_CPUID] = "KVM_TDX_GET_CPUID",
    };

    tdx_cmd.id = cmd_id;
    tdx_cmd.flags = flags;
    tdx_cmd.data = (__u64)(unsigned long)data;

    switch (level) {
    case TDX_VM_IOCTL:
        r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
        break;
    case TDX_VCPU_IOCTL:
        r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
        break;
    default:
        error_setg(errp, "Invalid tdx_ioctl_level %d", level);
        return -EINVAL;
    }

    if (r < 0) {
        error_setg_errno(errp, -r, "TDX ioctl %s failed, hw_errors: 0x%llx",
                         tdx_ioctl_name[cmd_id], tdx_cmd.hw_error);
    }
    return r;
}

static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data,
                               Error **errp)
{
    return tdx_ioctl_internal(TDX_VM_IOCTL, NULL, cmd_id, flags, data, errp);
}

static inline int tdx_vcpu_ioctl(CPUState *cpu, int cmd_id, __u32 flags,
                                 void *data, Error **errp)
{
    return tdx_ioctl_internal(TDX_VCPU_IOCTL, cpu, cmd_id, flags, data, errp);
}
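/*
 * Query KVM for the TDX capabilities. The number of CPUID configs is not
 * known up front, so the buffer size is negotiated: start with the 6
 * entries reported by the 1st generation of TDX and double the count
 * whenever KVM returns -E2BIG, until the ioctl succeeds or
 * KVM_MAX_CPUID_ENTRIES is exceeded.
 */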
static int get_tdx_capabilities(Error **errp)
{
    struct kvm_tdx_capabilities *caps;
    /* 1st generation of TDX reports 6 cpuid configs */
    int nr_cpuid_configs = 6;
    size_t size;
    int r;

    do {
        Error *local_err = NULL;
        size = sizeof(struct kvm_tdx_capabilities) +
               nr_cpuid_configs * sizeof(struct kvm_cpuid_entry2);
        caps = g_malloc0(size);
        caps->cpuid.nent = nr_cpuid_configs;

        r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps, &local_err);
        if (r == -E2BIG) {
            g_free(caps);
            nr_cpuid_configs *= 2;
            if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) {
                error_report("KVM TDX seems broken: the number of CPUID entries"
                             " in kvm_tdx_capabilities exceeds the limit: %d",
                             KVM_MAX_CPUID_ENTRIES);
                error_propagate(errp, local_err);
                return r;
            }
            error_free(local_err);
        } else if (r < 0) {
            g_free(caps);
            error_propagate(errp, local_err);
            return r;
        }
    } while (r == -E2BIG);

    tdx_caps = caps;

    return 0;
}

void tdx_set_tdvf_region(MemoryRegion *tdvf_mr)
{
    assert(!tdx_guest->tdvf_mr);
    tdx_guest->tdvf_mr = tdvf_mr;
}

static TdxFirmwareEntry *tdx_get_hob_entry(TdxGuest *tdx)
{
    TdxFirmwareEntry *entry;

    for_each_tdx_fw_entry(&tdx->tdvf, entry) {
        if (entry->type == TDVF_SECTION_TYPE_TD_HOB) {
            return entry;
        }
    }
    error_report("TDVF metadata doesn't specify TD_HOB location.");
    exit(1);
}

static void tdx_add_ram_entry(uint64_t address, uint64_t length,
                              enum TdxRamType type)
{
    uint32_t nr_entries = tdx_guest->nr_ram_entries;
    tdx_guest->ram_entries = g_renew(TdxRamEntry, tdx_guest->ram_entries,
                                     nr_entries + 1);

    tdx_guest->ram_entries[nr_entries].address = address;
    tdx_guest->ram_entries[nr_entries].length = length;
    tdx_guest->ram_entries[nr_entries].type = type;
    tdx_guest->nr_ram_entries++;
}
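/*
 * Mark the [address, address + length) range as TDX_RAM_ADDED. The range
 * must be fully contained in a single existing RAM entry; that entry is
 * converted in place, and any uncovered head and/or tail remainder is
 * re-added as a separate TDX_RAM_UNACCEPTED entry.
 */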
static int tdx_accept_ram_range(uint64_t address, uint64_t length)
{
    uint64_t head_start, tail_start, head_length, tail_length;
    uint64_t tmp_address, tmp_length;
    TdxRamEntry *e;
    int i = 0;

    do {
        if (i == tdx_guest->nr_ram_entries) {
            return -1;
        }

        e = &tdx_guest->ram_entries[i++];
    } while (address + length <= e->address || address >= e->address + e->length);

    /*
     * The to-be-accepted ram range must be fully contained by one
     * RAM entry.
     */
    if (e->address > address ||
        e->address + e->length < address + length) {
        return -1;
    }

    if (e->type == TDX_RAM_ADDED) {
        return 0;
    }

    tmp_address = e->address;
    tmp_length = e->length;

    e->address = address;
    e->length = length;
    e->type = TDX_RAM_ADDED;

    head_length = address - tmp_address;
    if (head_length > 0) {
        head_start = tmp_address;
        tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED);
    }

    tail_start = address + length;
    if (tail_start < tmp_address + tmp_length) {
        tail_length = tmp_address + tmp_length - tail_start;
        tdx_add_ram_entry(tail_start, tail_length, TDX_RAM_UNACCEPTED);
    }

    return 0;
}

static int tdx_ram_entry_compare(const void *lhs_, const void *rhs_)
{
    const TdxRamEntry *lhs = lhs_;
    const TdxRamEntry *rhs = rhs_;

    if (lhs->address == rhs->address) {
        return 0;
    }
    if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) {
        return 1;
    }
    return -1;
}

static void tdx_init_ram_entries(void)
{
    unsigned i, j, nr_e820_entries;

    nr_e820_entries = e820_get_table(NULL);
    tdx_guest->ram_entries = g_new(TdxRamEntry, nr_e820_entries);

    for (i = 0, j = 0; i < nr_e820_entries; i++) {
        uint64_t addr, len;

        if (e820_get_entry(i, E820_RAM, &addr, &len)) {
            tdx_guest->ram_entries[j].address = addr;
            tdx_guest->ram_entries[j].length = len;
            tdx_guest->ram_entries[j].type = TDX_RAM_UNACCEPTED;
            j++;
        }
    }
    tdx_guest->nr_ram_entries = j;
}

static void tdx_post_init_vcpus(void)
{
    TdxFirmwareEntry *hob;
    CPUState *cpu;

    hob = tdx_get_hob_entry(tdx_guest);
    CPU_FOREACH(cpu) {
        tdx_vcpu_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)(uintptr_t)hob->address,
                       &error_fatal);
    }
}
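/*
 * Machine-init-done notifier: build the TD HOB from the accepted RAM
 * entries, run KVM_TDX_INIT_VCPU on every vcpu, copy (and optionally
 * measure) each TDVF section into private memory with
 * KVM_TDX_INIT_MEM_REGION, and finally seal the measurement with
 * KVM_TDX_FINALIZE_VM.
 */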
static void tdx_finalize_vm(Notifier *notifier, void *unused)
{
    TdxFirmware *tdvf = &tdx_guest->tdvf;
    TdxFirmwareEntry *entry;
    RAMBlock *ram_block;
    Error *local_err = NULL;
    int r;

    tdx_init_ram_entries();

    for_each_tdx_fw_entry(tdvf, entry) {
        switch (entry->type) {
        case TDVF_SECTION_TYPE_BFV:
        case TDVF_SECTION_TYPE_CFV:
            entry->mem_ptr = tdvf->mem_ptr + entry->data_offset;
            break;
        case TDVF_SECTION_TYPE_TD_HOB:
        case TDVF_SECTION_TYPE_TEMP_MEM:
            entry->mem_ptr = qemu_ram_mmap(-1, entry->size,
                                           qemu_real_host_page_size(), 0, 0);
            if (entry->mem_ptr == MAP_FAILED) {
                error_report("Failed to mmap memory for TDVF section %d",
                             entry->type);
                exit(1);
            }
            if (tdx_accept_ram_range(entry->address, entry->size)) {
                error_report("Failed to accept memory for TDVF section %d",
                             entry->type);
                qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
                exit(1);
            }
            break;
        default:
            error_report("Unsupported TDVF section %d", entry->type);
            exit(1);
        }
    }

    qsort(tdx_guest->ram_entries, tdx_guest->nr_ram_entries,
          sizeof(TdxRamEntry), &tdx_ram_entry_compare);

    tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest));

    tdx_post_init_vcpus();

    for_each_tdx_fw_entry(tdvf, entry) {
        struct kvm_tdx_init_mem_region region;
        uint32_t flags;

        region = (struct kvm_tdx_init_mem_region) {
            .source_addr = (uintptr_t)entry->mem_ptr,
            .gpa = entry->address,
            .nr_pages = entry->size >> 12,
        };

        flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ?
                KVM_TDX_MEASURE_MEMORY_REGION : 0;

        do {
            error_free(local_err);
            local_err = NULL;
            r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags,
                               &region, &local_err);
        } while (r == -EAGAIN || r == -EINTR);
        if (r < 0) {
            error_report_err(local_err);
            exit(1);
        }

        if (entry->type == TDVF_SECTION_TYPE_TD_HOB ||
            entry->type == TDVF_SECTION_TYPE_TEMP_MEM) {
            qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
            entry->mem_ptr = NULL;
        }
    }

    /*
     * The TDVF image has been copied into private memory above via
     * KVM_TDX_INIT_MEM_REGION; the shared copy is no longer needed.
     */
    ram_block = tdx_guest->tdvf_mr->ram_block;
    ram_block_discard_range(ram_block, 0, ram_block->max_length);

    tdx_vm_ioctl(KVM_TDX_FINALIZE_VM, 0, NULL, &error_fatal);
    CONFIDENTIAL_GUEST_SUPPORT(tdx_guest)->ready = true;
}

static Notifier tdx_machine_done_notify = {
    .notify = tdx_finalize_vm,
};

/*
 * Some CPUID bits change from fixed1 to configurable bits when the TDX
 * module supports TDX_FEATURES0.VE_REDUCTION, e.g. MCA/MCE/MTRR/
 * CORE_CAPABILITY.
 *
 * To make QEMU work with all versions of the TDX module, keep bits here as
 * fixed1 if they are fixed1 in any version, even when they are no longer
 * fixed1 in the latest one. Otherwise, with an older TDX module, QEMU may
 * treat a fixed1 bit as unsupported.
 *
 * For a newer TDX module, it does no harm to keep such bits in
 * tdx_fixed1_bits even though they became configurable, because
 * tdx_fixed1_bits is only used to set up the supported bits.
 */
KvmCpuidInfo tdx_fixed1_bits = {
    .cpuid.nent = 8,
    .entries[0] = {
        .function = 0x1,
        .index = 0,
        .ecx = CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_DTES64 |
               CPUID_EXT_DSCPL | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 |
               CPUID_EXT_PDCM | CPUID_EXT_PCID | CPUID_EXT_SSE41 |
               CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
               CPUID_EXT_POPCNT | CPUID_EXT_AES | CPUID_EXT_XSAVE |
               CPUID_EXT_RDRAND | CPUID_EXT_HYPERVISOR,
        .edx = CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
               CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
               CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
               CPUID_PAT | CPUID_CLFLUSH | CPUID_DTS | CPUID_MMX | CPUID_FXSR |
               CPUID_SSE | CPUID_SSE2,
    },
    .entries[1] = {
        .function = 0x6,
        .index = 0,
        .eax = CPUID_6_EAX_ARAT,
    },
    .entries[2] = {
        .function = 0x7,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .ebx = CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_FDP_EXCPTN_ONLY |
               CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_INVPCID |
               CPUID_7_0_EBX_ZERO_FCS_FDS | CPUID_7_0_EBX_RDSEED |
               CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT |
               CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_SHA_NI,
        .ecx = CPUID_7_0_ECX_BUS_LOCK_DETECT | CPUID_7_0_ECX_MOVDIRI |
               CPUID_7_0_ECX_MOVDIR64B,
        .edx = CPUID_7_0_EDX_MD_CLEAR | CPUID_7_0_EDX_SPEC_CTRL |
               CPUID_7_0_EDX_STIBP | CPUID_7_0_EDX_FLUSH_L1D |
               CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_CORE_CAPABILITY |
               CPUID_7_0_EDX_SPEC_CTRL_SSBD,
    },
    .entries[3] = {
        .function = 0x7,
        .index = 2,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .edx = CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL |
               CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_BHI_CTRL,
    },
    .entries[4] = {
        .function = 0xD,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = XSTATE_FP_MASK | XSTATE_SSE_MASK,
    },
    .entries[5] = {
        .function = 0xD,
        .index = 1,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC |
               CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES,
    },
    .entries[6] = {
        .function = 0x80000001,
        .index = 0,
        .ecx = CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
        /*
         * Strictly speaking, SYSCALL is not a fixed1 bit since it depends on
         * the CPU being in 64-bit mode. But here fixed1 is used to serve the
         * purpose of supported bits for TDX. In this sense, SYSCALL is always
         * supported.
         */
        .edx = CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
               CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
    },
    .entries[7] = {
        .function = 0x80000007,
        .index = 0,
        .edx = CPUID_APM_INVTSC,
    },
};
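/*
 * Mapping from a TD attribute bit to the CPUID feature bit it gates. For
 * example, TD attribute bit 30 (PKS) corresponds to CPUID.(7,0).ECX PKS.
 * A CPUID bit is reported as supported only when KVM reports the matching
 * attribute bit in tdx_caps->supported_attrs.
 */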
typedef struct TdxAttrsMap {
    uint32_t attr_index;
    uint32_t cpuid_leaf;
    uint32_t cpuid_subleaf;
    int cpuid_reg;
    uint32_t feat_mask;
} TdxAttrsMap;

static TdxAttrsMap tdx_attrs_maps[] = {
    {.attr_index = 27,
     .cpuid_leaf = 7,
     .cpuid_subleaf = 1,
     .cpuid_reg = R_EAX,
     .feat_mask = CPUID_7_1_EAX_LASS,},

    {.attr_index = 30,
     .cpuid_leaf = 7,
     .cpuid_subleaf = 0,
     .cpuid_reg = R_ECX,
     .feat_mask = CPUID_7_0_ECX_PKS,},

    {.attr_index = 31,
     .cpuid_leaf = 7,
     .cpuid_subleaf = 0,
     .cpuid_reg = R_ECX,
     .feat_mask = CPUID_7_0_ECX_KeyLocker,},
};

typedef struct TdxXFAMDep {
    int xfam_bit;
    FeatureMask feat_mask;
} TdxXFAMDep;

/*
 * Note, only the CPUID bits whose virtualization type is "XFAM & Native"
 * are defined here.
 *
 * For those whose virtualization type is "XFAM & Configured & Native", they
 * are reported as configurable bits. They are not supported if absent from
 * the configurable bits list from KVM, even if the corresponding XFAM bit
 * is supported.
 */
TdxXFAMDep tdx_xfam_deps[] = {
    { XSTATE_YMM_BIT,       { FEAT_1_ECX, CPUID_EXT_FMA }},
    { XSTATE_YMM_BIT,       { FEAT_7_0_EBX, CPUID_7_0_EBX_AVX2 }},
    { XSTATE_OPMASK_BIT,    { FEAT_7_0_ECX, CPUID_7_0_ECX_AVX512_VBMI }},
    { XSTATE_OPMASK_BIT,    { FEAT_7_0_EDX, CPUID_7_0_EDX_AVX512_FP16 }},
    { XSTATE_PT_BIT,        { FEAT_7_0_EBX, CPUID_7_0_EBX_INTEL_PT }},
    { XSTATE_PKRU_BIT,      { FEAT_7_0_ECX, CPUID_7_0_ECX_PKU }},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_BF16 }},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE }},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_INT8 }},
};
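/*
 * Return the tdx_supported_cpuid entry for (function, index), allocating a
 * fresh entry from the preallocated KVM_MAX_CPUID_ENTRIES pool if one
 * doesn't exist yet.
 */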
static struct kvm_cpuid_entry2 *find_in_supported_entry(uint32_t function,
                                                        uint32_t index)
{
    struct kvm_cpuid_entry2 *e;

    e = cpuid_find_entry(tdx_supported_cpuid, function, index);
    if (!e) {
        if (tdx_supported_cpuid->nent >= KVM_MAX_CPUID_ENTRIES) {
            error_report("tdx_supported_cpuid requires more space than %d entries",
                         KVM_MAX_CPUID_ENTRIES);
            exit(1);
        }
        e = &tdx_supported_cpuid->entries[tdx_supported_cpuid->nent++];
        e->function = function;
        e->index = index;
    }

    return e;
}

static void tdx_add_supported_cpuid_by_fixed1_bits(void)
{
    struct kvm_cpuid_entry2 *e, *e1;
    int i;

    for (i = 0; i < tdx_fixed1_bits.cpuid.nent; i++) {
        e = &tdx_fixed1_bits.entries[i];

        e1 = find_in_supported_entry(e->function, e->index);
        e1->eax |= e->eax;
        e1->ebx |= e->ebx;
        e1->ecx |= e->ecx;
        e1->edx |= e->edx;
    }
}

static void tdx_add_supported_cpuid_by_attrs(void)
{
    struct kvm_cpuid_entry2 *e;
    TdxAttrsMap *map;
    int i;

    for (i = 0; i < ARRAY_SIZE(tdx_attrs_maps); i++) {
        map = &tdx_attrs_maps[i];
        if (!((1ULL << map->attr_index) & tdx_caps->supported_attrs)) {
            continue;
        }

        e = find_in_supported_entry(map->cpuid_leaf, map->cpuid_subleaf);

        switch (map->cpuid_reg) {
        case R_EAX:
            e->eax |= map->feat_mask;
            break;
        case R_EBX:
            e->ebx |= map->feat_mask;
            break;
        case R_ECX:
            e->ecx |= map->feat_mask;
            break;
        case R_EDX:
            e->edx |= map->feat_mask;
            break;
        }
    }
}

static void tdx_add_supported_cpuid_by_xfam(void)
{
    struct kvm_cpuid_entry2 *e;
    int i;

    const TdxXFAMDep *xfam_dep;
    const FeatureWordInfo *f;
    for (i = 0; i < ARRAY_SIZE(tdx_xfam_deps); i++) {
        xfam_dep = &tdx_xfam_deps[i];
        if (!((1ULL << xfam_dep->xfam_bit) & tdx_caps->supported_xfam)) {
            continue;
        }

        f = &feature_word_info[xfam_dep->feat_mask.index];
        if (f->type != CPUID_FEATURE_WORD) {
            continue;
        }

        e = find_in_supported_entry(f->cpuid.eax, f->cpuid.ecx);
        switch (f->cpuid.reg) {
        case R_EAX:
            e->eax |= xfam_dep->feat_mask.mask;
            break;
        case R_EBX:
            e->ebx |= xfam_dep->feat_mask.mask;
            break;
        case R_ECX:
            e->ecx |= xfam_dep->feat_mask.mask;
            break;
        case R_EDX:
            e->edx |= xfam_dep->feat_mask.mask;
            break;
        }
    }

    e = find_in_supported_entry(0xd, 0);
    e->eax |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK);
    e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK) >> 32;

    e = find_in_supported_entry(0xd, 1);
    /*
     * Mark XFD as always supported for TDX; it will finally be cleared in
     * tdx_adjust_cpuid_features() if XFD is unavailable on the hardware,
     * because in that case the original data has it as 0.
     */
    e->eax |= CPUID_XSAVE_XFD;
    e->ecx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK);
    e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK) >> 32;
}

static void tdx_add_supported_kvm_features(void)
{
    struct kvm_cpuid_entry2 *e;

    e = find_in_supported_entry(0x40000001, 0);
    e->eax = TDX_SUPPORTED_KVM_FEATURES;
}
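/*
 * Build tdx_supported_cpuid once: seed it with the configurable CPUID bits
 * reported by KVM, then OR in the fixed1 bits, the bits gated by the
 * supported TD attributes, the bits gated by the supported XFAM, and the
 * KVM PV features TDX allows.
 */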
static void tdx_setup_supported_cpuid(void)
{
    if (tdx_supported_cpuid) {
        return;
    }

    tdx_supported_cpuid = g_malloc0(sizeof(*tdx_supported_cpuid) +
                    KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2));

    memcpy(tdx_supported_cpuid->entries, tdx_caps->cpuid.entries,
           tdx_caps->cpuid.nent * sizeof(struct kvm_cpuid_entry2));
    tdx_supported_cpuid->nent = tdx_caps->cpuid.nent;

    tdx_add_supported_cpuid_by_fixed1_bits();
    tdx_add_supported_cpuid_by_attrs();
    tdx_add_supported_cpuid_by_xfam();

    tdx_add_supported_kvm_features();
}

static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    X86MachineState *x86ms = X86_MACHINE(ms);
    TdxGuest *tdx = TDX_GUEST(cgs);
    int r = 0;

    kvm_mark_guest_state_protected();

    if (x86ms->smm == ON_OFF_AUTO_AUTO) {
        x86ms->smm = ON_OFF_AUTO_OFF;
    } else if (x86ms->smm == ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM doesn't support SMM");
        return -EINVAL;
    }

    if (x86ms->pic == ON_OFF_AUTO_AUTO) {
        x86ms->pic = ON_OFF_AUTO_OFF;
    } else if (x86ms->pic == ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM doesn't support PIC");
        return -EINVAL;
    }

    if (kvm_state->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
        kvm_state->kernel_irqchip_split = ON_OFF_AUTO_ON;
    } else if (kvm_state->kernel_irqchip_split != ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM requires kernel_irqchip to be split");
        return -EINVAL;
    }

    if (!tdx_caps) {
        r = get_tdx_capabilities(errp);
        if (r) {
            return r;
        }
    }

    tdx_setup_supported_cpuid();

    /* TDX relies on KVM_HC_MAP_GPA_RANGE to handle TDG.VP.VMCALL<MapGPA> */
    if (!kvm_enable_hypercall(BIT_ULL(KVM_HC_MAP_GPA_RANGE))) {
        return -EOPNOTSUPP;
    }

    /*
     * Set kvm_readonly_mem_allowed to false, because TDX only supports
     * readonly memslots for shared memory, not for private memory. Besides,
     * whether a memslot is private or shared is not determined by QEMU.
     *
     * Thus, just mark readonly memory not supported for simplicity.
     */
    kvm_readonly_mem_allowed = false;

    qemu_add_machine_init_done_notifier(&tdx_machine_done_notify);

    tdx_guest = tdx;
    return 0;
}

static int tdx_kvm_type(X86ConfidentialGuest *cg)
{
    /* Do the object check */
    TDX_GUEST(cg);

    return KVM_X86_TDX_VM;
}

static void tdx_cpu_instance_init(X86ConfidentialGuest *cg, CPUState *cpu)
{
    X86CPUClass *xcc = X86_CPU_GET_CLASS(cpu);
    X86CPU *x86cpu = X86_CPU(cpu);

    if (xcc->model) {
        error_report("Named cpu model is not supported for TDX yet!");
        exit(1);
    }

    object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort);

    /* invtsc is fixed1 for TD guest */
    object_property_set_bool(OBJECT(cpu), "invtsc", true, &error_abort);

    x86cpu->force_cpuid_0x1f = true;
}
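/*
 * Adjust a CPUID leaf value for a TD guest: force the fixed1 bits on, then,
 * if the leaf is tracked as a feature word, mask the result against what
 * tdx_supported_cpuid allows.
 */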
static uint32_t tdx_adjust_cpuid_features(X86ConfidentialGuest *cg,
                                          uint32_t feature, uint32_t index,
                                          int reg, uint32_t value)
{
    struct kvm_cpuid_entry2 *e;

    e = cpuid_find_entry(&tdx_fixed1_bits.cpuid, feature, index);
    if (e) {
        value |= cpuid_entry_get_reg(e, reg);
    }

    if (is_feature_word_cpuid(feature, index, reg)) {
        e = cpuid_find_entry(tdx_supported_cpuid, feature, index);
        if (e) {
            value &= cpuid_entry_get_reg(e, reg);
        }
    }

    return value;
}

static struct kvm_cpuid2 *tdx_fetch_cpuid(CPUState *cpu, int *ret)
{
    struct kvm_cpuid2 *fetch_cpuid;
    int size = KVM_MAX_CPUID_ENTRIES;
    Error *local_err = NULL;
    int r;

    do {
        error_free(local_err);
        local_err = NULL;

        fetch_cpuid = g_malloc0(sizeof(*fetch_cpuid) +
                                sizeof(struct kvm_cpuid_entry2) * size);
        fetch_cpuid->nent = size;
        r = tdx_vcpu_ioctl(cpu, KVM_TDX_GET_CPUID, 0, fetch_cpuid, &local_err);
        if (r == -E2BIG) {
            /* KVM updated nent to the required size; read it before freeing */
            size = fetch_cpuid->nent;
            g_free(fetch_cpuid);
        }
    } while (r == -E2BIG);

    if (r < 0) {
        error_report_err(local_err);
        *ret = r;
        return NULL;
    }

    return fetch_cpuid;
}
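/*
 * Compare the CPUID configuration the user requested against what KVM
 * reports the TD will actually see (KVM_TDX_GET_CPUID): warn about bits
 * that are unavailable and bits TDX forces on, and fail under
 * -cpu ...,enforce on any mismatch.
 */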
static int tdx_check_features(X86ConfidentialGuest *cg, CPUState *cs)
{
    uint64_t actual, requested, unavailable, forced_on;
    g_autofree struct kvm_cpuid2 *fetch_cpuid;
    const char *forced_on_prefix = NULL;
    const char *unav_prefix = NULL;
    struct kvm_cpuid_entry2 *entry;
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    FeatureWordInfo *wi;
    FeatureWord w;
    bool mismatch = false;
    int r;

    fetch_cpuid = tdx_fetch_cpuid(cs, &r);
    if (!fetch_cpuid) {
        return r;
    }

    if (cpu->check_cpuid || cpu->enforce_cpuid) {
        unav_prefix = "TDX doesn't support requested feature";
        forced_on_prefix = "TDX forcibly sets the feature";
    }

    for (w = 0; w < FEATURE_WORDS; w++) {
        wi = &feature_word_info[w];
        actual = 0;

        switch (wi->type) {
        case CPUID_FEATURE_WORD:
            entry = cpuid_find_entry(fetch_cpuid, wi->cpuid.eax, wi->cpuid.ecx);
            if (!entry) {
                /*
                 * If KVM doesn't report it, it means the leaf is totally
                 * configurable by QEMU.
                 */
                continue;
            }

            actual = cpuid_entry_get_reg(entry, wi->cpuid.reg);
            break;
        case MSR_FEATURE_WORD:
            /*
             * TODO:
             * validate MSR features when KVM has an interface to report them.
             */
            continue;
        }

        /* Fixup for special cases */
        switch (w) {
        case FEAT_8000_0001_EDX:
            /*
             * Intel enumerates the SYSCALL bit as 1 only when the processor
             * is in 64-bit mode, and before the vcpu runs it is not in
             * 64-bit mode.
             */
            actual |= CPUID_EXT2_SYSCALL;
            break;
        default:
            break;
        }

        requested = env->features[w];
        unavailable = requested & ~actual;
        mark_unavailable_features(cpu, w, unavailable, unav_prefix);
        if (unavailable) {
            mismatch = true;
        }

        forced_on = actual & ~requested;
        mark_forced_on_features(cpu, w, forced_on, forced_on_prefix);
        if (forced_on) {
            mismatch = true;
        }
    }

    if (cpu->enforce_cpuid && mismatch) {
        return -EINVAL;
    }

    if (cpu->phys_bits != host_cpu_phys_bits()) {
        error_report("TDX requires guest CPU physical bits (%u) "
                     "to match host CPU physical bits (%u)",
                     cpu->phys_bits, host_cpu_phys_bits());
        return -EINVAL;
    }

    return 0;
}

static int tdx_validate_attributes(TdxGuest *tdx, Error **errp)
{
    if ((tdx->attributes & ~tdx_caps->supported_attrs)) {
        error_setg(errp, "Invalid attributes 0x%"PRIx64" for TDX VM "
                   "(KVM supported: 0x%"PRIx64")", tdx->attributes,
                   (uint64_t)tdx_caps->supported_attrs);
        return -1;
    }

    if (tdx->attributes & ~TDX_SUPPORTED_TD_ATTRS) {
        error_setg(errp, "Some requested TD attribute bits are not supported "
                   "by QEMU: 0x%"PRIx64" (QEMU supported: 0x%"PRIx64")",
                   tdx->attributes, (uint64_t)TDX_SUPPORTED_TD_ATTRS);
        return -1;
    }

    return 0;
}

static int setup_td_guest_attributes(X86CPU *x86cpu, Error **errp)
{
    CPUX86State *env = &x86cpu->env;

    tdx_guest->attributes |= (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS) ?
                             TDX_TD_ATTRIBUTES_PKS : 0;
    tdx_guest->attributes |= x86cpu->enable_pmu ? TDX_TD_ATTRIBUTES_PERFMON : 0;

    return tdx_validate_attributes(tdx_guest, errp);
}
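/*
 * XFAM (eXtended Features Available Mask) is built as the union of the
 * guest's XCR0 and XSS feature words; every requested bit must be in the
 * XFAM mask KVM reports as supported.
 */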
static int setup_td_xfam(X86CPU *x86cpu, Error **errp)
{
    CPUX86State *env = &x86cpu->env;
    uint64_t xfam;

    xfam = env->features[FEAT_XSAVE_XCR0_LO] |
           env->features[FEAT_XSAVE_XCR0_HI] |
           env->features[FEAT_XSAVE_XSS_LO] |
           env->features[FEAT_XSAVE_XSS_HI];

    if (xfam & ~tdx_caps->supported_xfam) {
        error_setg(errp, "Invalid XFAM 0x%"PRIx64" for TDX VM (supported: 0x%"PRIx64")",
                   xfam, (uint64_t)tdx_caps->supported_xfam);
        return -1;
    }

    tdx_guest->xfam = xfam;
    return 0;
}

static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids)
{
    int i, dest_cnt = 0;
    struct kvm_cpuid_entry2 *src, *dest, *conf;

    for (i = 0; i < cpuids->nent; i++) {
        src = cpuids->entries + i;
        conf = cpuid_find_entry(&tdx_caps->cpuid, src->function, src->index);
        if (!conf) {
            continue;
        }
        dest = cpuids->entries + dest_cnt;

        dest->function = src->function;
        dest->index = src->index;
        dest->flags = src->flags;
        dest->eax = src->eax & conf->eax;
        dest->ebx = src->ebx & conf->ebx;
        dest->ecx = src->ecx & conf->ecx;
        dest->edx = src->edx & conf->edx;

        dest_cnt++;
    }
    cpuids->nent = dest_cnt;
}
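/*
 * Called on vcpu pre-creation. The body runs only once per VM, guarded by
 * tdx_guest->lock: assemble the KVM_TDX_INIT_VM parameters (TSC frequency,
 * MRCONFIGID/MROWNER/MROWNERCONFIG digests, TD attributes, XFAM and the
 * filtered CPUID set) and issue the ioctl.
 */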
int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
{
    X86CPU *x86cpu = X86_CPU(cpu);
    CPUX86State *env = &x86cpu->env;
    g_autofree struct kvm_tdx_init_vm *init_vm = NULL;
    Error *local_err = NULL;
    size_t data_len;
    int retry = 10000;
    int r = 0;

    QEMU_LOCK_GUARD(&tdx_guest->lock);
    if (tdx_guest->initialized) {
        return r;
    }

    init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) +
                        sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);

    if (!kvm_check_extension(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS)) {
        error_setg(errp, "KVM doesn't support KVM_CAP_X86_APIC_BUS_CYCLES_NS");
        return -EOPNOTSUPP;
    }

    r = kvm_vm_enable_cap(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
                          0, TDX_APIC_BUS_CYCLES_NS);
    if (r < 0) {
        error_setg_errno(errp, -r,
                         "Unable to set core crystal clock frequency to 25MHz");
        return r;
    }

    if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ ||
                         env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) {
        error_setg(errp, "Invalid TSC %"PRId64" kHz, the cpu frequency must "
                   "be between [%d, %d] kHz", env->tsc_khz,
                   TDX_MIN_TSC_FREQUENCY_KHZ, TDX_MAX_TSC_FREQUENCY_KHZ);
        return -EINVAL;
    }

    if (env->tsc_khz % (25 * 1000)) {
        error_setg(errp, "Invalid TSC %"PRId64" kHz, it must be a multiple of 25MHz",
                   env->tsc_khz);
        return -EINVAL;
    }

    /* It's safe even if env->tsc_khz is 0; KVM uses the host's tsc_khz in that case */
    r = kvm_vm_ioctl(kvm_state, KVM_SET_TSC_KHZ, env->tsc_khz);
    if (r < 0) {
        error_setg_errno(errp, -r, "Unable to set TSC frequency to %"PRId64" kHz",
                         env->tsc_khz);
        return r;
    }

    if (tdx_guest->mrconfigid) {
        g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid,
                              strlen(tdx_guest->mrconfigid), &data_len, errp);
        if (!data) {
            return -1;
        }
        if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
            error_setg(errp, "TDX 'mrconfigid' sha384 digest was %zu bytes, "
                       "expected %d bytes", data_len,
                       QCRYPTO_HASH_DIGEST_LEN_SHA384);
            return -1;
        }
        memcpy(init_vm->mrconfigid, data, data_len);
    }

    if (tdx_guest->mrowner) {
        g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner,
                              strlen(tdx_guest->mrowner), &data_len, errp);
        if (!data) {
            return -1;
        }
        if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
            error_setg(errp, "TDX 'mrowner' sha384 digest was %zu bytes, "
                       "expected %d bytes", data_len,
                       QCRYPTO_HASH_DIGEST_LEN_SHA384);
            return -1;
        }
        memcpy(init_vm->mrowner, data, data_len);
    }

    if (tdx_guest->mrownerconfig) {
        g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrownerconfig,
                              strlen(tdx_guest->mrownerconfig), &data_len, errp);
        if (!data) {
            return -1;
        }
        if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
            error_setg(errp, "TDX 'mrownerconfig' sha384 digest was %zu bytes, "
                       "expected %d bytes", data_len,
                       QCRYPTO_HASH_DIGEST_LEN_SHA384);
            return -1;
        }
        memcpy(init_vm->mrownerconfig, data, data_len);
    }

    r = setup_td_guest_attributes(x86cpu, errp);
    if (r) {
        return r;
    }

    r = setup_td_xfam(x86cpu, errp);
    if (r) {
        return r;
    }

    init_vm->cpuid.nent = kvm_x86_build_cpuid(env, init_vm->cpuid.entries, 0);
    tdx_filter_cpuid(&init_vm->cpuid);

    init_vm->attributes = tdx_guest->attributes;
    init_vm->xfam = tdx_guest->xfam;

    /*
     * KVM_TDX_INIT_VM fails with -EAGAIN when the KVM-side
     * SEAMCALL(TDH_MNG_CREATE) gets TDX_RND_NO_ENTROPY because random number
     * generation (e.g., RDRAND or RDSEED) is busy.
     *
     * Retry for that case.
     */
    do {
        error_free(local_err);
        local_err = NULL;
        r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, init_vm, &local_err);
    } while (r == -EAGAIN && --retry);

    if (r < 0) {
        if (!retry) {
            error_append_hint(&local_err, "The hardware RNG is being kept "
                              "busy (via RDRAND/RDSEED), possibly "
                              "maliciously, so KVM_TDX_INIT_VM keeps failing "
                              "due to lack of entropy.\n");
        }
        error_propagate(errp, local_err);
        return r;
    }

    tdx_guest->initialized = true;

    return 0;
}

int tdx_parse_tdvf(void *flash_ptr, int size)
{
    return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size);
}
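/*
 * Deliver the event notification interrupt the guest registered via
 * TDG.VP.VMCALL<SetupEventNotifyInterrupt> by synthesizing a
 * fixed-delivery MSI to the recorded APIC ID.
 */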
static void tdx_inject_interrupt(TdxGuest *tdx)
{
    int ret;
    uint32_t apicid, vector;

    qemu_mutex_lock(&tdx->lock);
    vector = tdx->event_notify_vector;
    apicid = tdx->event_notify_apicid;
    qemu_mutex_unlock(&tdx->lock);
    if (vector < 32 || vector > 255) {
        return;
    }

    MSIMessage msg = {
        .address = ((apicid & 0xff) << MSI_ADDR_DEST_ID_SHIFT) |
                   (((uint64_t)apicid & 0xffffff00) << 32),
        .data = vector | (APIC_DM_FIXED << MSI_DATA_DELIVERY_MODE_SHIFT),
    };

    ret = kvm_irqchip_send_msi(kvm_state, msg);
    if (ret < 0) {
        /* There is no better way to tell the guest in this case. Log it. */
        error_report("TDX: injecting interrupt %d failed, interrupt lost (%s).",
                     vector, strerror(-ret));
    }
}

static void tdx_get_quote_completion(TdxGenerateQuoteTask *task)
{
    TdxGuest *tdx = task->opaque;
    int ret;

    /* Maintain the number of in-flight requests. */
    qemu_mutex_lock(&tdx->lock);
    tdx->num--;
    qemu_mutex_unlock(&tdx->lock);

    if (task->status_code == TDX_VP_GET_QUOTE_SUCCESS) {
        ret = address_space_write(&address_space_memory, task->payload_gpa,
                                  MEMTXATTRS_UNSPECIFIED, task->receive_buf,
                                  task->receive_buf_received);
        if (ret != MEMTX_OK) {
            error_report("TDX: get-quote: failed to write quote data.");
        } else {
            task->hdr.out_len = cpu_to_le64(task->receive_buf_received);
        }
    }
    task->hdr.error_code = cpu_to_le64(task->status_code);

    /* Publish the response contents before marking this request completed. */
    smp_wmb();
    ret = address_space_write(&address_space_memory, task->buf_gpa,
                              MEMTXATTRS_UNSPECIFIED, &task->hdr,
                              TDX_GET_QUOTE_HDR_SIZE);
    if (ret != MEMTX_OK) {
        error_report("TDX: get-quote: failed to update GetQuote header.");
    }

    tdx_inject_interrupt(tdx);

    g_free(task->send_data);
    g_free(task->receive_buf);
    g_free(task);
    object_unref(tdx);
}
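/*
 * Handle TDG.VP.VMCALL<GetQuote>: validate the shared buffer the guest
 * passed, read the request payload, mark the header in-flight, and hand
 * the request to the quote-generation socket. tdx_get_quote_completion()
 * writes the response and notifies the guest asynchronously.
 */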
void tdx_handle_get_quote(X86CPU *cpu, struct kvm_run *run)
{
    TdxGenerateQuoteTask *task;
    struct tdx_get_quote_header hdr;
    hwaddr buf_gpa = run->tdx.get_quote.gpa;
    uint64_t buf_len = run->tdx.get_quote.size;

    QEMU_BUILD_BUG_ON(sizeof(struct tdx_get_quote_header) != TDX_GET_QUOTE_HDR_SIZE);

    run->tdx.get_quote.ret = TDG_VP_VMCALL_INVALID_OPERAND;

    if (buf_len == 0) {
        return;
    }

    if (!QEMU_IS_ALIGNED(buf_gpa, 4096) || !QEMU_IS_ALIGNED(buf_len, 4096)) {
        run->tdx.get_quote.ret = TDG_VP_VMCALL_ALIGN_ERROR;
        return;
    }

    if (address_space_read(&address_space_memory, buf_gpa, MEMTXATTRS_UNSPECIFIED,
                           &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
        error_report("TDX: get-quote: failed to read GetQuote header.");
        return;
    }

    if (le64_to_cpu(hdr.structure_version) != TDX_GET_QUOTE_STRUCTURE_VERSION) {
        return;
    }

    /* Safeguard check only, to reject an overly large buffer size. */
    if (buf_len > TDX_GET_QUOTE_MAX_BUF_LEN ||
        le32_to_cpu(hdr.in_len) > buf_len - TDX_GET_QUOTE_HDR_SIZE) {
        return;
    }

    if (!tdx_guest->qg_sock_addr) {
        hdr.error_code = cpu_to_le64(TDX_VP_GET_QUOTE_QGS_UNAVAILABLE);
        if (address_space_write(&address_space_memory, buf_gpa,
                                MEMTXATTRS_UNSPECIFIED,
                                &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
            error_report("TDX: failed to update GetQuote header.");
            return;
        }
        run->tdx.get_quote.ret = TDG_VP_VMCALL_SUCCESS;
        return;
    }

    qemu_mutex_lock(&tdx_guest->lock);
    if (tdx_guest->num >= TDX_MAX_GET_QUOTE_REQUEST) {
        qemu_mutex_unlock(&tdx_guest->lock);
        run->tdx.get_quote.ret = TDG_VP_VMCALL_RETRY;
        return;
    }
    tdx_guest->num++;
    qemu_mutex_unlock(&tdx_guest->lock);

    task = g_new(TdxGenerateQuoteTask, 1);
    task->buf_gpa = buf_gpa;
    task->payload_gpa = buf_gpa + TDX_GET_QUOTE_HDR_SIZE;
    task->payload_len = buf_len - TDX_GET_QUOTE_HDR_SIZE;
    task->hdr = hdr;
    task->completion = tdx_get_quote_completion;

    task->send_data_size = le32_to_cpu(hdr.in_len);
    task->send_data = g_malloc(task->send_data_size);
    task->send_data_sent = 0;

    if (address_space_read(&address_space_memory, task->payload_gpa,
                           MEMTXATTRS_UNSPECIFIED, task->send_data,
                           task->send_data_size) != MEMTX_OK) {
        goto out_free;
    }

    /* Mark the buffer in-flight. */
    hdr.error_code = cpu_to_le64(TDX_VP_GET_QUOTE_IN_FLIGHT);
    if (address_space_write(&address_space_memory, buf_gpa,
                            MEMTXATTRS_UNSPECIFIED,
                            &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
        goto out_free;
    }

    task->receive_buf = g_malloc0(task->payload_len);
    task->receive_buf_received = 0;
    task->opaque = tdx_guest;

    object_ref(tdx_guest);
    tdx_generate_quote(task, tdx_guest->qg_sock_addr);
    run->tdx.get_quote.ret = TDG_VP_VMCALL_SUCCESS;
    return;

out_free:
    g_free(task->send_data);
    g_free(task);
}

#define SUPPORTED_TDVMCALLINFO_1_R11    (TDG_VP_VMCALL_SUBFUNC_SET_EVENT_NOTIFY_INTERRUPT)
#define SUPPORTED_TDVMCALLINFO_1_R12    (0)

void tdx_handle_get_tdvmcall_info(X86CPU *cpu, struct kvm_run *run)
{
    if (run->tdx.get_tdvmcall_info.leaf != 1) {
        return;
    }

    run->tdx.get_tdvmcall_info.r11 = (tdx_caps->user_tdvmcallinfo_1_r11 &
                                      SUPPORTED_TDVMCALLINFO_1_R11) |
                                     tdx_caps->kernel_tdvmcallinfo_1_r11;
    run->tdx.get_tdvmcall_info.r12 = (tdx_caps->user_tdvmcallinfo_1_r12 &
                                      SUPPORTED_TDVMCALLINFO_1_R12) |
                                     tdx_caps->kernel_tdvmcallinfo_1_r12;
    run->tdx.get_tdvmcall_info.r13 = 0;
    run->tdx.get_tdvmcall_info.r14 = 0;

    run->tdx.get_tdvmcall_info.ret = TDG_VP_VMCALL_SUCCESS;
}

void tdx_handle_setup_event_notify_interrupt(X86CPU *cpu, struct kvm_run *run)
{
    uint64_t vector = run->tdx.setup_event_notify.vector;

    if (vector >= 32 && vector < 256) {
        qemu_mutex_lock(&tdx_guest->lock);
        tdx_guest->event_notify_vector = vector;
        tdx_guest->event_notify_apicid = cpu->apic_id;
        qemu_mutex_unlock(&tdx_guest->lock);
        run->tdx.setup_event_notify.ret = TDG_VP_VMCALL_SUCCESS;
    } else {
        run->tdx.setup_event_notify.ret = TDG_VP_VMCALL_INVALID_OPERAND;
    }
}
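/*
 * Handle TDG.VP.VMCALL<ReportFatalError>: translate the error code and the
 * optional message encoded in guest registers into a GuestPanicInformation
 * record and report a guest panic to the monitor.
 */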
static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code,
                                        char *message, bool has_gpa,
                                        uint64_t gpa)
{
    GuestPanicInformation *panic_info;

    panic_info = g_new0(GuestPanicInformation, 1);
    panic_info->type = GUEST_PANIC_INFORMATION_TYPE_TDX;
    panic_info->u.tdx.error_code = (uint32_t)error_code;
    panic_info->u.tdx.message = message;
    panic_info->u.tdx.gpa = gpa;
    panic_info->u.tdx.has_gpa = has_gpa;

    qemu_system_guest_panicked(panic_info);
}

/*
 * Only 8 registers can contain a valid ASCII byte stream to form the fatal
 * message, and their sequence is: R14, R15, RBX, RDI, RSI, R8, R9, RDX
 */
#define TDX_FATAL_MESSAGE_MAX   64

#define TDX_REPORT_FATAL_ERROR_GPA_VALID    BIT_ULL(63)

int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run)
{
    uint64_t error_code = run->system_event.data[R_R12];
    uint64_t reg_mask = run->system_event.data[R_ECX];
    char *message = NULL;
    uint64_t *tmp;
    uint64_t gpa = -1ull;
    bool has_gpa = false;

    if (error_code & 0xffff) {
        error_report("TDX: REPORT_FATAL_ERROR: invalid error code: 0x%"PRIx64,
                     error_code);
        return -1;
    }

    if (reg_mask) {
        message = g_malloc0(TDX_FATAL_MESSAGE_MAX + 1);
        tmp = (uint64_t *)message;

#define COPY_REG(REG)                                   \
    do {                                                \
        if (reg_mask & BIT_ULL(REG)) {                  \
            *(tmp++) = run->system_event.data[REG];     \
        }                                               \
    } while (0)

        COPY_REG(R_R14);
        COPY_REG(R_R15);
        COPY_REG(R_EBX);
        COPY_REG(R_EDI);
        COPY_REG(R_ESI);
        COPY_REG(R_R8);
        COPY_REG(R_R9);
        COPY_REG(R_EDX);
        *((char *)tmp) = '\0';
    }
#undef COPY_REG

    if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) {
        gpa = run->system_event.data[R_R13];
        has_gpa = true;
    }

    tdx_panicked_on_fatal_error(cpu, error_code, message, has_gpa, gpa);

    return -1;
}

static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    return !!(tdx->attributes & TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE);
}

static void tdx_guest_set_sept_ve_disable(Object *obj, bool value, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    if (value) {
        tdx->attributes |= TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
    } else {
        tdx->attributes &= ~TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
    }
}

static char *tdx_guest_get_mrconfigid(Object *obj, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    return g_strdup(tdx->mrconfigid);
}

static void tdx_guest_set_mrconfigid(Object *obj, const char *value, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    g_free(tdx->mrconfigid);
    tdx->mrconfigid = g_strdup(value);
}

static char *tdx_guest_get_mrowner(Object *obj, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    return g_strdup(tdx->mrowner);
}

static void tdx_guest_set_mrowner(Object *obj, const char *value, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    g_free(tdx->mrowner);
    tdx->mrowner = g_strdup(value);
}

static char *tdx_guest_get_mrownerconfig(Object *obj, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    return g_strdup(tdx->mrownerconfig);
}

static void tdx_guest_set_mrownerconfig(Object *obj, const char *value, Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    g_free(tdx->mrownerconfig);
    tdx->mrownerconfig = g_strdup(value);
}

static void tdx_guest_get_qgs(Object *obj, Visitor *v,
                              const char *name, void *opaque,
                              Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    if (!tdx->qg_sock_addr) {
        error_setg(errp, "quote-generation-socket is not set");
        return;
    }
    visit_type_SocketAddress(v, name, &tdx->qg_sock_addr, errp);
}

static void tdx_guest_set_qgs(Object *obj, Visitor *v,
                              const char *name, void *opaque,
                              Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);
    SocketAddress *sock = NULL;

    if (!visit_type_SocketAddress(v, name, &sock, errp)) {
        return;
    }

    if (tdx->qg_sock_addr) {
        qapi_free_SocketAddress(tdx->qg_sock_addr);
    }

    tdx->qg_sock_addr = sock;
}

/* tdx guest */
OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest,
                                   tdx_guest,
                                   TDX_GUEST,
                                   X86_CONFIDENTIAL_GUEST,
                                   { TYPE_USER_CREATABLE },
                                   { NULL })

static void tdx_guest_init(Object *obj)
{
    ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
    TdxGuest *tdx = TDX_GUEST(obj);

    qemu_mutex_init(&tdx->lock);

    cgs->require_guest_memfd = true;
    tdx->attributes = TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;

    object_property_add_uint64_ptr(obj, "attributes", &tdx->attributes,
                                   OBJ_PROP_FLAG_READWRITE);
    object_property_add_bool(obj, "sept-ve-disable",
                             tdx_guest_get_sept_ve_disable,
                             tdx_guest_set_sept_ve_disable);
    object_property_add_str(obj, "mrconfigid",
                            tdx_guest_get_mrconfigid,
                            tdx_guest_set_mrconfigid);
    object_property_add_str(obj, "mrowner",
                            tdx_guest_get_mrowner, tdx_guest_set_mrowner);
    object_property_add_str(obj, "mrownerconfig",
                            tdx_guest_get_mrownerconfig,
                            tdx_guest_set_mrownerconfig);

    object_property_add(obj, "quote-generation-socket", "SocketAddress",
                        tdx_guest_get_qgs,
                        tdx_guest_set_qgs,
                        NULL, NULL);

    tdx->event_notify_vector = -1;
    tdx->event_notify_apicid = -1;
}

static void tdx_guest_finalize(Object *obj)
{
}

static void tdx_guest_class_init(ObjectClass *oc, const void *data)
{
    ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
    X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);

    klass->kvm_init = tdx_kvm_init;
    x86_klass->kvm_type = tdx_kvm_type;
    x86_klass->cpu_instance_init = tdx_cpu_instance_init;
    x86_klass->adjust_cpuid_features = tdx_adjust_cpuid_features;
    x86_klass->check_features = tdx_check_features;
}