xref: /openbmc/qemu/target/i386/kvm/tdx.c (revision e82989544e38062beeeaad88c175afbeed0400f8)
1 /*
2  * QEMU TDX support
3  *
4  * Copyright (c) 2025 Intel Corporation
5  *
6  * Author:
7  *      Xiaoyao Li <xiaoyao.li@intel.com>
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/error-report.h"
14 #include "qemu/base64.h"
15 #include "qemu/mmap-alloc.h"
16 #include "qapi/error.h"
17 #include "qapi/qapi-visit-sockets.h"
18 #include "qom/object_interfaces.h"
19 #include "crypto/hash.h"
20 #include "system/kvm_int.h"
21 #include "system/runstate.h"
22 #include "system/system.h"
23 #include "system/ramblock.h"
24 #include "system/address-spaces.h"
25 
26 #include <linux/kvm_para.h>
27 
28 #include "cpu.h"
29 #include "cpu-internal.h"
30 #include "host-cpu.h"
31 #include "hw/i386/apic_internal.h"
32 #include "hw/i386/apic-msidef.h"
33 #include "hw/i386/e820_memory_layout.h"
34 #include "hw/i386/tdvf.h"
35 #include "hw/i386/x86.h"
36 #include "hw/i386/tdvf-hob.h"
37 #include "hw/pci/msi.h"
38 #include "kvm_i386.h"
39 #include "tdx.h"
40 #include "tdx-quote-generator.h"
41 
42 #include "standard-headers/asm-x86/kvm_para.h"
43 
/* Allowed guest TSC frequency range, in kHz. */
#define TDX_MIN_TSC_FREQUENCY_KHZ   (100 * 1000)
#define TDX_MAX_TSC_FREQUENCY_KHZ   (10 * 1000 * 1000)

/* TD attribute bits. */
#define TDX_TD_ATTRIBUTES_DEBUG             BIT_ULL(0)
#define TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE   BIT_ULL(28)
#define TDX_TD_ATTRIBUTES_PKS               BIT_ULL(30)
#define TDX_TD_ATTRIBUTES_PERFMON           BIT_ULL(63)

/* TD attributes QEMU is willing to set (note: DEBUG is not included). */
#define TDX_SUPPORTED_TD_ATTRS  (TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE |\
                                 TDX_TD_ATTRIBUTES_PKS | \
                                 TDX_TD_ATTRIBUTES_PERFMON)

/* KVM paravirt features (CPUID 0x40000001.EAX) exposed to TDX guests. */
#define TDX_SUPPORTED_KVM_FEATURES  ((1U << KVM_FEATURE_NOP_IO_DELAY) | \
                                     (1U << KVM_FEATURE_PV_UNHALT) | \
                                     (1U << KVM_FEATURE_PV_TLB_FLUSH) | \
                                     (1U << KVM_FEATURE_PV_SEND_IPI) | \
                                     (1U << KVM_FEATURE_POLL_CONTROL) | \
                                     (1U << KVM_FEATURE_PV_SCHED_YIELD) | \
                                     (1U << KVM_FEATURE_MSI_EXT_DEST_ID))
63 
/* Singleton TdxGuest; set at the end of tdx_kvm_init(). */
static TdxGuest *tdx_guest;

/* Capabilities reported by KVM_TDX_CAPABILITIES; cached by get_tdx_capabilities(). */
static struct kvm_tdx_capabilities *tdx_caps;
/* CPUID bits QEMU treats as supported for TDX; built by tdx_setup_supported_cpuid(). */
static struct kvm_cpuid2 *tdx_supported_cpuid;
68 
69 /* Valid after kvm_arch_init()->confidential_guest_kvm_init()->tdx_kvm_init() */
is_tdx_vm(void)70 bool is_tdx_vm(void)
71 {
72     return !!tdx_guest;
73 }
74 
/* Scope of a KVM_MEMORY_ENCRYPT_OP ioctl: whole VM or one vCPU. */
enum tdx_ioctl_level {
    TDX_VM_IOCTL,
    TDX_VCPU_IOCTL,
};
79 
/*
 * Issue a TDX command via KVM_MEMORY_ENCRYPT_OP.
 *
 * @level: whether the ioctl targets the VM or a single vCPU.
 * @state: the CPUState for TDX_VCPU_IOCTL; ignored for TDX_VM_IOCTL.
 * @cmd_id: KVM_TDX_* command id.
 * @flags: command-specific flags, passed through in kvm_tdx_cmd.flags.
 * @data: command-specific payload, passed by address.
 *
 * Returns the ioctl result (negative errno on failure, with *errp set).
 */
static int tdx_ioctl_internal(enum tdx_ioctl_level level, void *state,
                              int cmd_id, __u32 flags, void *data,
                              Error **errp)
{
    struct kvm_tdx_cmd tdx_cmd = {};
    int r;

    const char *tdx_ioctl_name[] = {
        [KVM_TDX_CAPABILITIES] = "KVM_TDX_CAPABILITIES",
        [KVM_TDX_INIT_VM] = "KVM_TDX_INIT_VM",
        [KVM_TDX_INIT_VCPU] = "KVM_TDX_INIT_VCPU",
        [KVM_TDX_INIT_MEM_REGION] = "KVM_TDX_INIT_MEM_REGION",
        [KVM_TDX_FINALIZE_VM] = "KVM_TDX_FINALIZE_VM",
        [KVM_TDX_GET_CPUID] = "KVM_TDX_GET_CPUID",
    };

    tdx_cmd.id = cmd_id;
    tdx_cmd.flags = flags;
    tdx_cmd.data = (__u64)(unsigned long)data;

    switch (level) {
    case TDX_VM_IOCTL:
        r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
        break;
    case TDX_VCPU_IOCTL:
        r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
        break;
    default:
        error_setg(errp, "Invalid tdx_ioctl_level %d", level);
        return -EINVAL;
    }

    if (r < 0) {
        /*
         * Guard the name lookup: a cmd_id outside the designated
         * initializers would otherwise index past the array or hit a
         * NULL hole, and passing NULL to %s is undefined behavior.
         */
        const char *name = "UNKNOWN";

        if (cmd_id >= 0 && (size_t)cmd_id < ARRAY_SIZE(tdx_ioctl_name) &&
            tdx_ioctl_name[cmd_id]) {
            name = tdx_ioctl_name[cmd_id];
        }
        error_setg_errno(errp, -r, "TDX ioctl %s failed, hw_errors: 0x%llx",
                         name, tdx_cmd.hw_error);
    }
    return r;
}
118 
/* Convenience wrapper: issue a VM-scoped TDX command. */
static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data,
                               Error **errp)
{
    return tdx_ioctl_internal(TDX_VM_IOCTL, NULL, cmd_id, flags, data, errp);
}
124 
/* Convenience wrapper: issue a vCPU-scoped TDX command on @cpu. */
static inline int tdx_vcpu_ioctl(CPUState *cpu, int cmd_id, __u32 flags,
                                 void *data, Error **errp)
{
    return  tdx_ioctl_internal(TDX_VCPU_IOCTL, cpu, cmd_id, flags, data, errp);
}
130 
/*
 * Query KVM_TDX_CAPABILITIES and cache the result in tdx_caps.
 *
 * The number of CPUID configs is not known up front: start from the 6
 * entries of the first TDX generation and keep doubling the buffer for
 * as long as KVM answers -E2BIG, bailing out once the count would
 * exceed KVM_MAX_CPUID_ENTRIES.
 *
 * Returns 0 on success, a negative errno on failure (with *errp set).
 */
static int get_tdx_capabilities(Error **errp)
{
    struct kvm_tdx_capabilities *caps;
    /* 1st generation of TDX reports 6 cpuid configs */
    int nr_cpuid_configs = 6;
    size_t size;
    int r;

    do {
        Error *local_err = NULL;
        size = sizeof(struct kvm_tdx_capabilities) +
                      nr_cpuid_configs * sizeof(struct kvm_cpuid_entry2);
        caps = g_malloc0(size);
        caps->cpuid.nent = nr_cpuid_configs;

        r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps, &local_err);
        if (r == -E2BIG) {
            /* Buffer too small: retry with twice the entries. */
            g_free(caps);
            nr_cpuid_configs *= 2;
            if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) {
                error_report("KVM TDX seems broken that number of CPUID entries"
                             " in kvm_tdx_capabilities exceeds limit: %d",
                             KVM_MAX_CPUID_ENTRIES);
                error_propagate(errp, local_err);
                return r;
            }
            /* Drop the -E2BIG error before retrying. */
            error_free(local_err);
        } else if (r < 0) {
            g_free(caps);
            error_propagate(errp, local_err);
            return r;
        }
    } while (r == -E2BIG);

    tdx_caps = caps;

    return 0;
}
169 
/*
 * Record the MemoryRegion backing the TDVF firmware image.
 * May only be called once; a second registration trips the assert.
 */
void tdx_set_tdvf_region(MemoryRegion *tdvf_mr)
{
    assert(!tdx_guest->tdvf_mr);
    tdx_guest->tdvf_mr = tdvf_mr;
}
175 
/*
 * Return the TD_HOB section of the TDVF metadata.
 * A TDVF image without a TD_HOB section is unusable, so this exits
 * QEMU instead of returning NULL.
 */
static TdxFirmwareEntry *tdx_get_hob_entry(TdxGuest *tdx)
{
    TdxFirmwareEntry *entry;

    for_each_tdx_fw_entry(&tdx->tdvf, entry) {
        if (entry->type == TDVF_SECTION_TYPE_TD_HOB) {
            return entry;
        }
    }
    error_report("TDVF metadata doesn't specify TD_HOB location.");
    exit(1);
}
188 
/* Append one entry to the guest RAM map, growing the array by one. */
static void tdx_add_ram_entry(uint64_t address, uint64_t length,
                              enum TdxRamType type)
{
    TdxRamEntry *e;

    tdx_guest->ram_entries = g_renew(TdxRamEntry, tdx_guest->ram_entries,
                                     tdx_guest->nr_ram_entries + 1);

    e = &tdx_guest->ram_entries[tdx_guest->nr_ram_entries++];
    e->address = address;
    e->length = length;
    e->type = type;
}
201 
/*
 * Mark [address, address + length) as accepted (TDX_RAM_ADDED).
 *
 * The range must be fully contained in a single existing RAM entry.
 * If that entry is larger than the range, it is split: the accepted
 * part replaces the entry in place, and the remaining unaccepted head
 * and/or tail are appended as new TDX_RAM_UNACCEPTED entries (the map
 * is re-sorted later by the caller).
 *
 * Returns 0 on success, -1 if no single entry fully contains the range.
 */
static int tdx_accept_ram_range(uint64_t address, uint64_t length)
{
    uint64_t head_start, tail_start, head_length, tail_length;
    uint64_t tmp_address, tmp_length;
    TdxRamEntry *e;
    int i = 0;

    /* Find the first entry that overlaps the requested range. */
    do {
        if (i == tdx_guest->nr_ram_entries) {
            return -1;
        }

        e = &tdx_guest->ram_entries[i++];
    } while (address + length <= e->address || address >= e->address + e->length);

    /*
     * The to-be-accepted ram range must be fully contained by one
     * RAM entry.
     */
    if (e->address > address ||
        e->address + e->length < address + length) {
        return -1;
    }

    /* Already accepted: nothing to do. */
    if (e->type == TDX_RAM_ADDED) {
        return 0;
    }

    tmp_address = e->address;
    tmp_length = e->length;

    /* Shrink the entry to exactly the accepted range. */
    e->address = address;
    e->length = length;
    e->type = TDX_RAM_ADDED;

    /* Unaccepted remainder before the accepted range, if any. */
    head_length = address - tmp_address;
    if (head_length > 0) {
        head_start = tmp_address;
        tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED);
    }

    /* Unaccepted remainder after the accepted range, if any. */
    tail_start = address + length;
    if (tail_start < tmp_address + tmp_length) {
        tail_length = tmp_address + tmp_length - tail_start;
        tdx_add_ram_entry(tail_start, tail_length, TDX_RAM_UNACCEPTED);
    }

    return 0;
}
251 
tdx_ram_entry_compare(const void * lhs_,const void * rhs_)252 static int tdx_ram_entry_compare(const void *lhs_, const void* rhs_)
253 {
254     const TdxRamEntry *lhs = lhs_;
255     const TdxRamEntry *rhs = rhs_;
256 
257     if (lhs->address == rhs->address) {
258         return 0;
259     }
260     if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) {
261         return 1;
262     }
263     return -1;
264 }
265 
/*
 * Build the initial guest RAM map from the e820 table: one
 * TDX_RAM_UNACCEPTED entry per E820_RAM range.
 */
static void tdx_init_ram_entries(void)
{
    unsigned i, j, nr_e820_entries;

    /* e820_get_table(NULL) returns only the number of entries. */
    nr_e820_entries = e820_get_table(NULL);
    tdx_guest->ram_entries = g_new(TdxRamEntry, nr_e820_entries);

    for (i = 0, j = 0; i < nr_e820_entries; i++) {
        uint64_t addr, len;

        /* Only E820_RAM ranges are tracked; j counts the kept entries. */
        if (e820_get_entry(i, E820_RAM, &addr, &len)) {
            tdx_guest->ram_entries[j].address = addr;
            tdx_guest->ram_entries[j].length = len;
            tdx_guest->ram_entries[j].type = TDX_RAM_UNACCEPTED;
            j++;
        }
    }
    tdx_guest->nr_ram_entries = j;
}
285 
/*
 * Run KVM_TDX_INIT_VCPU on every vCPU, passing the guest physical
 * address of the TD HOB as the per-vCPU init data.  Failure is fatal.
 */
static void tdx_post_init_vcpus(void)
{
    TdxFirmwareEntry *hob;
    CPUState *cpu;

    hob = tdx_get_hob_entry(tdx_guest);
    CPU_FOREACH(cpu) {
        tdx_vcpu_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)(uintptr_t)hob->address,
                       &error_fatal);
    }
}
297 
/*
 * machine-init-done notifier: populate and finalize the TD.
 *
 * Sequence: build the RAM map from e820, locate/mmap the TDVF sections
 * and mark their ranges accepted, create the TD HOB, init the vCPUs,
 * copy every section into private memory via KVM_TDX_INIT_MEM_REGION
 * (with measurement extension when the section asks for it), discard
 * the now-stale shared TDVF copy, and issue KVM_TDX_FINALIZE_VM.
 * Any failure is fatal.
 */
static void tdx_finalize_vm(Notifier *notifier, void *unused)
{
    TdxFirmware *tdvf = &tdx_guest->tdvf;
    TdxFirmwareEntry *entry;
    RAMBlock *ram_block;
    Error *local_err = NULL;
    int r;

    tdx_init_ram_entries();

    for_each_tdx_fw_entry(tdvf, entry) {
        switch (entry->type) {
        case TDVF_SECTION_TYPE_BFV:
        case TDVF_SECTION_TYPE_CFV:
            /* Firmware volumes: source data lives inside the TDVF image. */
            entry->mem_ptr = tdvf->mem_ptr + entry->data_offset;
            break;
        case TDVF_SECTION_TYPE_TD_HOB:
        case TDVF_SECTION_TYPE_TEMP_MEM:
            /* Scratch sections: back them with anonymous host memory. */
            entry->mem_ptr = qemu_ram_mmap(-1, entry->size,
                                           qemu_real_host_page_size(), 0, 0);
            if (entry->mem_ptr == MAP_FAILED) {
                error_report("Failed to mmap memory for TDVF section %d",
                             entry->type);
                exit(1);
            }
            if (tdx_accept_ram_range(entry->address, entry->size)) {
                error_report("Failed to accept memory for TDVF section %d",
                             entry->type);
                qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
                exit(1);
            }
            break;
        default:
            error_report("Unsupported TDVF section %d", entry->type);
            exit(1);
        }
    }

    /* The HOB producer expects the RAM map sorted by address. */
    qsort(tdx_guest->ram_entries, tdx_guest->nr_ram_entries,
          sizeof(TdxRamEntry), &tdx_ram_entry_compare);

    tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest));

    tdx_post_init_vcpus();

    for_each_tdx_fw_entry(tdvf, entry) {
        struct kvm_tdx_init_mem_region region;
        uint32_t flags;

        region = (struct kvm_tdx_init_mem_region) {
            .source_addr = (uintptr_t)entry->mem_ptr,
            .gpa = entry->address,
            .nr_pages = entry->size >> 12,   /* size in 4KiB pages */
        };

        /* Extend the TD measurement only for sections that request it. */
        flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ?
                KVM_TDX_MEASURE_MEMORY_REGION : 0;

        /* The ioctl may be interrupted; retry, dropping stale errors. */
        do {
            error_free(local_err);
            local_err = NULL;
            r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags,
                               &region, &local_err);
        } while (r == -EAGAIN || r == -EINTR);
        if (r < 0) {
            error_report_err(local_err);
            exit(1);
        }

        /* Scratch sections are copied into private memory; unmap them. */
        if (entry->type == TDVF_SECTION_TYPE_TD_HOB ||
            entry->type == TDVF_SECTION_TYPE_TEMP_MEM) {
            qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
            entry->mem_ptr = NULL;
        }
    }

    /*
     * TDVF image has been copied into private region above via
     * KVM_MEMORY_MAPPING. It becomes useless.
     */
    ram_block = tdx_guest->tdvf_mr->ram_block;
    ram_block_discard_range(ram_block, 0, ram_block->max_length);

    tdx_vm_ioctl(KVM_TDX_FINALIZE_VM, 0, NULL, &error_fatal);
    CONFIDENTIAL_GUEST_SUPPORT(tdx_guest)->ready = true;
}
384 
/* Run tdx_finalize_vm() once machine initialization has completed. */
static Notifier tdx_machine_done_notify = {
    .notify = tdx_finalize_vm,
};
388 
389 /*
390  * Some CPUID bits change from fixed1 to configurable bits when TDX module
391  * supports TDX_FEATURES0.VE_REDUCTION. e.g., MCA/MCE/MTRR/CORE_CAPABILITY.
392  *
393  * To make QEMU work with all the versions of TDX module, keep the fixed1 bits
394  * here if they are ever fixed1 bits in any of the version though not fixed1 in
395  * the latest version. Otherwise, with the older version of TDX module, QEMU may
396  * treat the fixed1 bit as unsupported.
397  *
398  * For newer TDX module, it does no harm to keep them in tdx_fixed1_bits even
399  * though they changed to configurable bits. Because tdx_fixed1_bits is used to
400  * setup the supported bits.
401  */
KvmCpuidInfo tdx_fixed1_bits = {
    .cpuid.nent = 8,
    .entries[0] = {
        .function = 0x1,
        .index = 0,
        .ecx = CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_DTES64 |
               CPUID_EXT_DSCPL | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 |
               CPUID_EXT_PDCM | CPUID_EXT_PCID | CPUID_EXT_SSE41 |
               CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
               CPUID_EXT_POPCNT | CPUID_EXT_AES | CPUID_EXT_XSAVE |
               CPUID_EXT_RDRAND | CPUID_EXT_HYPERVISOR,
        .edx = CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
               CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
               CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
               CPUID_PAT | CPUID_CLFLUSH | CPUID_DTS | CPUID_MMX | CPUID_FXSR |
               CPUID_SSE | CPUID_SSE2,
    },
    .entries[1] = {
        .function = 0x6,
        .index = 0,
        .eax = CPUID_6_EAX_ARAT,
    },
    .entries[2] = {
        .function = 0x7,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .ebx = CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_FDP_EXCPTN_ONLY |
               CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_INVPCID |
               CPUID_7_0_EBX_ZERO_FCS_FDS | CPUID_7_0_EBX_RDSEED |
               CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT |
               CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_SHA_NI,
        .ecx = CPUID_7_0_ECX_BUS_LOCK_DETECT | CPUID_7_0_ECX_MOVDIRI |
               CPUID_7_0_ECX_MOVDIR64B,
        .edx = CPUID_7_0_EDX_MD_CLEAR | CPUID_7_0_EDX_SPEC_CTRL |
               CPUID_7_0_EDX_STIBP | CPUID_7_0_EDX_FLUSH_L1D |
               CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_CORE_CAPABILITY |
               CPUID_7_0_EDX_SPEC_CTRL_SSBD,
    },
    .entries[3] = {
        .function = 0x7,
        .index = 2,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .edx = CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL |
               CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_BHI_CTRL,
    },
    .entries[4] = {
        .function = 0xD,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = XSTATE_FP_MASK | XSTATE_SSE_MASK,
    },
    .entries[5] = {
        .function = 0xD,
        .index = 1,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC|
               CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES,
    },
    .entries[6] = {
        .function = 0x80000001,
        .index = 0,
        .ecx = CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
        /*
         * Strictly speaking, SYSCALL is not fixed1 bit since it depends on
         * the CPU to be in 64-bit mode. But here fixed1 is used to serve the
         * purpose of supported bits for TDX. In this sense, SYSCALL is always
         * supported.
         */
        .edx = CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
               CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
    },
    .entries[7] = {
        .function = 0x80000007,
        .index = 0,
        .edx = CPUID_APM_INVTSC,
    },
};
479 
/* Maps one TD attribute bit to the CPUID feature it controls. */
typedef struct TdxAttrsMap {
    uint32_t attr_index;    /* bit position within tdx_caps->supported_attrs */
    uint32_t cpuid_leaf;    /* CPUID leaf (EAX input) */
    uint32_t cpuid_subleaf; /* CPUID subleaf (ECX input) */
    int cpuid_reg;          /* output register: R_EAX..R_EDX */
    uint32_t feat_mask;     /* feature bit(s) within that register */
} TdxAttrsMap;
487 
/*
 * TD attribute bits that, when reported in tdx_caps->supported_attrs,
 * make the corresponding CPUID feature bit available to the guest.
 */
static TdxAttrsMap tdx_attrs_maps[] = {
    {.attr_index = 27,          /* LASS */
     .cpuid_leaf = 7,
     .cpuid_subleaf = 1,
     .cpuid_reg = R_EAX,
     .feat_mask = CPUID_7_1_EAX_LASS,},

    {.attr_index = 30,          /* PKS */
     .cpuid_leaf = 7,
     .cpuid_subleaf = 0,
     .cpuid_reg = R_ECX,
     .feat_mask = CPUID_7_0_ECX_PKS,},

    {.attr_index = 31,          /* Key Locker */
     .cpuid_leaf = 7,
     .cpuid_subleaf = 0,
     .cpuid_reg = R_ECX,
     .feat_mask = CPUID_7_0_ECX_KeyLocker,},
};
507 
/* A CPUID feature whose availability follows an XFAM bit. */
typedef struct TdxXFAMDep {
    int xfam_bit;           /* bit position within tdx_caps->supported_xfam */
    FeatureMask feat_mask;  /* dependent feature word and mask */
} TdxXFAMDep;
512 
/*
 * Note, only the CPUID bits whose virtualization type are "XFAM & Native" are
 * defined here.
 *
 * For those whose virtualization type are "XFAM & Configured & Native", they
 * are reported as configurable bits. And they are not supported if not in the
 * configurable bits list from KVM even if the corresponding XFAM bit is
 * supported.
 */
/* XSTATE bit -> dependent CPUID feature; see the comment above. */
TdxXFAMDep tdx_xfam_deps[] = {
    { XSTATE_YMM_BIT,       { FEAT_1_ECX, CPUID_EXT_FMA }},
    { XSTATE_YMM_BIT,       { FEAT_7_0_EBX, CPUID_7_0_EBX_AVX2 }},
    { XSTATE_OPMASK_BIT,    { FEAT_7_0_ECX, CPUID_7_0_ECX_AVX512_VBMI}},
    { XSTATE_OPMASK_BIT,    { FEAT_7_0_EDX, CPUID_7_0_EDX_AVX512_FP16}},
    { XSTATE_PT_BIT,        { FEAT_7_0_EBX, CPUID_7_0_EBX_INTEL_PT}},
    { XSTATE_PKRU_BIT,      { FEAT_7_0_ECX, CPUID_7_0_ECX_PKU}},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_BF16 }},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE }},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_INT8 }},
};
533 
/*
 * Look up (function, index) in tdx_supported_cpuid, appending a new
 * zero-filled entry when not yet present.
 *
 * The backing array holds at most KVM_MAX_CPUID_ENTRIES entries;
 * overflowing it is a fatal programming error.
 * Fixes the typo "requries" in the diagnostic message.
 */
static struct kvm_cpuid_entry2 *find_in_supported_entry(uint32_t function,
                                                        uint32_t index)
{
    struct kvm_cpuid_entry2 *e;

    e = cpuid_find_entry(tdx_supported_cpuid, function, index);
    if (!e) {
        if (tdx_supported_cpuid->nent >= KVM_MAX_CPUID_ENTRIES) {
            error_report("tdx_supported_cpuid requires more space than %d entries",
                          KVM_MAX_CPUID_ENTRIES);
            exit(1);
        }
        e = &tdx_supported_cpuid->entries[tdx_supported_cpuid->nent++];
        e->function = function;
        e->index = index;
    }

    return e;
}
553 
tdx_add_supported_cpuid_by_fixed1_bits(void)554 static void tdx_add_supported_cpuid_by_fixed1_bits(void)
555 {
556     struct kvm_cpuid_entry2 *e, *e1;
557     int i;
558 
559     for (i = 0; i < tdx_fixed1_bits.cpuid.nent; i++) {
560         e = &tdx_fixed1_bits.entries[i];
561 
562         e1 = find_in_supported_entry(e->function, e->index);
563         e1->eax |= e->eax;
564         e1->ebx |= e->ebx;
565         e1->ecx |= e->ecx;
566         e1->edx |= e->edx;
567     }
568 }
569 
/*
 * For each TD attribute bit KVM reports as supported, OR the dependent
 * CPUID feature bit(s) into the supported-CPUID table (see tdx_attrs_maps).
 */
static void tdx_add_supported_cpuid_by_attrs(void)
{
    struct kvm_cpuid_entry2 *e;
    TdxAttrsMap *map;
    int i;

    for (i = 0; i < ARRAY_SIZE(tdx_attrs_maps); i++) {
        map = &tdx_attrs_maps[i];
        /* Skip attributes the platform does not support. */
        if (!((1ULL << map->attr_index) & tdx_caps->supported_attrs)) {
            continue;
        }

        e = find_in_supported_entry(map->cpuid_leaf, map->cpuid_subleaf);

        switch(map->cpuid_reg) {
        case R_EAX:
            e->eax |= map->feat_mask;
            break;
        case R_EBX:
            e->ebx |= map->feat_mask;
            break;
        case R_ECX:
            e->ecx |= map->feat_mask;
            break;
        case R_EDX:
            e->edx |= map->feat_mask;
            break;
        }
    }
}
600 
/*
 * Derive supported CPUID bits from tdx_caps->supported_xfam:
 * XFAM-dependent feature bits (tdx_xfam_deps) plus the raw XCR0/XSS
 * capability masks reported through CPUID leaf 0xD subleaves 0 and 1.
 */
static void tdx_add_supported_cpuid_by_xfam(void)
{
    struct kvm_cpuid_entry2 *e;
    int i;

    const TdxXFAMDep *xfam_dep;
    const FeatureWordInfo *f;
    for (i = 0; i < ARRAY_SIZE(tdx_xfam_deps); i++) {
        xfam_dep = &tdx_xfam_deps[i];
        /* Skip deps whose XFAM bit is not supported by the platform. */
        if (!((1ULL << xfam_dep->xfam_bit) & tdx_caps->supported_xfam)) {
            continue;
        }

        /* Only CPUID-backed feature words can be expressed here. */
        f = &feature_word_info[xfam_dep->feat_mask.index];
        if (f->type != CPUID_FEATURE_WORD) {
            continue;
        }

        e = find_in_supported_entry(f->cpuid.eax, f->cpuid.ecx);
        switch(f->cpuid.reg) {
        case R_EAX:
            e->eax |= xfam_dep->feat_mask.mask;
            break;
        case R_EBX:
            e->ebx |= xfam_dep->feat_mask.mask;
            break;
        case R_ECX:
            e->ecx |= xfam_dep->feat_mask.mask;
            break;
        case R_EDX:
            e->edx |= xfam_dep->feat_mask.mask;
            break;
        }
    }

    /* Leaf 0xD.0: supported XCR0 bits, low 32 in EAX, high 32 in EDX. */
    e = find_in_supported_entry(0xd, 0);
    e->eax |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK);
    e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK) >> 32;

    /* Leaf 0xD.1: supported XSS bits, low 32 in ECX, high 32 in EDX. */
    e = find_in_supported_entry(0xd, 1);
    /*
     * Mark XFD always support for TDX, it will be cleared finally in
     * tdx_adjust_cpuid_features() if XFD is unavailable on the hardware
     * because in this case the original data has it as 0.
     */
    e->eax |= CPUID_XSAVE_XFD;
    e->ecx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK);
    e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK) >> 32;
}
650 
/*
 * Publish the KVM PV feature set (CPUID 0x40000001.EAX) that TDX
 * guests may use.  Note: this assigns, it does not OR.
 */
static void tdx_add_supported_kvm_features(void)
{
    struct kvm_cpuid_entry2 *e;

    e = find_in_supported_entry(0x40000001, 0);
    e->eax = TDX_SUPPORTED_KVM_FEATURES;
}
658 
/*
 * Build tdx_supported_cpuid once: seed it with the configurable CPUID
 * bits from tdx_caps, then fold in fixed1 bits, attribute-derived bits,
 * XFAM-derived bits and the KVM PV feature set.  Idempotent.
 */
static void tdx_setup_supported_cpuid(void)
{
    if (tdx_supported_cpuid) {
        return;
    }

    tdx_supported_cpuid = g_malloc0(sizeof(*tdx_supported_cpuid) +
                    KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2));

    memcpy(tdx_supported_cpuid->entries, tdx_caps->cpuid.entries,
           tdx_caps->cpuid.nent * sizeof(struct kvm_cpuid_entry2));
    tdx_supported_cpuid->nent = tdx_caps->cpuid.nent;

    tdx_add_supported_cpuid_by_fixed1_bits();
    tdx_add_supported_cpuid_by_attrs();
    tdx_add_supported_cpuid_by_xfam();

    tdx_add_supported_kvm_features();
}
678 
/*
 * ConfidentialGuestSupport init hook for TDX.
 *
 * Validates machine configuration against TDX constraints (no SMM, no
 * PIC, split irqchip required), fetches TDX capabilities from KVM,
 * builds the supported-CPUID table, enables the MapGPA hypercall,
 * disables read-only memslots and registers the finalize notifier.
 *
 * Returns 0 on success, negative errno on failure (with *errp set).
 */
static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    X86MachineState *x86ms = X86_MACHINE(ms);
    TdxGuest *tdx = TDX_GUEST(cgs);
    int r = 0;

    kvm_mark_guest_state_protected();

    /* SMM is not supported: default it off, reject explicit "on". */
    if (x86ms->smm == ON_OFF_AUTO_AUTO) {
        x86ms->smm = ON_OFF_AUTO_OFF;
    } else if (x86ms->smm == ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM doesn't support SMM");
        return -EINVAL;
    }

    /* Same policy for the i8259 PIC. */
    if (x86ms->pic == ON_OFF_AUTO_AUTO) {
        x86ms->pic = ON_OFF_AUTO_OFF;
    } else if (x86ms->pic == ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM doesn't support PIC");
        return -EINVAL;
    }

    /* TDX requires a split kernel irqchip. */
    if (kvm_state->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
        kvm_state->kernel_irqchip_split = ON_OFF_AUTO_ON;
    } else if (kvm_state->kernel_irqchip_split != ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM requires kernel_irqchip to be split");
        return -EINVAL;
    }

    /* tdx_caps is cached across invocations. */
    if (!tdx_caps) {
        r = get_tdx_capabilities(errp);
        if (r) {
            return r;
        }
    }

    tdx_setup_supported_cpuid();

    /* TDX relies on KVM_HC_MAP_GPA_RANGE to handle TDG.VP.VMCALL<MapGPA> */
    if (!kvm_enable_hypercall(BIT_ULL(KVM_HC_MAP_GPA_RANGE))) {
        return -EOPNOTSUPP;
    }

    /*
     * Set kvm_readonly_mem_allowed to false, because TDX only supports readonly
     * memory for shared memory but not for private memory. Besides, whether a
     * memslot is private or shared is not determined by QEMU.
     *
     * Thus, just mark readonly memory not supported for simplicity.
     */
    kvm_readonly_mem_allowed = false;

    qemu_add_machine_init_done_notifier(&tdx_machine_done_notify);

    tdx_guest = tdx;
    return 0;
}
737 
/* Return the KVM VM type used for TDX guests. */
static int tdx_kvm_type(X86ConfidentialGuest *cg)
{
    /* Do the object check */
    TDX_GUEST(cg);

    return KVM_X86_TDX_VM;
}
745 
/*
 * Per-vCPU instance init hook: apply TDX constraints to the CPU object.
 * Named CPU models are rejected; PMU is forced off, invtsc forced on,
 * and CPUID leaf 0x1f is always exposed.
 */
static void tdx_cpu_instance_init(X86ConfidentialGuest *cg, CPUState *cpu)
{
    X86CPUClass *xcc = X86_CPU_GET_CLASS(cpu);
    X86CPU *x86cpu = X86_CPU(cpu);

    if (xcc->model) {
        error_report("Named cpu model is not supported for TDX yet!");
        exit(1);
    }

    object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort);

    /* invtsc is fixed1 for TD guest */
    object_property_set_bool(OBJECT(cpu), "invtsc", true, &error_abort);

    x86cpu->force_cpuid_0x1f = true;
}
763 
/*
 * Adjust one CPUID register value for TDX:
 *  - OR in the fixed1 bits (the TDX module forces them on), then
 *  - for leaves QEMU models as feature words, AND with the bits in
 *    tdx_supported_cpuid (dropping anything TDX cannot provide).
 */
static uint32_t tdx_adjust_cpuid_features(X86ConfidentialGuest *cg,
                                          uint32_t feature, uint32_t index,
                                          int reg, uint32_t value)
{
    struct kvm_cpuid_entry2 *e;

    e = cpuid_find_entry(&tdx_fixed1_bits.cpuid, feature, index);
    if (e) {
        value |= cpuid_entry_get_reg(e, reg);
    }

    if (is_feature_word_cpuid(feature, index, reg)) {
        e = cpuid_find_entry(tdx_supported_cpuid, feature, index);
        if (e) {
            value &= cpuid_entry_get_reg(e, reg);
        }
    }

    return value;
}
784 
/*
 * Fetch the effective CPUID of a TD vCPU via KVM_TDX_GET_CPUID.
 *
 * The buffer is sized for KVM_MAX_CPUID_ENTRIES; on -E2BIG, retry with
 * the entry count KVM wrote back into nent.
 *
 * Fixes two defects in the retry/error handling:
 *  - use-after-free: the original freed the buffer and then read
 *    fetch_cpuid->nent from it; read nent before freeing.
 *  - leak: the original returned NULL on error without freeing the
 *    last allocated buffer.
 *
 * Returns a heap-allocated kvm_cpuid2 the caller must g_free(), or
 * NULL on failure with *ret set to the negative errno.
 */
static struct kvm_cpuid2 *tdx_fetch_cpuid(CPUState *cpu, int *ret)
{
    struct kvm_cpuid2 *fetch_cpuid;
    int size = KVM_MAX_CPUID_ENTRIES;
    Error *local_err = NULL;
    int r;

    do {
        error_free(local_err);
        local_err = NULL;

        fetch_cpuid = g_malloc0(sizeof(*fetch_cpuid) +
                                sizeof(struct kvm_cpuid_entry2) * size);
        fetch_cpuid->nent = size;
        r = tdx_vcpu_ioctl(cpu, KVM_TDX_GET_CPUID, 0, fetch_cpuid, &local_err);
        if (r == -E2BIG) {
            /* Read the required count BEFORE releasing the buffer. */
            size = fetch_cpuid->nent;
            g_free(fetch_cpuid);
        }
    } while (r == -E2BIG);

    if (r < 0) {
        g_free(fetch_cpuid);
        error_report_err(local_err);
        *ret = r;
        return NULL;
    }

    return fetch_cpuid;
}
814 
/*
 * Compare the user-requested CPU features against what the TD actually
 * provides (KVM_TDX_GET_CPUID).
 *
 * Features requested but unavailable are cleared; features forced on
 * by TDX but not requested are flagged.  With -cpu ...,enforce any
 * mismatch is an error.  Also requires guest phys-bits to equal host
 * phys-bits.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int tdx_check_features(X86ConfidentialGuest *cg, CPUState *cs)
{
    uint64_t actual, requested, unavailable, forced_on;
    g_autofree struct kvm_cpuid2 *fetch_cpuid;
    const char *forced_on_prefix = NULL;
    const char *unav_prefix = NULL;
    struct kvm_cpuid_entry2 *entry;
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    FeatureWordInfo *wi;
    FeatureWord w;
    bool mismatch = false;
    int r;

    fetch_cpuid = tdx_fetch_cpuid(cs, &r);
    if (!fetch_cpuid) {
        return r;
    }

    /* Only warn about mismatches when the user asked for checking. */
    if (cpu->check_cpuid || cpu->enforce_cpuid) {
        unav_prefix = "TDX doesn't support requested feature";
        forced_on_prefix = "TDX forcibly sets the feature";
    }

    for (w = 0; w < FEATURE_WORDS; w++) {
        wi = &feature_word_info[w];
        actual = 0;

        switch (wi->type) {
        case CPUID_FEATURE_WORD:
            entry = cpuid_find_entry(fetch_cpuid, wi->cpuid.eax, wi->cpuid.ecx);
            if (!entry) {
                /*
                 * If KVM doesn't report it means it's totally configurable
                 * by QEMU
                 */
                continue;
            }

            actual = cpuid_entry_get_reg(entry, wi->cpuid.reg);
            break;
        case MSR_FEATURE_WORD:
            /*
             * TODO:
             * validate MSR features when KVM has interface report them.
             */
            continue;
        }

        /* Fixup for special cases */
        switch (w) {
        case FEAT_8000_0001_EDX:
            /*
             * Intel enumerates SYSCALL bit as 1 only when processor in 64-bit
             * mode and before vcpu running it's not in 64-bit mode.
             */
            actual |= CPUID_EXT2_SYSCALL;
            break;
        default:
            break;
        }

        requested = env->features[w];
        unavailable = requested & ~actual;
        mark_unavailable_features(cpu, w, unavailable, unav_prefix);
        if (unavailable) {
            mismatch = true;
        }

        forced_on = actual & ~requested;
        mark_forced_on_features(cpu, w, forced_on, forced_on_prefix);
        if (forced_on) {
            mismatch = true;
        }
    }

    if (cpu->enforce_cpuid && mismatch) {
        return -EINVAL;
    }

    if (cpu->phys_bits != host_cpu_phys_bits()) {
        error_report("TDX requires guest CPU physical bits (%u) "
                     "to match host CPU physical bits (%u)",
                     cpu->phys_bits, host_cpu_phys_bits());
        return -EINVAL;
    }

    return 0;
}
904 
tdx_validate_attributes(TdxGuest * tdx,Error ** errp)905 static int tdx_validate_attributes(TdxGuest *tdx, Error **errp)
906 {
907     if ((tdx->attributes & ~tdx_caps->supported_attrs)) {
908         error_setg(errp, "Invalid attributes 0x%"PRIx64" for TDX VM "
909                    "(KVM supported: 0x%"PRIx64")", tdx->attributes,
910                    (uint64_t)tdx_caps->supported_attrs);
911         return -1;
912     }
913 
914     if (tdx->attributes & ~TDX_SUPPORTED_TD_ATTRS) {
915         error_setg(errp, "Some QEMU unsupported TD attribute bits being "
916                     "requested: 0x%"PRIx64" (QEMU supported: 0x%"PRIx64")",
917                     tdx->attributes, (uint64_t)TDX_SUPPORTED_TD_ATTRS);
918         return -1;
919     }
920 
921     return 0;
922 }
923 
setup_td_guest_attributes(X86CPU * x86cpu,Error ** errp)924 static int setup_td_guest_attributes(X86CPU *x86cpu, Error **errp)
925 {
926     CPUX86State *env = &x86cpu->env;
927 
928     tdx_guest->attributes |= (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS) ?
929                              TDX_TD_ATTRIBUTES_PKS : 0;
930     tdx_guest->attributes |= x86cpu->enable_pmu ? TDX_TD_ATTRIBUTES_PERFMON : 0;
931 
932     return tdx_validate_attributes(tdx_guest, errp);
933 }
934 
setup_td_xfam(X86CPU * x86cpu,Error ** errp)935 static int setup_td_xfam(X86CPU *x86cpu, Error **errp)
936 {
937     CPUX86State *env = &x86cpu->env;
938     uint64_t xfam;
939 
940     xfam = env->features[FEAT_XSAVE_XCR0_LO] |
941            env->features[FEAT_XSAVE_XCR0_HI] |
942            env->features[FEAT_XSAVE_XSS_LO] |
943            env->features[FEAT_XSAVE_XSS_HI];
944 
945     if (xfam & ~tdx_caps->supported_xfam) {
946         error_setg(errp, "Invalid XFAM 0x%"PRIx64" for TDX VM (supported: 0x%"PRIx64"))",
947                    xfam, (uint64_t)tdx_caps->supported_xfam);
948         return -1;
949     }
950 
951     tdx_guest->xfam = xfam;
952     return 0;
953 }
954 
tdx_filter_cpuid(struct kvm_cpuid2 * cpuids)955 static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids)
956 {
957     int i, dest_cnt = 0;
958     struct kvm_cpuid_entry2 *src, *dest, *conf;
959 
960     for (i = 0; i < cpuids->nent; i++) {
961         src = cpuids->entries + i;
962         conf = cpuid_find_entry(&tdx_caps->cpuid, src->function, src->index);
963         if (!conf) {
964             continue;
965         }
966         dest = cpuids->entries + dest_cnt;
967 
968         dest->function = src->function;
969         dest->index = src->index;
970         dest->flags = src->flags;
971         dest->eax = src->eax & conf->eax;
972         dest->ebx = src->ebx & conf->ebx;
973         dest->ecx = src->ecx & conf->ecx;
974         dest->edx = src->edx & conf->edx;
975 
976         dest_cnt++;
977     }
978     cpuids->nent = dest_cnt++;
979 }
980 
/*
 * One-shot TDX VM-scope initialization, run from the vCPU pre-create
 * hook.  Guarded by tdx_guest->lock and the 'initialized' flag, so only
 * the first caller does the work and later vCPUs return 0 immediately.
 *
 * Steps: enable the 25MHz APIC bus frequency cap, validate and program
 * the TSC frequency, decode the optional mrconfigid/mrowner/mrownerconfig
 * base64 SHA-384 digests, build and filter the CPUID set, then issue
 * KVM_TDX_INIT_VM (retrying on transient entropy exhaustion).
 *
 * Returns 0 on success, a negative errno / -1 on failure with *errp set.
 */
int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
{
    X86CPU *x86cpu = X86_CPU(cpu);
    CPUX86State *env = &x86cpu->env;
    g_autofree struct kvm_tdx_init_vm *init_vm = NULL;
    Error *local_err = NULL;
    size_t data_len;
    int retry = 10000;
    int r = 0;

    QEMU_LOCK_GUARD(&tdx_guest->lock);
    if (tdx_guest->initialized) {
        return r;
    }

    /* Room for the fixed header plus up to KVM_MAX_CPUID_ENTRIES entries. */
    init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) +
                        sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);

    if (!kvm_check_extension(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS)) {
        error_setg(errp, "KVM doesn't support KVM_CAP_X86_APIC_BUS_CYCLES_NS");
        return -EOPNOTSUPP;
    }

    r = kvm_vm_enable_cap(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
                          0, TDX_APIC_BUS_CYCLES_NS);
    if (r < 0) {
        error_setg_errno(errp, -r,
                         "Unable to set core crystal clock frequency to 25MHz");
        return r;
    }

    /* A user-specified TSC frequency must lie within the allowed range... */
    if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ ||
                         env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) {
        error_setg(errp, "Invalid TSC %"PRId64" KHz, must specify cpu_frequency "
                         "between [%d, %d] kHz", env->tsc_khz,
                         TDX_MIN_TSC_FREQUENCY_KHZ, TDX_MAX_TSC_FREQUENCY_KHZ);
       return -EINVAL;
    }

    /* ... and be a whole multiple of 25MHz (0 % x == 0, so unset passes). */
    if (env->tsc_khz % (25 * 1000)) {
        error_setg(errp, "Invalid TSC %"PRId64" KHz, it must be multiple of 25MHz",
                   env->tsc_khz);
        return -EINVAL;
    }

    /* it's safe even env->tsc_khz is 0. KVM uses host's tsc_khz in this case */
    r = kvm_vm_ioctl(kvm_state, KVM_SET_TSC_KHZ, env->tsc_khz);
    if (r < 0) {
        error_setg_errno(errp, -r, "Unable to set TSC frequency to %"PRId64" kHz",
                         env->tsc_khz);
        return r;
    }

    /* Each measurement-register seed is a base64-encoded SHA-384 digest. */
    if (tdx_guest->mrconfigid) {
        g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid,
                              strlen(tdx_guest->mrconfigid), &data_len, errp);
        if (!data) {
            return -1;
        }
        if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
            error_setg(errp, "TDX 'mrconfigid' sha384 digest was %ld bytes, "
                             "expected %d bytes", data_len,
                             QCRYPTO_HASH_DIGEST_LEN_SHA384);
            return -1;
        }
        memcpy(init_vm->mrconfigid, data, data_len);
    }

    if (tdx_guest->mrowner) {
        g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner,
                              strlen(tdx_guest->mrowner), &data_len, errp);
        if (!data) {
            return -1;
        }
        if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
            error_setg(errp, "TDX 'mrowner' sha384 digest was %ld bytes, "
                             "expected %d bytes", data_len,
                             QCRYPTO_HASH_DIGEST_LEN_SHA384);
            return -1;
        }
        memcpy(init_vm->mrowner, data, data_len);
    }

    if (tdx_guest->mrownerconfig) {
        g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrownerconfig,
                            strlen(tdx_guest->mrownerconfig), &data_len, errp);
        if (!data) {
            return -1;
        }
        if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
            error_setg(errp, "TDX 'mrownerconfig' sha384 digest was %ld bytes, "
                             "expected %d bytes", data_len,
                             QCRYPTO_HASH_DIGEST_LEN_SHA384);
            return -1;
        }
        memcpy(init_vm->mrownerconfig, data, data_len);
    }

    r = setup_td_guest_attributes(x86cpu, errp);
    if (r) {
        return r;
    }

    r = setup_td_xfam(x86cpu, errp);
    if (r) {
        return r;
    }

    /* Build the full CPUID set, then keep only what TDX lets us configure. */
    init_vm->cpuid.nent = kvm_x86_build_cpuid(env, init_vm->cpuid.entries, 0);
    tdx_filter_cpuid(&init_vm->cpuid);

    init_vm->attributes = tdx_guest->attributes;
    init_vm->xfam = tdx_guest->xfam;

    /*
     * KVM_TDX_INIT_VM gets -EAGAIN when KVM side SEAMCALL(TDH_MNG_CREATE)
     * gets TDX_RND_NO_ENTROPY due to Random number generation (e.g., RDRAND or
     * RDSEED) is busy.
     *
     * Retry for the case.
     */
    do {
        error_free(local_err);
        local_err = NULL;
        r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, init_vm, &local_err);
    } while (r == -EAGAIN && --retry);

    if (r < 0) {
        if (!retry) {
            error_append_hint(&local_err, "Hardware RNG (Random Number "
            "Generator) is busy occupied by someone (via RDRAND/RDSEED) "
            "maliciously, which leads to KVM_TDX_INIT_VM keeping failure "
            "due to lack of entropy.\n");
        }
        error_propagate(errp, local_err);
        return r;
    }

    tdx_guest->initialized = true;

    return 0;
}
1123 
/*
 * Parse the TDVF metadata from the firmware flash image into
 * tdx_guest->tdvf; returns tdvf_parse_metadata()'s result directly.
 */
int tdx_parse_tdvf(void *flash_ptr, int size)
{
    return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size);
}
1128 
/*
 * Deliver the registered event-notify interrupt to the guest as a
 * fixed-delivery MSI.  Silently does nothing if no valid vector
 * (32..255) has been configured.
 */
static void tdx_inject_interrupt(TdxGuest *tdx)
{
    uint32_t dest_apicid, vec;
    MSIMessage msg;
    int r;

    /* Snapshot the notification target under the lock. */
    qemu_mutex_lock(&tdx->lock);
    vec = tdx->event_notify_vector;
    dest_apicid = tdx->event_notify_apicid;
    qemu_mutex_unlock(&tdx->lock);

    if (vec < 32 || vec > 255) {
        return;
    }

    /* Low 8 APIC ID bits go in the legacy field, the rest in the upper word. */
    msg.address = ((dest_apicid & 0xff) << MSI_ADDR_DEST_ID_SHIFT) |
                  (((uint64_t)dest_apicid & 0xffffff00) << 32);
    msg.data = vec | (APIC_DM_FIXED << MSI_DATA_DELIVERY_MODE_SHIFT);

    r = kvm_irqchip_send_msi(kvm_state, msg);
    if (r < 0) {
        /* In this case, no better way to tell it to guest. Log it. */
        error_report("TDX: injection interrupt %d failed, interrupt lost (%s).",
                     vec, strerror(-r));
    }
}
1155 
/*
 * Completion callback for a GetQuote request submitted via
 * tdx_handle_get_quote().  Writes the quote payload (on success) and the
 * response header back into the guest's shared buffer, notifies the
 * guest through the event-notify interrupt, and releases the task and
 * the TdxGuest reference taken at submission.
 */
static void tdx_get_quote_completion(TdxGenerateQuoteTask *task)
{
    TdxGuest *tdx = task->opaque;
    int ret;

    /* Maintain the number of in-flight requests. */
    qemu_mutex_lock(&tdx->lock);
    tdx->num--;
    qemu_mutex_unlock(&tdx->lock);

    if (task->status_code == TDX_VP_GET_QUOTE_SUCCESS) {
        /* Copy the received quote into the buffer right after the header. */
        ret = address_space_write(&address_space_memory, task->payload_gpa,
                                  MEMTXATTRS_UNSPECIFIED, task->receive_buf,
                                  task->receive_buf_received);
        if (ret != MEMTX_OK) {
            error_report("TDX: get-quote: failed to write quote data.");
        } else {
            /* out_len stays 0 unless the payload was written successfully. */
            task->hdr.out_len = cpu_to_le64(task->receive_buf_received);
        }
    }
    task->hdr.error_code = cpu_to_le64(task->status_code);

    /* Publish the response contents before marking this request completed. */
    smp_wmb();
    ret = address_space_write(&address_space_memory, task->buf_gpa,
                              MEMTXATTRS_UNSPECIFIED, &task->hdr,
                              TDX_GET_QUOTE_HDR_SIZE);
    if (ret != MEMTX_OK) {
        error_report("TDX: get-quote: failed to update GetQuote header.");
    }

    tdx_inject_interrupt(tdx);

    /* Free per-request buffers and drop the ref taken in the submit path. */
    g_free(task->send_data);
    g_free(task->receive_buf);
    g_free(task);
    object_unref(tdx);
}
1194 
/*
 * Handle the GetQuote TDVMCALL exit: validate the guest-supplied shared
 * buffer, read the report data out of it, and hand the request off to
 * the configured Quote Generation Service socket.  The quote itself is
 * delivered asynchronously via tdx_get_quote_completion();
 * run->tdx.get_quote.ret only reports whether the request was accepted.
 */
void tdx_handle_get_quote(X86CPU *cpu, struct kvm_run *run)
{
    TdxGenerateQuoteTask *task;
    struct tdx_get_quote_header hdr;
    hwaddr buf_gpa = run->tdx.get_quote.gpa;
    uint64_t buf_len = run->tdx.get_quote.size;

    QEMU_BUILD_BUG_ON(sizeof(struct tdx_get_quote_header) != TDX_GET_QUOTE_HDR_SIZE);

    /* Default verdict unless we explicitly accept the request below. */
    run->tdx.get_quote.ret = TDG_VP_VMCALL_INVALID_OPERAND;

    if (buf_len == 0) {
        return;
    }

    /* Both the buffer GPA and its size must be 4K aligned. */
    if (!QEMU_IS_ALIGNED(buf_gpa, 4096) || !QEMU_IS_ALIGNED(buf_len, 4096)) {
        run->tdx.get_quote.ret = TDG_VP_VMCALL_ALIGN_ERROR;
        return;
    }

    if (address_space_read(&address_space_memory, buf_gpa, MEMTXATTRS_UNSPECIFIED,
                           &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
        error_report("TDX: get-quote: failed to read GetQuote header.");
        return;
    }

    if (le64_to_cpu(hdr.structure_version) != TDX_GET_QUOTE_STRUCTURE_VERSION) {
        return;
    }

    /* Only safe-guard check to avoid too large buffer size. */
    if (buf_len > TDX_GET_QUOTE_MAX_BUF_LEN ||
        le32_to_cpu(hdr.in_len) > buf_len - TDX_GET_QUOTE_HDR_SIZE) {
        return;
    }

    if (!tdx_guest->qg_sock_addr) {
        /* No QGS configured: tell the guest the service is unavailable. */
        hdr.error_code = cpu_to_le64(TDX_VP_GET_QUOTE_QGS_UNAVAILABLE);
        if (address_space_write(&address_space_memory, buf_gpa,
                                MEMTXATTRS_UNSPECIFIED,
                                &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
            error_report("TDX: failed to update GetQuote header.");
            return;
        }
        run->tdx.get_quote.ret = TDG_VP_VMCALL_SUCCESS;
        return;
    }

    /* Bound the number of concurrent in-flight requests. */
    qemu_mutex_lock(&tdx_guest->lock);
    if (tdx_guest->num >= TDX_MAX_GET_QUOTE_REQUEST) {
        qemu_mutex_unlock(&tdx_guest->lock);
        run->tdx.get_quote.ret = TDG_VP_VMCALL_RETRY;
        return;
    }
    tdx_guest->num++;
    qemu_mutex_unlock(&tdx_guest->lock);

    task = g_new(TdxGenerateQuoteTask, 1);
    task->buf_gpa = buf_gpa;
    task->payload_gpa = buf_gpa + TDX_GET_QUOTE_HDR_SIZE;
    task->payload_len = buf_len - TDX_GET_QUOTE_HDR_SIZE;
    task->hdr = hdr;
    task->completion = tdx_get_quote_completion;

    task->send_data_size = le32_to_cpu(hdr.in_len);
    task->send_data = g_malloc(task->send_data_size);
    task->send_data_sent = 0;

    if (address_space_read(&address_space_memory, task->payload_gpa,
                           MEMTXATTRS_UNSPECIFIED, task->send_data,
                           task->send_data_size) != MEMTX_OK) {
        goto out_free;
    }

    /* Mark the buffer in-flight. */
    hdr.error_code = cpu_to_le64(TDX_VP_GET_QUOTE_IN_FLIGHT);
    if (address_space_write(&address_space_memory, buf_gpa,
                            MEMTXATTRS_UNSPECIFIED,
                            &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
        goto out_free;
    }

    task->receive_buf = g_malloc0(task->payload_len);
    task->receive_buf_received = 0;
    task->opaque = tdx_guest;

    /* Reference released in tdx_get_quote_completion(). */
    object_ref(tdx_guest);
    tdx_generate_quote(task, tdx_guest->qg_sock_addr);
    run->tdx.get_quote.ret = TDG_VP_VMCALL_SUCCESS;
    return;

out_free:
    g_free(task->send_data);
    g_free(task);
    /*
     * Fix: the request never reached the quote generator, so its
     * completion callback will not run to decrement tdx_guest->num.
     * Undo the accounting here, otherwise repeated failures leak the
     * counter until every request is stuck at TDG_VP_VMCALL_RETRY.
     */
    qemu_mutex_lock(&tdx_guest->lock);
    tdx_guest->num--;
    qemu_mutex_unlock(&tdx_guest->lock);
}
1290 
1291 #define SUPPORTED_TDVMCALLINFO_1_R11    (TDG_VP_VMCALL_SUBFUNC_SET_EVENT_NOTIFY_INTERRUPT)
1292 #define SUPPORTED_TDVMCALLINFO_1_R12    (0)
1293 
/*
 * Handle the GetTdVmCallInfo TDVMCALL exit.  Only leaf 1 is recognized;
 * for it, report the sub-functions supported by the kernel plus the
 * intersection of the user-space-handled set with what QEMU implements.
 */
void tdx_handle_get_tdvmcall_info(X86CPU *cpu, struct kvm_run *run)
{
    uint64_t r11, r12;

    if (run->tdx.get_tdvmcall_info.leaf != 1) {
        return;
    }

    r11 = (tdx_caps->user_tdvmcallinfo_1_r11 & SUPPORTED_TDVMCALLINFO_1_R11) |
          tdx_caps->kernel_tdvmcallinfo_1_r11;
    r12 = (tdx_caps->user_tdvmcallinfo_1_r12 & SUPPORTED_TDVMCALLINFO_1_R12) |
          tdx_caps->kernel_tdvmcallinfo_1_r12;

    run->tdx.get_tdvmcall_info.r11 = r11;
    run->tdx.get_tdvmcall_info.r12 = r12;
    run->tdx.get_tdvmcall_info.r13 = 0;
    run->tdx.get_tdvmcall_info.r14 = 0;
    run->tdx.get_tdvmcall_info.ret = TDG_VP_VMCALL_SUCCESS;
}
1311 
/*
 * Handle the SetupEventNotifyInterrupt TDVMCALL exit: record the vector
 * and the calling vCPU's APIC ID as the notification target for later
 * tdx_inject_interrupt() calls.  Only vectors 32..255 are accepted.
 */
void tdx_handle_setup_event_notify_interrupt(X86CPU *cpu, struct kvm_run *run)
{
    uint64_t vector = run->tdx.setup_event_notify.vector;

    if (vector < 32 || vector >= 256) {
        run->tdx.setup_event_notify.ret = TDG_VP_VMCALL_INVALID_OPERAND;
        return;
    }

    qemu_mutex_lock(&tdx_guest->lock);
    tdx_guest->event_notify_vector = vector;
    tdx_guest->event_notify_apicid = cpu->apic_id;
    qemu_mutex_unlock(&tdx_guest->lock);
    run->tdx.setup_event_notify.ret = TDG_VP_VMCALL_SUCCESS;
}
1326 
/*
 * Report a TDX guest fatal error as a guest panic.  Ownership of
 * 'message' moves into the panic information structure; 'gpa' is only
 * meaningful when has_gpa is true.
 */
static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code,
                                        char *message, bool has_gpa,
                                        uint64_t gpa)
{
    GuestPanicInformation *info = g_new0(GuestPanicInformation, 1);

    info->type = GUEST_PANIC_INFORMATION_TYPE_TDX;
    info->u.tdx.error_code = (uint32_t)error_code;
    info->u.tdx.message = message;
    info->u.tdx.has_gpa = has_gpa;
    info->u.tdx.gpa = gpa;

    qemu_system_guest_panicked(info);
}
1342 
1343 /*
1344  * Only 8 registers can contain valid ASCII byte stream to form the fatal
1345  * message, and their sequence is: R14, R15, RBX, RDI, RSI, R8, R9, RDX
1346  */
1347 #define TDX_FATAL_MESSAGE_MAX        64
1348 
1349 #define TDX_REPORT_FATAL_ERROR_GPA_VALID    BIT_ULL(63)
1350 
/*
 * Handle the ReportFatalError system event: reconstruct the guest's
 * optional ASCII message from the general-purpose registers listed in
 * reg_mask, extract the optional faulting GPA, and report a guest
 * panic.  Always returns -1 (the error is fatal by definition).
 */
int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run)
{
    uint64_t error_code = run->system_event.data[R_R12];
    uint64_t reg_mask = run->system_event.data[R_ECX];
    char *message = NULL;
    uint64_t *tmp;
    uint64_t gpa = -1ull;
    bool has_gpa = false;

    /* The low 16 bits of the error code must be zero. */
    if (error_code & 0xffff) {
        error_report("TDX: REPORT_FATAL_ERROR: invalid error code: 0x%"PRIx64,
                     error_code);
        return -1;
    }

    if (reg_mask) {
        /* 8 registers x 8 bytes = 64 message bytes max, +1 for the NUL. */
        message = g_malloc0(TDX_FATAL_MESSAGE_MAX + 1);
        tmp = (uint64_t *)message;

/* Append REG's 8 raw bytes to the message iff its bit is set in reg_mask. */
#define COPY_REG(REG)                               \
    do {                                            \
        if (reg_mask & BIT_ULL(REG)) {              \
            *(tmp++) = run->system_event.data[REG]; \
        }                                           \
    } while (0)

        /* Fixed register order per the comment above TDX_FATAL_MESSAGE_MAX. */
        COPY_REG(R_R14);
        COPY_REG(R_R15);
        COPY_REG(R_EBX);
        COPY_REG(R_EDI);
        COPY_REG(R_ESI);
        COPY_REG(R_R8);
        COPY_REG(R_R9);
        COPY_REG(R_EDX);
        *((char *)tmp) = '\0';
    }
#undef COPY_REG

    if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) {
        gpa = run->system_event.data[R_R13];
        has_gpa = true;
    }

    /*
     * Ownership of 'message' transfers to the panic-info structure —
     * NOTE(review): presumably freed by the QAPI layer; confirm.
     */
    tdx_panicked_on_fatal_error(cpu, error_code, message, has_gpa, gpa);

    return -1;
}
1398 
tdx_guest_get_sept_ve_disable(Object * obj,Error ** errp)1399 static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp)
1400 {
1401     TdxGuest *tdx = TDX_GUEST(obj);
1402 
1403     return !!(tdx->attributes & TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE);
1404 }
1405 
tdx_guest_set_sept_ve_disable(Object * obj,bool value,Error ** errp)1406 static void tdx_guest_set_sept_ve_disable(Object *obj, bool value, Error **errp)
1407 {
1408     TdxGuest *tdx = TDX_GUEST(obj);
1409 
1410     if (value) {
1411         tdx->attributes |= TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
1412     } else {
1413         tdx->attributes &= ~TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
1414     }
1415 }
1416 
tdx_guest_get_mrconfigid(Object * obj,Error ** errp)1417 static char *tdx_guest_get_mrconfigid(Object *obj, Error **errp)
1418 {
1419     TdxGuest *tdx = TDX_GUEST(obj);
1420 
1421     return g_strdup(tdx->mrconfigid);
1422 }
1423 
tdx_guest_set_mrconfigid(Object * obj,const char * value,Error ** errp)1424 static void tdx_guest_set_mrconfigid(Object *obj, const char *value, Error **errp)
1425 {
1426     TdxGuest *tdx = TDX_GUEST(obj);
1427 
1428     g_free(tdx->mrconfigid);
1429     tdx->mrconfigid = g_strdup(value);
1430 }
1431 
tdx_guest_get_mrowner(Object * obj,Error ** errp)1432 static char *tdx_guest_get_mrowner(Object *obj, Error **errp)
1433 {
1434     TdxGuest *tdx = TDX_GUEST(obj);
1435 
1436     return g_strdup(tdx->mrowner);
1437 }
1438 
tdx_guest_set_mrowner(Object * obj,const char * value,Error ** errp)1439 static void tdx_guest_set_mrowner(Object *obj, const char *value, Error **errp)
1440 {
1441     TdxGuest *tdx = TDX_GUEST(obj);
1442 
1443     g_free(tdx->mrowner);
1444     tdx->mrowner = g_strdup(value);
1445 }
1446 
tdx_guest_get_mrownerconfig(Object * obj,Error ** errp)1447 static char *tdx_guest_get_mrownerconfig(Object *obj, Error **errp)
1448 {
1449     TdxGuest *tdx = TDX_GUEST(obj);
1450 
1451     return g_strdup(tdx->mrownerconfig);
1452 }
1453 
tdx_guest_set_mrownerconfig(Object * obj,const char * value,Error ** errp)1454 static void tdx_guest_set_mrownerconfig(Object *obj, const char *value, Error **errp)
1455 {
1456     TdxGuest *tdx = TDX_GUEST(obj);
1457 
1458     g_free(tdx->mrownerconfig);
1459     tdx->mrownerconfig = g_strdup(value);
1460 }
1461 
/*
 * QOM getter for "quote-generation-socket": visit the stored
 * SocketAddress, or raise an error when none has been configured.
 */
static void tdx_guest_get_qgs(Object *obj, Visitor *v,
                              const char *name, void *opaque,
                              Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    if (tdx->qg_sock_addr) {
        visit_type_SocketAddress(v, name, &tdx->qg_sock_addr, errp);
    } else {
        error_setg(errp, "quote-generation-socket is not set");
    }
}
1474 
/*
 * QOM setter for "quote-generation-socket": parse a SocketAddress and
 * replace any previously configured one.
 */
static void tdx_guest_set_qgs(Object *obj, Visitor *v,
                              const char *name, void *opaque,
                              Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);
    SocketAddress *sock = NULL;

    if (!visit_type_SocketAddress(v, name, &sock, errp)) {
        return;
    }

    /* qapi_free_SocketAddress() is a no-op on NULL, so no guard needed. */
    qapi_free_SocketAddress(tdx->qg_sock_addr);

    tdx->qg_sock_addr = sock;
}
1492 
/*
 * QOM boilerplate for the "tdx-guest" object: an x86 confidential-guest
 * backend, user-creatable via -object tdx-guest.
 */
OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest,
                                   tdx_guest,
                                   TDX_GUEST,
                                   X86_CONFIDENTIAL_GUEST,
                                   { TYPE_USER_CREATABLE },
                                   { NULL })
1500 
/*
 * Instance initializer for tdx-guest: set defaults and register the
 * object's QOM properties.
 */
static void tdx_guest_init(Object *obj)
{
    ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
    TdxGuest *tdx = TDX_GUEST(obj);

    qemu_mutex_init(&tdx->lock);

    /* TDX guest RAM must be backed by guest_memfd. */
    cgs->require_guest_memfd = true;
    /* SEPT #VE disable defaults on; adjustable via the properties below. */
    tdx->attributes = TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;

    object_property_add_uint64_ptr(obj, "attributes", &tdx->attributes,
                                   OBJ_PROP_FLAG_READWRITE);
    object_property_add_bool(obj, "sept-ve-disable",
                             tdx_guest_get_sept_ve_disable,
                             tdx_guest_set_sept_ve_disable);
    object_property_add_str(obj, "mrconfigid",
                            tdx_guest_get_mrconfigid,
                            tdx_guest_set_mrconfigid);
    object_property_add_str(obj, "mrowner",
                            tdx_guest_get_mrowner, tdx_guest_set_mrowner);
    object_property_add_str(obj, "mrownerconfig",
                            tdx_guest_get_mrownerconfig,
                            tdx_guest_set_mrownerconfig);

    object_property_add(obj, "quote-generation-socket", "SocketAddress",
                            tdx_guest_get_qgs,
                            tdx_guest_set_qgs,
                            NULL, NULL);

    /* -1 (outside the valid 32..255 vector range) means "not configured". */
    tdx->event_notify_vector = -1;
    tdx->event_notify_apicid = -1;
}
1533 
static void tdx_guest_finalize(Object *obj)
{
    /* Intentionally empty: no instance-owned state is released here. */
}
1537 
/*
 * Class initializer: wire the TDX implementations into the
 * confidential-guest-support and x86-confidential-guest vtables.
 */
static void tdx_guest_class_init(ObjectClass *oc, const void *data)
{
    ConfidentialGuestSupportClass *cgs_class = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
    X86ConfidentialGuestClass *x86_class = X86_CONFIDENTIAL_GUEST_CLASS(oc);

    cgs_class->kvm_init = tdx_kvm_init;
    x86_class->kvm_type = tdx_kvm_type;
    x86_class->cpu_instance_init = tdx_cpu_instance_init;
    x86_class->adjust_cpuid_features = tdx_adjust_cpuid_features;
    x86_class->check_features = tdx_check_features;
}
1549