xref: /openbmc/qemu/hw/ppc/spapr.c (revision e60467febb406b70b0a4e8cd05c4172a6c5d99ed)
1 /*
2  * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
3  *
4  * Copyright (c) 2004-2007 Fabrice Bellard
5  * Copyright (c) 2007 Jocelyn Mayer
6  * Copyright (c) 2010 David Gibson, IBM Corporation.
7  * Copyright (c) 2010-2024, IBM Corporation.
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
27  * THE SOFTWARE.
28  */
29 
30 #include "qemu/osdep.h"
31 #include "qemu/datadir.h"
32 #include "qemu/memalign.h"
33 #include "qemu/guest-random.h"
34 #include "qapi/error.h"
35 #include "qapi/qapi-events-machine.h"
36 #include "qapi/qapi-events-qdev.h"
37 #include "qapi/visitor.h"
38 #include "system/system.h"
39 #include "system/hostmem.h"
40 #include "system/numa.h"
41 #include "system/tcg.h"
42 #include "system/qtest.h"
43 #include "system/reset.h"
44 #include "system/runstate.h"
45 #include "qemu/log.h"
46 #include "hw/fw-path-provider.h"
47 #include "elf.h"
48 #include "net/net.h"
49 #include "system/device_tree.h"
50 #include "system/cpus.h"
51 #include "system/hw_accel.h"
52 #include "kvm_ppc.h"
53 #include "migration/misc.h"
54 #include "migration/qemu-file-types.h"
55 #include "migration/global_state.h"
56 #include "migration/register.h"
57 #include "migration/blocker.h"
58 #include "mmu-hash64.h"
59 #include "mmu-book3s-v3.h"
60 #include "cpu-models.h"
61 #include "hw/core/cpu.h"
62 
63 #include "hw/ppc/ppc.h"
64 #include "hw/loader.h"
65 
66 #include "hw/ppc/fdt.h"
67 #include "hw/ppc/spapr.h"
68 #include "hw/ppc/spapr_nested.h"
69 #include "hw/ppc/spapr_vio.h"
70 #include "hw/ppc/vof.h"
71 #include "hw/qdev-properties.h"
72 #include "hw/pci-host/spapr.h"
73 #include "hw/pci/msi.h"
74 
75 #include "hw/pci/pci.h"
76 #include "hw/scsi/scsi.h"
77 #include "hw/virtio/virtio-scsi.h"
78 #include "hw/virtio/vhost-scsi-common.h"
79 
80 #include "system/ram_addr.h"
81 #include "system/confidential-guest-support.h"
82 #include "hw/usb.h"
83 #include "qemu/config-file.h"
84 #include "qemu/error-report.h"
85 #include "trace.h"
86 #include "hw/nmi.h"
87 #include "hw/intc/intc.h"
88 
89 #include "hw/ppc/spapr_cpu_core.h"
90 #include "hw/mem/memory-device.h"
91 #include "hw/ppc/spapr_tpm_proxy.h"
92 #include "hw/ppc/spapr_nvdimm.h"
93 #include "hw/ppc/spapr_numa.h"
94 
95 #include <libfdt.h>
96 
97 /* SLOF memory layout:
98  *
99  * The SLOF raw image is loaded at 0; it copies its romfs right below
100  * the flat device-tree, then positions SLOF itself 31M below that.
101  *
102  * So we set FW_OVERHEAD to 40MB, which should account for all of that
103  * and more.
104  *
105  * We load our kernel at 4M, leaving space for the SLOF initial image.
106  */
107 #define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
108 #define FW_MAX_SIZE             0x400000
109 #define FW_FILE_NAME            "slof.bin"
110 #define FW_FILE_NAME_VOF        "vof.bin"
111 #define FW_OVERHEAD             0x2800000
112 #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
113 
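/*
 * Illustrative sketch of the map implied by the constants above (an
 * example, not a normative layout; exact placement depends on RAM and
 * FDT size):
 *
 *   0x0000000   SLOF raw image (up to FW_MAX_SIZE = 4 MiB)
 *   0x0400000   kernel (KERNEL_LOAD_ADDR = FW_MAX_SIZE)
 *   ...         SLOF copy + romfs, all within FW_OVERHEAD = 40 MiB
 *   < 2 GiB     flattened device tree (below FDT_MAX_ADDR)
 */
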
114 #define MIN_RMA_SLOF            (128 * MiB)
115 
116 #define PHANDLE_INTC            0x00001111
117 
118 /* These two functions implement the VCPU id numbering: one to compute them
119  * all and one to identify thread 0 of a VCORE. Any change to the first one
120  * is likely to have an impact on the second one, so let's keep them close.
121  */
122 static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
123 {
124     MachineState *ms = MACHINE(spapr);
125     unsigned int smp_threads = ms->smp.threads;
126 
127     assert(spapr->vsmt);
128     return
129         (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
130 }
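
/*
 * Worked example (illustrative): with ms->smp.threads = 4 and
 * spapr->vsmt = 8, each virtual core occupies a stride of vsmt ids:
 *   cpu_index 0..3 -> vcpu ids 0..3  (vcore 0)
 *   cpu_index 4..7 -> vcpu ids 8..11 (vcore 1)
 * e.g. cpu_index 5 -> (5 / 4) * 8 + 5 % 4 = 9.
 */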
131 static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
132                                       PowerPCCPU *cpu)
133 {
134     assert(spapr->vsmt);
135     return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
136 }
137 
138 int spapr_max_server_number(SpaprMachineState *spapr)
139 {
140     MachineState *ms = MACHINE(spapr);
141 
142     assert(spapr->vsmt);
143     return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads);
144 }
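
/*
 * E.g. (illustrative): max_cpus = 16, threads = 4, vsmt = 8 gives
 * DIV_ROUND_UP(16 * 8, 4) = 32, i.e. the highest interrupt server
 * number the interrupt controller has to support.
 */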
145 
146 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
147                                   int smt_threads)
148 {
149     int i, ret = 0;
150     g_autofree uint32_t *servers_prop = g_new(uint32_t, smt_threads);
151     g_autofree uint32_t *gservers_prop = g_new(uint32_t, smt_threads * 2);
152     int index = spapr_get_vcpu_id(cpu);
153 
154     if (cpu->compat_pvr) {
155         ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
156         if (ret < 0) {
157             return ret;
158         }
159     }
160 
161     /* Build interrupt servers and gservers properties */
162     for (i = 0; i < smt_threads; i++) {
163         servers_prop[i] = cpu_to_be32(index + i);
164         /* Hack, direct the group queues back to cpu 0 */
165         gservers_prop[i*2] = cpu_to_be32(index + i);
166         gservers_prop[i*2 + 1] = 0;
167     }
168     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
169                       servers_prop, sizeof(*servers_prop) * smt_threads);
170     if (ret < 0) {
171         return ret;
172     }
173     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
174                       gservers_prop, sizeof(*gservers_prop) * smt_threads * 2);
175 
176     return ret;
177 }
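
/*
 * Example output (illustrative): for a vcpu with index 8 and
 * smt_threads = 2 the properties end up as big-endian cell lists:
 *   ibm,ppc-interrupt-server#s  = <8 9>
 *   ibm,ppc-interrupt-gserver#s = <8 0 9 0>
 * i.e. each (server, gserver) pair points the group queue at 0, per
 * the "direct the group queues back to cpu 0" hack above.
 */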
178 
179 static void spapr_dt_pa_features(SpaprMachineState *spapr,
180                                  PowerPCCPU *cpu,
181                                  void *fdt, int offset)
182 {
183     /*
184      * SSO (SAO) ordering is supported on KVM and thread=single hosts,
185      * but not MTTCG, so disable it. To advertise it, a cap would have
186      * to be added, or support implemented for MTTCG.
187      *
188      * Copy/paste is not supported by TCG, so it is not advertised. KVM
189      * can execute them but it has no accelerator drivers which are usable,
190      * so there isn't much need for it anyway.
191      */
192 
193     /* These should be kept in sync with pnv */
194     uint8_t pa_features_206[] = { 6, 0,
195         0xf6, 0x1f, 0xc7, 0x00, 0x00, 0xc0 };
196     uint8_t pa_features_207[] = { 24, 0,
197         0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0,
198         0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
199         0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
200         0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
201     uint8_t pa_features_300[] = { 66, 0,
202         /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
203         /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, 5: LE|CFAR|EB|LSQ */
204         0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0, /* 0 - 5 */
205         /* 6: DS207 */
206         0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
207         /* 16: Vector */
208         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
209         /* 18: Vec. Scalar, 20: Vec. XOR */
210         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
211         /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
212         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
213         /* 32: LE atomic, 34: EBB + ext EBB */
214         0x00, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
215         /* 40: Radix MMU */
216         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 36 - 41 */
217         /* 42: PM, 44: PC RA, 46: SC vec'd */
218         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
219         /* 48: SIMD, 50: QP BFP, 52: String */
220         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
221         /* 54: DecFP, 56: DecI, 58: SHA */
222         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
223         /* 60: NM atomic, 62: RNG */
224         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
225     };
226     /* 3.1 removes SAO, HTM support */
227     uint8_t pa_features_31[] = { 74, 0,
228         /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
229         /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, 5: LE|CFAR|EB|LSQ */
230         0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0, /* 0 - 5 */
231         /* 6: DS207 */
232         0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
233         /* 16: Vector */
234         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
235         /* 18: Vec. Scalar, 20: Vec. XOR */
236         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
237         /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
238         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
239         /* 32: LE atomic, 34: EBB + ext EBB */
240         0x00, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
241         /* 40: Radix MMU */
242         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 36 - 41 */
243         /* 42: PM, 44: PC RA, 46: SC vec'd */
244         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
245         /* 48: SIMD, 50: QP BFP, 52: String */
246         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
247         /* 54: DecFP, 56: DecI, 58: SHA */
248         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
249         /* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */
250         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
251         /* 68: DEXCR[SBHE|IBRTPDUS|SRAPD|NPHIE|PHIE] */
252         0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 66 - 71 */
253         /* 72: [P]HASHST/[P]HASHCHK */
254         0x80, 0x00,                         /* 72 - 73 */
255     };
256     uint8_t *pa_features = NULL;
257     size_t pa_size;
258 
259     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
260         pa_features = pa_features_206;
261         pa_size = sizeof(pa_features_206);
262     }
263     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
264         pa_features = pa_features_207;
265         pa_size = sizeof(pa_features_207);
266     }
267     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
268         pa_features = pa_features_300;
269         pa_size = sizeof(pa_features_300);
270     }
271     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_10, 0, cpu->compat_pvr)) {
272         pa_features = pa_features_31;
273         pa_size = sizeof(pa_features_31);
274     }
275     if (!pa_features) {
276         return;
277     }
278 
279     if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
280         /*
281          * Note: we keep CI large pages off by default because a 64K capable
282          * guest provisioned with large pages might otherwise try to map a qemu
283          * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
284          * even if that qemu runs on a 4k host.
285          * We add this bit back here if we are confident this is not an issue.
286          */
287         pa_features[3] |= 0x20;
288     }
289     if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
290         pa_features[24] |= 0x80;    /* Transactional memory support */
291     }
292     if (spapr->cas_pre_isa3_guest && pa_size > 40) {
293         /* Workaround for broken kernels that attempt (guest) radix
294          * mode when they can't handle it, if they see the radix bit set
295          * in pa-features. So hide it from them. */
296         pa_features[40 + 2] &= ~0x80; /* Radix MMU */
297     }
298     if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) {
299         g_assert(pa_size > 66);
300         pa_features[66] |= 0x80;
301     }
302 
303     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
304 }
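
/*
 * Indexing note (illustrative): the arrays above begin with a two-byte
 * { size, format } header, so "pa-features byte N" in the inline
 * comments lives at array index N + 2. That is why the radix bit
 * (byte 40, bit 0) is cleared via pa_features[40 + 2] and the DAWR1
 * bit (byte 64) is set via pa_features[66].
 */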
305 
306 static void spapr_dt_pi_features(SpaprMachineState *spapr,
307                                  PowerPCCPU *cpu,
308                                  void *fdt, int offset)
309 {
310     uint8_t pi_features[] = { 1, 0,
311         0x00 };
312 
313     if (kvm_enabled() && ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00,
314                                           0, cpu->compat_pvr)) {
315         /*
316          * POWER9 and later CPUs with KVM run in LPAR-per-thread mode where
317          * all threads are essentially independent CPUs, and msgsndp does not
318          * work (because it is physically-addressed) and therefore is
319          * emulated by KVM, so disable it here to ensure XIVE will be used.
320          * This is both KVM and CPU implementation-specific behaviour so a KVM
321          * cap would be cleanest, but for now this works. If KVM ever permits
322          * native msgsndp execution by guests, a cap could be added at that
323          * time.
324          */
325         pi_features[2] |= 0x08; /* 4: No msgsndp */
326     }
327 
328     _FDT((fdt_setprop(fdt, offset, "ibm,pi-features", pi_features,
329                       sizeof(pi_features))));
330 }
331 
332 static hwaddr spapr_node0_size(MachineState *machine)
333 {
334     if (machine->numa_state->num_nodes) {
335         int i;
336         for (i = 0; i < machine->numa_state->num_nodes; ++i) {
337             if (machine->numa_state->nodes[i].node_mem) {
338                 return MIN(pow2floor(machine->numa_state->nodes[i].node_mem),
339                            machine->ram_size);
340             }
341         }
342     }
343     return machine->ram_size;
344 }
345 
346 static void add_str(GString *s, const gchar *s1)
347 {
348     g_string_append_len(s, s1, strlen(s1) + 1);
349 }
350 
351 static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid,
352                                 hwaddr start, hwaddr size)
353 {
354     char mem_name[32];
355     uint64_t mem_reg_property[2];
356     int off;
357 
358     mem_reg_property[0] = cpu_to_be64(start);
359     mem_reg_property[1] = cpu_to_be64(size);
360 
361     sprintf(mem_name, "memory@%" HWADDR_PRIx, start);
362     off = fdt_add_subnode(fdt, 0, mem_name);
363     _FDT(off);
364     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
365     _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
366                       sizeof(mem_reg_property))));
367     spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid);
368     return off;
369 }
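
/*
 * Resulting node (illustrative): start = 0x80000000, size = 0x40000000
 * for nodeid 1 produces roughly:
 *   memory@80000000 {
 *       device_type = "memory";
 *       reg = <0x0 0x80000000 0x0 0x40000000>;
 *       (associativity properties for node 1)
 *   };
 */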
370 
371 static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
372 {
373     MemoryDeviceInfoList *info;
374 
375     for (info = list; info; info = info->next) {
376         MemoryDeviceInfo *value = info->value;
377 
378         if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
379             PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;
380 
381             if (addr >= pcdimm_info->addr &&
382                 addr < (pcdimm_info->addr + pcdimm_info->size)) {
383                 return pcdimm_info->node;
384             }
385         }
386     }
387 
388     return -1;
389 }
390 
391 struct sPAPRDrconfCellV2 {
392      uint32_t seq_lmbs;
393      uint64_t base_addr;
394      uint32_t drc_index;
395      uint32_t aa_index;
396      uint32_t flags;
397 } QEMU_PACKED;
398 
399 typedef struct DrconfCellQueue {
400     struct sPAPRDrconfCellV2 cell;
401     QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
402 } DrconfCellQueue;
403 
404 static DrconfCellQueue *
405 spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
406                       uint32_t drc_index, uint32_t aa_index,
407                       uint32_t flags)
408 {
409     DrconfCellQueue *elem;
410 
411     elem = g_malloc0(sizeof(*elem));
412     elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
413     elem->cell.base_addr = cpu_to_be64(base_addr);
414     elem->cell.drc_index = cpu_to_be32(drc_index);
415     elem->cell.aa_index = cpu_to_be32(aa_index);
416     elem->cell.flags = cpu_to_be32(flags);
417 
418     return elem;
419 }
420 
421 static int spapr_dt_dynamic_memory_v2(SpaprMachineState *spapr, void *fdt,
422                                       int offset, MemoryDeviceInfoList *dimms)
423 {
424     MachineState *machine = MACHINE(spapr);
425     uint8_t *int_buf, *cur_index;
426     int ret;
427     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
428     uint64_t addr, cur_addr, size;
429     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
430     uint64_t mem_end = machine->device_memory->base +
431                        memory_region_size(&machine->device_memory->mr);
432     uint32_t node, buf_len, nr_entries = 0;
433     SpaprDrc *drc;
434     DrconfCellQueue *elem, *next;
435     MemoryDeviceInfoList *info;
436     QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
437         = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
438 
439     /* Entry to cover RAM and the gap area */
440     elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
441                                  SPAPR_LMB_FLAGS_RESERVED |
442                                  SPAPR_LMB_FLAGS_DRC_INVALID);
443     QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
444     nr_entries++;
445 
446     cur_addr = machine->device_memory->base;
447     for (info = dimms; info; info = info->next) {
448         PCDIMMDeviceInfo *di = info->value->u.dimm.data;
449 
450         addr = di->addr;
451         size = di->size;
452         node = di->node;
453 
454         /*
455          * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The
456          * area is marked hotpluggable in the next iteration for the bigger
457          * chunk including the NVDIMM occupied area.
458          */
459         if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
460             continue;
        }
461 
462         /* Entry for hot-pluggable area */
463         if (cur_addr < addr) {
464             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
465             g_assert(drc);
466             elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
467                                          cur_addr, spapr_drc_index(drc), -1, 0);
468             QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
469             nr_entries++;
470         }
471 
472         /* Entry for DIMM */
473         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
474         g_assert(drc);
475         elem = spapr_get_drconf_cell(size / lmb_size, addr,
476                                      spapr_drc_index(drc), node,
477                                      (SPAPR_LMB_FLAGS_ASSIGNED |
478                                       SPAPR_LMB_FLAGS_HOTREMOVABLE));
479         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
480         nr_entries++;
481         cur_addr = addr + size;
482     }
483 
484     /* Entry for remaining hotpluggable area */
485     if (cur_addr < mem_end) {
486         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
487         g_assert(drc);
488         elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
489                                      cur_addr, spapr_drc_index(drc), -1, 0);
490         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
491         nr_entries++;
492     }
493 
494     buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
495     int_buf = cur_index = g_malloc0(buf_len);
496     *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
497     cur_index += sizeof(nr_entries);
498 
499     QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
500         memcpy(cur_index, &elem->cell, sizeof(elem->cell));
501         cur_index += sizeof(elem->cell);
502         QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
503         g_free(elem);
504     }
505 
506     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
507     g_free(int_buf);
508     if (ret < 0) {
509         return -1;
510     }
511     return 0;
512 }
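
/*
 * Shape of the property (illustrative): with 2 GiB of boot RAM
 * (8 LMBs of SPAPR_MEMORY_BLOCK_SIZE = 256 MiB) and one 1 GiB DIMM at
 * the start of device memory, the serialized list would be a be32
 * entry count (3) followed by three sPAPRDrconfCellV2 cells:
 *   { 8 LMBs @ 0,         drc 0,    aa -1,  RESERVED|DRC_INVALID }
 *   { 4 LMBs @ dimm base, dimm drc, node,   ASSIGNED|HOTREMOVABLE }
 *   { rest of the hot-plug region,  aa -1,  flags 0 }
 */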
513 
514 static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt,
515                                    int offset, MemoryDeviceInfoList *dimms)
516 {
517     MachineState *machine = MACHINE(spapr);
518     int i, ret;
519     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
520     uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
521     uint32_t nr_lmbs = (machine->device_memory->base +
522                        memory_region_size(&machine->device_memory->mr)) /
523                        lmb_size;
524     uint32_t *int_buf, *cur_index, buf_len;
525 
526     /*
527      * Allocate enough buffer size to fit in ibm,dynamic-memory
528      */
529     buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
530     cur_index = int_buf = g_malloc0(buf_len);
531     int_buf[0] = cpu_to_be32(nr_lmbs);
532     cur_index++;
533     for (i = 0; i < nr_lmbs; i++) {
534         uint64_t addr = i * lmb_size;
535         uint32_t *dynamic_memory = cur_index;
536 
537         if (i >= device_lmb_start) {
538             SpaprDrc *drc;
539 
540             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
541             g_assert(drc);
542 
543             dynamic_memory[0] = cpu_to_be32(addr >> 32);
544             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
545             dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
546             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
547             dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
548             if (memory_region_present(get_system_memory(), addr)) {
549                 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
550             } else {
551                 dynamic_memory[5] = cpu_to_be32(0);
552             }
553         } else {
554             /*
555              * LMB information for the RMA, boot-time RAM and the gap between
556              * RAM and the device memory region -- all these are marked as
557              * reserved and as having no valid DRC.
558              */
559             dynamic_memory[0] = cpu_to_be32(addr >> 32);
560             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
561             dynamic_memory[2] = cpu_to_be32(0);
562             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
563             dynamic_memory[4] = cpu_to_be32(-1);
564             dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
565                                             SPAPR_LMB_FLAGS_DRC_INVALID);
566         }
567 
568         cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
569     }
570     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
571     g_free(int_buf);
572     if (ret < 0) {
573         return -1;
574     }
575     return 0;
576 }
577 
578 /*
579  * Adds ibm,dynamic-reconfiguration-memory node.
580  * Refer to docs/specs/ppc-spapr-hotplug.rst for the documentation
581  * of this device tree node.
582  */
583 static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr,
584                                                    void *fdt)
585 {
586     MachineState *machine = MACHINE(spapr);
587     int ret, offset;
588     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
589     uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32),
590                                 cpu_to_be32(lmb_size & 0xffffffff)};
591     MemoryDeviceInfoList *dimms = NULL;
592 
593     /* Don't create the node if there is no device memory. */
594     if (!machine->device_memory) {
595         return 0;
596     }
597 
598     offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
599 
600     ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
601                     sizeof(prop_lmb_size));
602     if (ret < 0) {
603         return ret;
604     }
605 
606     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
607     if (ret < 0) {
608         return ret;
609     }
610 
611     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
612     if (ret < 0) {
613         return ret;
614     }
615 
616     /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
617     dimms = qmp_memory_device_list();
618     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
619         ret = spapr_dt_dynamic_memory_v2(spapr, fdt, offset, dimms);
620     } else {
621         ret = spapr_dt_dynamic_memory(spapr, fdt, offset, dimms);
622     }
623     qapi_free_MemoryDeviceInfoList(dimms);
624 
625     if (ret < 0) {
626         return ret;
627     }
628 
629     ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset);
630 
631     return ret;
632 }
633 
634 static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt)
635 {
636     MachineState *machine = MACHINE(spapr);
637     hwaddr mem_start, node_size;
638     int i, nb_nodes = machine->numa_state->num_nodes;
639     NodeInfo *nodes = machine->numa_state->nodes;
640 
641     for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
642         if (!nodes[i].node_mem) {
643             continue;
644         }
645         if (mem_start >= machine->ram_size) {
646             node_size = 0;
647         } else {
648             node_size = nodes[i].node_mem;
649             if (node_size > machine->ram_size - mem_start) {
650                 node_size = machine->ram_size - mem_start;
651             }
652         }
653         if (!mem_start) {
654             /* spapr_machine_init() checks for rma_size <= node0_size
655              * already */
656             spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size);
657             mem_start += spapr->rma_size;
658             node_size -= spapr->rma_size;
659         }
660         for ( ; node_size; ) {
661             hwaddr sizetmp = pow2floor(node_size);
662 
663             /* mem_start != 0 here */
664             if (ctzl(mem_start) < ctzl(sizetmp)) {
665                 sizetmp = 1ULL << ctzl(mem_start);
666             }
667 
668             spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp);
669             node_size -= sizetmp;
670             mem_start += sizetmp;
671         }
672     }
673 
674     /* Generate ibm,dynamic-reconfiguration-memory node if required */
675     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
676         int ret;
677 
678         ret = spapr_dt_dynamic_reconfiguration_memory(spapr, fdt);
679         if (ret) {
680             return ret;
681         }
682     }
683 
684     return 0;
685 }
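
/*
 * Worked example of the splitting loop (illustrative): a node with
 * 3 GiB starting at mem_start = 1 GiB is emitted as two nodes. First
 * pass: pow2floor(3G) = 2G, but ctzl(1G) < ctzl(2G), so a 1 GiB node
 * is emitted at 1 GiB; the second pass emits the remaining 2 GiB at
 * 2 GiB. Every node therefore stays size-aligned to its start address.
 */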
686 
687 static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset,
688                          SpaprMachineState *spapr)
689 {
690     MachineState *ms = MACHINE(spapr);
691     PowerPCCPU *cpu = POWERPC_CPU(cs);
692     CPUPPCState *env = &cpu->env;
693     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
694     int index = spapr_get_vcpu_id(cpu);
695     uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
696                        0xffffffff, 0xffffffff};
697     uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
698         : SPAPR_TIMEBASE_FREQ;
699     uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
700     uint32_t page_sizes_prop[64];
701     size_t page_sizes_prop_size;
702     unsigned int smp_threads = ms->smp.threads;
703     uint32_t vcpus_per_socket = smp_threads * ms->smp.cores;
704     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
705     int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
706     SpaprDrc *drc;
707     int drc_index;
708     uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
709     int i;
710 
711     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, env->core_index);
712     if (drc) {
713         drc_index = spapr_drc_index(drc);
714         _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
715     }
716 
717     _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
718     _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
719 
720     _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
721     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
722                            env->dcache_line_size)));
723     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
724                            env->dcache_line_size)));
725     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
726                            env->icache_line_size)));
727     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
728                            env->icache_line_size)));
729 
730     if (pcc->l1_dcache_size) {
731         _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
732                                pcc->l1_dcache_size)));
733     } else {
734         warn_report("Unknown L1 dcache size for cpu");
735     }
736     if (pcc->l1_icache_size) {
737         _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
738                                pcc->l1_icache_size)));
739     } else {
740         warn_report("Unknown L1 icache size for cpu");
741     }
742 
743     _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
744     _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
745     _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
746     _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
747     _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
748     _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
749 
750     if (ppc_has_spr(cpu, SPR_PURR)) {
751         _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
752     }
753     if (ppc_has_spr(cpu, SPR_SPURR)) {
754         _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
755     }
756 
757     if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
758         _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
759                           segs, sizeof(segs))));
760     }
761 
762     /* Advertise VSX (vector extensions) if available
763      *   1               == VMX / Altivec available
764      *   2               == VSX available
765      *
766      * Only CPUs for which we create core types in spapr_cpu_core.c
767      * are possible, and all of those have VMX */
768     if (env->insns_flags & PPC_ALTIVEC) {
769         if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
770             _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
771         } else {
772             _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
773         }
774     }
775 
776     /* Advertise DFP (Decimal Floating Point) if available
777      *   0 / no property == no DFP
778      *   1               == DFP available */
779     if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
780         _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
781     }
782 
783     page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
784                                                       sizeof(page_sizes_prop));
785     if (page_sizes_prop_size) {
786         _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
787                           page_sizes_prop, page_sizes_prop_size)));
788     }
789 
790     spapr_dt_pa_features(spapr, cpu, fdt, offset);
791 
792     spapr_dt_pi_features(spapr, cpu, fdt, offset);
793 
794     _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
795                            cs->cpu_index / vcpus_per_socket)));
796 
797     _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
798                       pft_size_prop, sizeof(pft_size_prop))));
799 
800     if (ms->numa_state->num_nodes > 1) {
801         _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu));
802     }
803 
804     _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));
805 
806     if (pcc->radix_page_info) {
807         for (i = 0; i < pcc->radix_page_info->count; i++) {
808             radix_AP_encodings[i] =
809                 cpu_to_be32(pcc->radix_page_info->entries[i]);
810         }
811         _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
812                           radix_AP_encodings,
813                           pcc->radix_page_info->count *
814                           sizeof(radix_AP_encodings[0]))));
815     }
816 
817     /*
818      * We set this property to let the guest know that it can use the large
819      * decrementer and its width in bits.
820      */
821     if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF) {
822         _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
823                               pcc->lrg_decr_bits)));
    }
824 }
825 
826 static void spapr_dt_one_cpu(void *fdt, SpaprMachineState *spapr, CPUState *cs,
827                              int cpus_offset)
828 {
829     PowerPCCPU *cpu = POWERPC_CPU(cs);
830     int index = spapr_get_vcpu_id(cpu);
831     DeviceClass *dc = DEVICE_GET_CLASS(cs);
832     g_autofree char *nodename = NULL;
833     int offset;
834 
835     if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
836         return;
837     }
838 
839     nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
840     offset = fdt_add_subnode(fdt, cpus_offset, nodename);
841     _FDT(offset);
842     spapr_dt_cpu(cs, fdt, offset, spapr);
843 }
844 
845 
846 static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
847 {
848     CPUState **rev;
849     CPUState *cs;
850     int n_cpus;
851     int cpus_offset;
852     int i;
853 
854     cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
855     _FDT(cpus_offset);
856     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
857     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));
858 
859     /*
860      * We walk the CPUs in reverse order to ensure that CPU DT nodes
861      * created by fdt_add_subnode() end up in the right order in FDT
862      * for the guest kernel to enumerate the CPUs correctly.
863      *
864      * The CPU list cannot be traversed in reverse order, so we need
865      * to do extra work.
866      */
867     n_cpus = 0;
868     rev = NULL;
869     CPU_FOREACH(cs) {
870         rev = g_renew(CPUState *, rev, n_cpus + 1);
871         rev[n_cpus++] = cs;
872     }
873 
874     for (i = n_cpus - 1; i >= 0; i--) {
875         spapr_dt_one_cpu(fdt, spapr, rev[i], cpus_offset);
876     }
877 
878     g_free(rev);
879 }
880 
881 static int spapr_dt_rng(void *fdt)
882 {
883     int node;
884     int ret;
885 
886     node = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
887     if (node <= 0) {
888         return -1;
889     }
890     ret = fdt_setprop_string(fdt, node, "device_type",
891                              "ibm,platform-facilities");
892     ret |= fdt_setprop_cell(fdt, node, "#address-cells", 0x1);
893     ret |= fdt_setprop_cell(fdt, node, "#size-cells", 0x0);
894 
895     node = fdt_add_subnode(fdt, node, "ibm,random-v1");
896     if (node <= 0) {
897         return -1;
898     }
899     ret |= fdt_setprop_string(fdt, node, "compatible", "ibm,random");
900 
901     return ret ? -1 : 0;
902 }
903 
904 static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
905 {
906     MachineState *ms = MACHINE(spapr);
907     int rtas;
908     GString *hypertas = g_string_sized_new(256);
909     GString *qemu_hypertas = g_string_sized_new(256);
910     uint64_t max_device_addr = 0;
911     uint32_t lrdr_capacity[] = {
912         0,
913         0,
914         cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE >> 32),
915         cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff),
916         cpu_to_be32(ms->smp.max_cpus / ms->smp.threads),
917     };
918 
919     /* Do we have device memory? */
920     if (MACHINE(spapr)->device_memory) {
921         max_device_addr = MACHINE(spapr)->device_memory->base +
922             memory_region_size(&MACHINE(spapr)->device_memory->mr);
923     } else if (ms->ram_size == ms->maxram_size) {
924         max_device_addr = ms->ram_size;
925     }
926 
927     lrdr_capacity[0] = cpu_to_be32(max_device_addr >> 32);
928     lrdr_capacity[1] = cpu_to_be32(max_device_addr & 0xffffffff);
929 
930     _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));
931 
932     /* hypertas */
933     add_str(hypertas, "hcall-pft");
934     add_str(hypertas, "hcall-term");
935     add_str(hypertas, "hcall-dabr");
936     add_str(hypertas, "hcall-interrupt");
937     add_str(hypertas, "hcall-tce");
938     add_str(hypertas, "hcall-vio");
939     add_str(hypertas, "hcall-splpar");
940     add_str(hypertas, "hcall-join");
941     add_str(hypertas, "hcall-bulk");
942     add_str(hypertas, "hcall-set-mode");
943     add_str(hypertas, "hcall-sprg0");
944     add_str(hypertas, "hcall-copy");
945     add_str(hypertas, "hcall-debug");
946     add_str(hypertas, "hcall-vphn");
947     if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) {
948         add_str(hypertas, "hcall-rpt-invalidate");
949     }
950 
951     add_str(qemu_hypertas, "hcall-memop1");
952 
953     if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
954         add_str(hypertas, "hcall-multi-tce");
955     }
956 
957     if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
958         add_str(hypertas, "hcall-hpt-resize");
959     }
960 
961     add_str(hypertas, "hcall-watchdog");
962 
963     _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
964                      hypertas->str, hypertas->len));
965     g_string_free(hypertas, TRUE);
966     _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
967                      qemu_hypertas->str, qemu_hypertas->len));
968     g_string_free(qemu_hypertas, TRUE);
969 
970     spapr_numa_write_rtas_dt(spapr, fdt, rtas);
971 
972     /*
973      * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log,
974      * and 16 bytes per CPU for system reset error log plus an extra 8 bytes.
975      *
976      * The system reset requirements are driven by existing Linux and PowerVM
977      * implementation which (contrary to PAPR) saves r3 in the error log
978      * structure like machine check, so Linux expects to find the saved r3
979      * value at the address in r3 upon FWNMI-enabled sreset interrupt (and
980      * does not look at the error value).
981      *
982      * System reset interrupts are not subject to interlock like machine
983      * check, so this memory area could be corrupted if the sreset is
984      * interrupted by a machine check (or vice versa) if it was shared. To
985      * prevent this, system reset uses per-CPU areas for the sreset save
986      * area. A system reset that interrupts a system reset handler could
987      * still overwrite this area, but Linux doesn't try to recover in that
988      * case anyway.
989      *
990      * The extra 8 bytes is required because Linux's FWNMI error log check
991      * is off-by-one.
992      *
993      * RTAS_MIN_SIZE is required for the RTAS blob itself.
994      */
995     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE +
996                           RTAS_ERROR_LOG_MAX +
997                           ms->smp.max_cpus * sizeof(uint64_t) * 2 +
998                           sizeof(uint64_t)));
999     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
1000                           RTAS_ERROR_LOG_MAX));
1001     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
1002                           RTAS_EVENT_SCAN_RATE));
1003 
1004     g_assert(msi_nonbroken);
1005     _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));
1006 
1007     /*
1008      * According to PAPR, rtas ibm,os-term does not guarantee a return
1009      * back to the guest cpu.
1010      *
1011      * An additional ibm,extended-os-term property indicates that the
1012      * RTAS call will always return. Set this property.
1013      */
1014     _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));
1015 
1016     _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
1017                      lrdr_capacity, sizeof(lrdr_capacity)));
1018 
1019     spapr_dt_rtas_tokens(fdt, rtas);
1020 }
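
/*
 * E.g. (illustrative): with ms->smp.max_cpus = 4 the "rtas-size"
 * reservation is RTAS_MIN_SIZE + RTAS_ERROR_LOG_MAX + 4 * 16 + 8
 * bytes, the per-CPU term being the 16-byte sreset save areas
 * described above.
 */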
1021 
1022 /*
1023  * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
1024  * and the XIVE features that the guest may request and thus the valid
1025  * values for bytes 23..26 of option vector 5:
1026  */
1027 static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
1028                                           int chosen)
1029 {
1030     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
1031 
1032     char val[2 * 4] = {
1033         23, 0x00, /* XICS / XIVE mode */
1034         24, 0x00, /* Hash/Radix, filled in below. */
1035         25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
1036         26, 0x40, /* Radix options: GTSE == yes. */
1037     };
1038 
1039     if (spapr->irq->xics && spapr->irq->xive) {
1040         val[1] = SPAPR_OV5_XIVE_BOTH;
1041     } else if (spapr->irq->xive) {
1042         val[1] = SPAPR_OV5_XIVE_EXPLOIT;
1043     } else {
1044         assert(spapr->irq->xics);
1045         val[1] = SPAPR_OV5_XIVE_LEGACY;
1046     }
1047 
1048     if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
1049                           first_ppc_cpu->compat_pvr)) {
1050         /*
1051          * If we're in a pre POWER9 compat mode then the guest should
1052          * do hash and use the legacy interrupt mode
1053          */
1054         val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */
1055         val[3] = 0x00; /* Hash */
1056         spapr_check_mmu_mode(false);
1057     } else if (kvm_enabled()) {
1058         if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
1059             val[3] = 0x80; /* OV5_MMU_BOTH */
1060         } else if (kvmppc_has_cap_mmu_radix()) {
1061             val[3] = 0x40; /* OV5_MMU_RADIX_300 */
1062         } else {
1063             val[3] = 0x00; /* Hash */
1064         }
1065     } else {
1066         /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
1067         val[3] = 0xC0;
1068     }
1069     _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
1070                      val, sizeof(val)));
1071 }
1072 
1073 static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
1074 {
1075     MachineState *machine = MACHINE(spapr);
1076     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
1077     int chosen;
1078 
1079     _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
1080 
1081     if (reset) {
1082         const char *boot_device = spapr->boot_device;
1083         g_autofree char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
1084         size_t cb = 0;
1085         g_autofree char *bootlist = get_boot_devices_list(&cb);
1086 
1087         if (machine->kernel_cmdline && machine->kernel_cmdline[0]) {
1088             _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
1089                                     machine->kernel_cmdline));
1090         }
1091 
1092         if (spapr->initrd_size) {
1093             _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
1094                                   spapr->initrd_base));
1095             _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
1096                                   spapr->initrd_base + spapr->initrd_size));
1097         }
1098 
1099         if (spapr->kernel_size) {
1100             uint64_t kprop[2] = { cpu_to_be64(spapr->kernel_addr),
1101                                   cpu_to_be64(spapr->kernel_size) };
1102 
1103             _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
1104                          &kprop, sizeof(kprop)));
1105             if (spapr->kernel_le) {
1106                 _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
1107             }
1108         }
1109         if (machine->boot_config.has_menu && machine->boot_config.menu) {
1110             _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", true)));
1111         }
1112         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
1113         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
1114         _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));
1115 
1116         if (cb && bootlist) {
1117             int i;
1118 
1119             for (i = 0; i < cb; i++) {
1120                 if (bootlist[i] == '\n') {
1121                     bootlist[i] = ' ';
1122                 }
1123             }
1124             _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
1125         }
1126 
1127         if (boot_device && strlen(boot_device)) {
1128             _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
1129         }
1130 
1131         if (spapr->want_stdout_path && stdout_path) {
1132             /*
1133              * "linux,stdout-path" and "stdout" properties are
1134              * deprecated by linux kernel. New platforms should only
1135              * use the "stdout-path" property. Set the new property
1136              * and continue using older property to remain compatible
1137              * with the existing firmware.
1138              */
1139             _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
1140             _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
1141         }
1142 
1143         /*
1144          * We can deal with BAR reallocation just fine, advertise it
1145          * to the guest
1146          */
1147         if (smc->linux_pci_probe) {
1148             _FDT(fdt_setprop_cell(fdt, chosen, "linux,pci-probe-only", 0));
1149         }
1150 
1151         spapr_dt_ov5_platform_support(spapr, fdt, chosen);
1152     }
1153 
1154     _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32));
1155 
1156     _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
1157 }
1158 
1159 static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
1160 {
1161     /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
1162      * KVM to work under pHyp with some guest co-operation */
1163     int hypervisor;
1164     uint8_t hypercall[16];
1165 
1166     _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
1167     /* indicate KVM hypercall interface */
1168     _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
1169     if (kvmppc_has_cap_fixup_hcalls()) {
1170         /*
1171          * Older KVM versions with older guest kernels were broken
1172          * with the magic page, don't allow the guest to map it.
1173          */
1174         if (!kvmppc_get_hypercall(cpu_env(first_cpu), hypercall,
1175                                   sizeof(hypercall))) {
1176             _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
1177                              hypercall, sizeof(hypercall)));
1178         }
1179     }
1180 }
1181 
1182 void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space)
1183 {
1184     MachineState *machine = MACHINE(spapr);
1185     MachineClass *mc = MACHINE_GET_CLASS(machine);
1186     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
1187     uint32_t root_drc_type_mask = 0;
1188     int ret;
1189     void *fdt;
1190     SpaprPhbState *phb;
1191     char *buf;
1192 
1193     fdt = g_malloc0(space);
1194     _FDT((fdt_create_empty_tree(fdt, space)));
1195 
1196     /* Root node */
1197     _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
1198     _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
1199     _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));
1200 
1201     /* Guest UUID & Name*/
1202     buf = qemu_uuid_unparse_strdup(&qemu_uuid);
1203     _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
1204     if (qemu_uuid_set) {
1205         _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
1206     }
1207     g_free(buf);
1208 
1209     if (qemu_get_vm_name()) {
1210         _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
1211                                 qemu_get_vm_name()));
1212     }
1213 
1214     /* Host Model & Serial Number */
1215     if (spapr->host_model) {
1216         _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
1217     } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
1218         _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
1219         g_free(buf);
1220     }
1221 
1222     if (spapr->host_serial) {
1223         _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
1224     } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
1225         _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
1226         g_free(buf);
1227     }
1228 
1229     _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
1230     _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));
1231 
1232     /* /interrupt controller */
1233     spapr_irq_dt(spapr, spapr_max_server_number(spapr), fdt, PHANDLE_INTC);
1234 
1235     ret = spapr_dt_memory(spapr, fdt);
1236     if (ret < 0) {
1237         error_report("couldn't setup memory nodes in fdt");
1238         exit(1);
1239     }
1240 
1241     /* /vdevice */
1242     spapr_dt_vdevice(spapr->vio_bus, fdt);
1243 
1244     if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
1245         ret = spapr_dt_rng(fdt);
1246         if (ret < 0) {
1247             error_report("could not set up rng device in the fdt");
1248             exit(1);
1249         }
1250     }
1251 
1252     QLIST_FOREACH(phb, &spapr->phbs, list) {
1253         ret = spapr_dt_phb(spapr, phb, PHANDLE_INTC, fdt, NULL);
1254         if (ret < 0) {
1255             error_report("couldn't setup PCI devices in fdt");
1256             exit(1);
1257         }
1258     }
1259 
1260     spapr_dt_cpus(fdt, spapr);
1261 
1262     /* ibm,drc-indexes and friends */
1263     root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_LMB;
1264     if (smc->dr_phb_enabled) {
1265         root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PHB;
1266     }
1267     if (mc->nvdimm_supported) {
1268         root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PMEM;
1269     }
1270     if (root_drc_type_mask) {
1271         _FDT(spapr_dt_drc(fdt, 0, NULL, root_drc_type_mask));
1272     }
1273 
1274     if (mc->has_hotpluggable_cpus) {
1275         int offset = fdt_path_offset(fdt, "/cpus");
1276         ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU);
1277         if (ret < 0) {
1278             error_report("Couldn't set up CPU DR device tree properties");
1279             exit(1);
1280         }
1281     }
1282 
1283     /* /event-sources */
1284     spapr_dt_events(spapr, fdt);
1285 
1286     /* /rtas */
1287     spapr_dt_rtas(spapr, fdt);
1288 
1289     /* /chosen */
1290     spapr_dt_chosen(spapr, fdt, reset);
1291 
1292     /* /hypervisor */
1293     if (kvm_enabled()) {
1294         spapr_dt_hypervisor(spapr, fdt);
1295     }
1296 
1297     /* Build memory reserve map */
1298     if (reset) {
1299         if (spapr->kernel_size) {
1300             _FDT((fdt_add_mem_rsv(fdt, spapr->kernel_addr,
1301                                   spapr->kernel_size)));
1302         }
1303         if (spapr->initrd_size) {
1304             _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base,
1305                                   spapr->initrd_size)));
1306         }
1307     }
1308 
1309     /* NVDIMM devices */
1310     if (mc->nvdimm_supported) {
1311         spapr_dt_persistent_memory(spapr, fdt);
1312     }
1313 
1314     return fdt;
1315 }
1316 
1317 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
1318 {
1319     SpaprMachineState *spapr = opaque;
1320 
1321     return (addr & 0x0fffffff) + spapr->kernel_addr;
1322 }
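
/*
 * E.g. (illustrative): with spapr->kernel_addr = 0x400000, an ELF
 * physical address such as 0xc000000002000000 is masked down to
 * 0x2000000 and relocated to 0x2400000.
 */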
1323 
1324 static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
1325                                     PowerPCCPU *cpu)
1326 {
1327     CPUPPCState *env = &cpu->env;
1328 
1329     /* The TCG path should also be holding the BQL at this point */
1330     g_assert(bql_locked());
1331 
1332     g_assert(!vhyp_cpu_in_nested(cpu));
1333 
1334     if (FIELD_EX64(env->msr, MSR, PR)) {
1335         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
1336         env->gpr[3] = H_PRIVILEGE;
1337     } else {
1338         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
1339     }
1340 }
1341 
1342 struct LPCRSyncState {
1343     target_ulong value;
1344     target_ulong mask;
1345 };
1346 
1347 static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
1348 {
1349     struct LPCRSyncState *s = arg.host_ptr;
1350     PowerPCCPU *cpu = POWERPC_CPU(cs);
1351     CPUPPCState *env = &cpu->env;
1352     target_ulong lpcr;
1353 
1354     cpu_synchronize_state(cs);
1355     lpcr = env->spr[SPR_LPCR];
1356     lpcr &= ~s->mask;
1357     lpcr |= s->value;
1358     ppc_store_lpcr(cpu, lpcr);
1359 }
1360 
1361 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
1362 {
1363     CPUState *cs;
1364     struct LPCRSyncState s = {
1365         .value = value,
1366         .mask = mask
1367     };
1368     CPU_FOREACH(cs) {
1369         run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
1370     }
1371 }
1372 
1373 /* May be used when the machine is not running */
1374 void spapr_init_all_lpcrs(target_ulong value, target_ulong mask)
1375 {
1376     CPUState *cs;
1377     CPU_FOREACH(cs) {
1378         PowerPCCPU *cpu = POWERPC_CPU(cs);
1379         CPUPPCState *env = &cpu->env;
1380         target_ulong lpcr;
1381 
1382         lpcr = env->spr[SPR_LPCR];
1383         lpcr &= ~(LPCR_HR | LPCR_UPRT);
1384         ppc_store_lpcr(cpu, lpcr);
1385     }
1386 }
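
/*
 * Usage sketch (illustrative): reset-time code can force all vcpus back
 * to legacy hash translation before the guest renegotiates radix with,
 * e.g.,
 *
 *     spapr_init_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
 *
 * which clears LPCR[HR] and LPCR[UPRT] on every CPU without the
 * run_on_cpu() round trip that spapr_set_all_lpcrs() needs while the
 * machine is running.
 */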
1387 
1388 static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu,
1389                            target_ulong lpid, ppc_v3_pate_t *entry)
1390 {
1391     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1392     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
1393 
1394     if (!spapr_cpu->in_nested) {
1395         assert(lpid == 0);
1396 
1397         /* Copy PATE1:GR into PATE0:HR */
1398         entry->dw0 = spapr->patb_entry & PATE0_HR;
1399         entry->dw1 = spapr->patb_entry;
1400         return true;
1401     } else {
1402         if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) {
1403             return spapr_get_pate_nested_hv(spapr, cpu, lpid, entry);
1404         } else if (spapr_nested_api(spapr) == NESTED_API_PAPR) {
1405             return spapr_get_pate_nested_papr(spapr, cpu, lpid, entry);
1406         } else {
1407             g_assert_not_reached();
1408         }
1409     }
1410 }
1411 
1412 static uint64_t *hpte_get_ptr(SpaprMachineState *s, unsigned index)
1413 {
1414     uint64_t *table = s->htab;
1415 
1416     return &table[2 * index];
1417 }
1418 
1419 static bool hpte_is_valid(SpaprMachineState *s, unsigned index)
1420 {
1421     return ldq_be_p(hpte_get_ptr(s, index)) & HPTE64_V_VALID;
1422 }
1423 
1424 static bool hpte_is_dirty(SpaprMachineState *s, unsigned index)
1425 {
1426     return ldq_be_p(hpte_get_ptr(s, index)) & HPTE64_V_HPTE_DIRTY;
1427 }
1428 
1429 static void hpte_set_clean(SpaprMachineState *s, unsigned index)
1430 {
1431     stq_be_p(hpte_get_ptr(s, index),
1432              ldq_be_p(hpte_get_ptr(s, index)) & ~HPTE64_V_HPTE_DIRTY);
1433 }
1434 
1435 static void hpte_set_dirty(SpaprMachineState *s, unsigned index)
1436 {
1437     stq_be_p(hpte_get_ptr(s, index),
1438              ldq_be_p(hpte_get_ptr(s, index)) | HPTE64_V_HPTE_DIRTY);
1439 }
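/*
 * Layout these helpers rely on: the HPT is an array of 16-byte HPTEs
 * (HASH_PTE_SIZE_64), each made of two big-endian doublewords, hence
 * the 2 * index in hpte_get_ptr(). HPTE64_V_VALID is the architected
 * valid bit of doubleword 0; HPTE64_V_HPTE_DIRTY is a software-only
 * bit kept in the same word, used here for migration dirty tracking.
 */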
1440 
1441 /*
1442  * Get the fd to access the kernel htab, re-opening it if necessary
1443  */
1444 static int get_htab_fd(SpaprMachineState *spapr)
1445 {
1446     Error *local_err = NULL;
1447 
1448     if (spapr->htab_fd >= 0) {
1449         return spapr->htab_fd;
1450     }
1451 
1452     spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
1453     if (spapr->htab_fd < 0) {
1454         error_report_err(local_err);
1455     }
1456 
1457     return spapr->htab_fd;
1458 }
1459 
1460 void close_htab_fd(SpaprMachineState *spapr)
1461 {
1462     if (spapr->htab_fd >= 0) {
1463         close(spapr->htab_fd);
1464     }
1465     spapr->htab_fd = -1;
1466 }
1467 
1468 static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
1469 {
1470     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1471 
1472     return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
1473 }
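/*
 * Example: htab_shift = 25 is a 32 MiB table; with 128-byte PTEGs
 * (HASH_PTEG_SIZE_64, i.e. 8 HPTEs of 16 bytes each) that is 2^18
 * groups, so the mask returned here would be 0x3ffff.
 */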
1474 
1475 static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
1476 {
1477     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1478 
1479     assert(kvm_enabled());
1480 
1481     if (!spapr->htab) {
1482         return 0;
1483     }
1484 
1485     return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
1486 }
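/*
 * This mirrors the SDR1-style layout expected by KVM PR: the table is
 * allocated size-aligned (see spapr_reallocate_hpt()), so its low bits
 * are free to carry the size field, encoded as htab_shift - 18. A
 * 32 MiB table (shift 25), for example, yields a size field of 7.
 */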
1487 
1488 static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
1489                                                 hwaddr ptex, int n)
1490 {
1491     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1492     hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
1493 
1494     if (!spapr->htab) {
1495         /*
1496          * HTAB is controlled by KVM. Fetch into temporary buffer
1497          */
1498         ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
1499         kvmppc_read_hptes(hptes, ptex, n);
1500         return hptes;
1501     }
1502 
1503     /*
1504      * HTAB is controlled by QEMU. Just point to the internally
1505      * accessible PTEG.
1506      */
1507     return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
1508 }
1509 
1510 static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
1511                               const ppc_hash_pte64_t *hptes,
1512                               hwaddr ptex, int n)
1513 {
1514     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1515 
1516     if (!spapr->htab) {
1517         g_free((void *)hptes);
1518     }
1519 
1520     /* Nothing to do for a QEMU-managed HPT */
1521 }
1522 
1523 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
1524                       uint64_t pte0, uint64_t pte1)
1525 {
1526     SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
1527     hwaddr offset = ptex * HASH_PTE_SIZE_64;
1528 
1529     if (!spapr->htab) {
1530         kvmppc_write_hpte(ptex, pte0, pte1);
1531     } else {
1532         if (pte0 & HPTE64_V_VALID) {
1533             stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
1534             /*
1535              * When setting valid, we write PTE1 first. This ensures
1536              * proper synchronization with the reading code in
1537              * ppc_hash64_pteg_search()
1538              */
1539             smp_wmb();
1540             stq_p(spapr->htab + offset, pte0);
1541         } else {
1542             stq_p(spapr->htab + offset, pte0);
1543             /*
1544              * When clearing it we set PTE0 first. This ensures proper
1545              * synchronization with the reading code in
1546              * ppc_hash64_pteg_search()
1547              */
1548             smp_wmb();
1549             stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
1550         }
1551     }
1552 }
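/*
 * The pairing reader, sketched: ppc_hash64_pteg_search() effectively
 * does
 *
 *     pte0 = ldq_p(htab + offset);            // check HPTE64_V_VALID
 *     smp_rmb();
 *     pte1 = ldq_p(htab + offset + HPTE64_DW1);
 *
 * so with the store order above, a reader that observes VALID set is
 * guaranteed to read a current PTE1.
 */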
1553 
1554 static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1555                              uint64_t pte1)
1556 {
1557     hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_C;
1558     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1559 
1560     if (!spapr->htab) {
1561         /* There should always be a hash table when this is called */
1562         error_report("spapr_hpte_set_c called with no hash table!");
1563         return;
1564     }
1565 
1566     /* The HW performs a non-atomic byte update */
1567     stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
1568 }
1569 
1570 static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1571                              uint64_t pte1)
1572 {
1573     hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_R;
1574     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1575 
1576     if (!spapr->htab) {
1577         /* There should always be a hash table when this is called */
1578         error_report("spapr_hpte_set_r called with no hash table!");
1579         return;
1580     }
1581 
1582     /* The HW performs a non-atomic byte update */
1583     stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
1584 }
1585 
1586 int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
1587 {
1588     int shift;
1589 
1590     /* We aim for a hash table of size 1/128 the size of RAM (rounded
1591      * up).  The PAPR recommendation is actually 1/64 of RAM size, but
1592      * that's much more than is needed for Linux guests */
1593     shift = ctz64(pow2ceil(ramsize)) - 7;
1594     shift = MAX(shift, 18); /* Minimum architected size */
1595     shift = MIN(shift, 46); /* Maximum architected size */
1596     return shift;
1597 }
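/*
 * Worked example: ramsize = 4 GiB gives ctz64(pow2ceil(2^32)) = 32,
 * so shift = 32 - 7 = 25, i.e. a 32 MiB HPT (1/128 of RAM). The
 * clamps only take effect below 32 MiB or above 8 PiB of rounded-up
 * RAM.
 */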
1598 
1599 void spapr_free_hpt(SpaprMachineState *spapr)
1600 {
1601     qemu_vfree(spapr->htab);
1602     spapr->htab = NULL;
1603     spapr->htab_shift = 0;
1604     close_htab_fd(spapr);
1605 }
1606 
1607 int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp)
1608 {
1609     ERRP_GUARD();
1610     long rc;
1611 
1612     /* Clean up any HPT info from a previous boot */
1613     spapr_free_hpt(spapr);
1614 
1615     rc = kvmppc_reset_htab(shift);
1616 
1617     if (rc == -EOPNOTSUPP) {
1618         error_setg(errp, "HPT not supported in nested guests");
1619         return -EOPNOTSUPP;
1620     }
1621 
1622     if (rc < 0) {
1623         /* kernel-side HPT needed, but couldn't allocate one */
1624         error_setg_errno(errp, errno, "Failed to allocate KVM HPT of order %d",
1625                          shift);
1626         error_append_hint(errp, "Try smaller maxmem?\n");
1627         return -errno;
1628     } else if (rc > 0) {
1629         /* kernel-side HPT allocated */
1630         if (rc != shift) {
1631             error_setg(errp,
1632                        "Requested order %d HPT, but kernel allocated order %ld",
1633                        shift, rc);
1634             error_append_hint(errp, "Try smaller maxmem?\n");
1635             return -ENOSPC;
1636         }
1637 
1638         spapr->htab_shift = shift;
1639         spapr->htab = NULL;
1640     } else {
1641         /* kernel-side HPT not needed, allocate in userspace instead */
1642         size_t size = 1ULL << shift;
1643         int i;
1644 
1645         spapr->htab = qemu_memalign(size, size);
1646         memset(spapr->htab, 0, size);
1647         spapr->htab_shift = shift;
1648 
1649         for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
1650             hpte_set_dirty(spapr, i);
1651         }
1652     }
1653     /* We're setting up a hash table, so that means we're not radix */
1654     spapr->patb_entry = 0;
1655     spapr_init_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
1656     return 0;
1657 }
1658 
1659 void spapr_setup_hpt(SpaprMachineState *spapr)
1660 {
1661     int hpt_shift;
1662 
1663     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
1664         hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
1665     } else {
1666         uint64_t current_ram_size;
1667 
1668         current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
1669         hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
1670     }
1671     spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
1672 
1673     if (kvm_enabled()) {
1674         hwaddr vrma_limit = kvmppc_vrma_limit(spapr->htab_shift);
1675 
1676         /* Check our RMA fits in the possible VRMA */
1677         if (vrma_limit < spapr->rma_size) {
1678             error_report("Unable to create %" HWADDR_PRIu
1679                          "MiB RMA (VRMA only allows %" HWADDR_PRIu "MiB)",
1680                          spapr->rma_size / MiB, vrma_limit / MiB);
1681             exit(EXIT_FAILURE);
1682         }
1683     }
1684 }
1685 
1686 void spapr_check_mmu_mode(bool guest_radix)
1687 {
1688     if (guest_radix) {
1689         if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) {
1690             error_report("Guest requested unavailable MMU mode (radix).");
1691             exit(EXIT_FAILURE);
1692         }
1693     } else {
1694         if (kvm_enabled() && kvmppc_has_cap_mmu_radix()
1695             && !kvmppc_has_cap_mmu_hash_v3()) {
1696             error_report("Guest requested unavailable MMU mode (hash).");
1697             exit(EXIT_FAILURE);
1698         }
1699     }
1700 }
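/*
 * Concretely: a radix guest requires kvmppc_has_cap_mmu_radix() on the
 * host, while a hash guest on a radix-capable (POWER9 or later) host
 * additionally requires kvmppc_has_cap_mmu_hash_v3(). Both checks are
 * KVM-only; under TCG both MMU modes are modelled, so neither fires.
 */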
1701 
1702 static void spapr_machine_reset(MachineState *machine, ResetType type)
1703 {
1704     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
1705     PowerPCCPU *first_ppc_cpu;
1706     hwaddr fdt_addr;
1707     void *fdt;
1708     int rc;
1709 
1710     if (type != RESET_TYPE_SNAPSHOT_LOAD) {
1711         /*
1712          * Record-replay snapshot load must not consume random data; that
1713          * was already replayed from the initial machine reset.
1714          */
1715         qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32);
1716     }
1717 
1718     if (machine->cgs) {
1719         confidential_guest_kvm_reset(machine->cgs, &error_fatal);
1720     }
1721     spapr_caps_apply(spapr);
1722     spapr_nested_reset(spapr);
1723 
1724     first_ppc_cpu = POWERPC_CPU(first_cpu);
1725     if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
1726         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
1727                               spapr->max_compat_pvr)) {
1728         /*
1729          * If using KVM with radix mode available, VCPUs can be started
1730          * without a HPT because KVM will start them in radix mode.
1731          * Set the GR bit in PATE so that we know there is no HPT.
1732          */
1733         spapr->patb_entry = PATE1_GR;
1734         spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
1735     } else {
1736         spapr_setup_hpt(spapr);
1737     }
1738 
1739     qemu_devices_reset(type);
1740 
1741     spapr_ovec_cleanup(spapr->ov5_cas);
1742     spapr->ov5_cas = spapr_ovec_new();
1743 
1744     ppc_init_compat_all(spapr->max_compat_pvr, &error_fatal);
1745 
1746     /*
1747      * This fixes some of the default configuration of the XIVE
1748      * devices. It must be called after the machine devices are reset.
1749      */
1750     spapr_irq_reset(spapr, &error_fatal);
1751 
1752     /*
1753      * There is no CAS under qtest. Simulate one to please the code that
1754      * depends on spapr->ov5_cas. This is especially needed to test device
1755      * unplug, so we do that before resetting the DRCs.
1756      */
1757     if (qtest_enabled()) {
1758         spapr_ovec_cleanup(spapr->ov5_cas);
1759         spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
1760     }
1761 
1762     spapr_nvdimm_finish_flushes();
1763 
1764     /* DRC reset may cause a device to be unplugged. This will cause trouble
1765      * if this device is used by another device (e.g., a running vhost backend
1766      * will crash QEMU if the DIMM holding the vring goes away). To avoid such
1767      * situations, we reset DRCs after all devices have been reset.
1768      */
1769     spapr_drc_reset_all(spapr);
1770 
1771     spapr_clear_pending_events(spapr);
1772 
1773     /*
1774      * We place the device tree just below either the top of the RMA,
1775      * or just below 2GB, whichever is lower, so that it can be
1776      * processed with 32-bit real mode code if necessary
1777      */
1778     fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
1779 
1780     fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
1781     if (spapr->vof) {
1782         spapr_vof_reset(spapr, fdt, &error_fatal);
1783         /*
1784          * Do not pack the FDT as the client may change properties.
1785          * The VOF client does not expect the FDT so we do not load it into the VM.
1786          */
1787     } else {
1788         rc = fdt_pack(fdt);
1789         /* Should only fail if we've built a corrupted tree */
1790         assert(rc == 0);
1791 
1792         spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
1793                                   0, fdt_addr, 0);
1794         cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
1795     }
1796 
1797     g_free(spapr->fdt_blob);
1798     spapr->fdt_size = fdt_totalsize(fdt);
1799     spapr->fdt_initial_size = spapr->fdt_size;
1800     spapr->fdt_blob = fdt;
1801 
1802     /* Set machine->fdt for 'dumpdtb' QMP/HMP command */
1803     machine->fdt = fdt;
1804 
1805     /* Set up the entry state */
1806     first_ppc_cpu->env.gpr[5] = 0;
1807 
1808     spapr->fwnmi_system_reset_addr = -1;
1809     spapr->fwnmi_machine_check_addr = -1;
1810     spapr->fwnmi_machine_check_interlock = -1;
1811 
1812     /* Signal all vCPUs waiting on this condition */
1813     qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond);
1814 
1815     migrate_del_blocker(&spapr->fwnmi_migration_blocker);
1816 }
1817 
1818 static void spapr_create_nvram(SpaprMachineState *spapr)
1819 {
1820     DeviceState *dev = qdev_new("spapr-nvram");
1821     DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1822 
1823     if (dinfo) {
1824         qdev_prop_set_drive_err(dev, "drive", blk_by_legacy_dinfo(dinfo),
1825                                 &error_fatal);
1826     }
1827 
1828     qdev_realize_and_unref(dev, &spapr->vio_bus->bus, &error_fatal);
1829 
1830     spapr->nvram = (struct SpaprNvram *)dev;
1831 }
1832 
1833 static void spapr_rtc_create(SpaprMachineState *spapr)
1834 {
1835     object_initialize_child_with_props(OBJECT(spapr), "rtc", &spapr->rtc,
1836                                        sizeof(spapr->rtc), TYPE_SPAPR_RTC,
1837                                        &error_fatal, NULL);
1838     qdev_realize(DEVICE(&spapr->rtc), NULL, &error_fatal);
1839     object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
1840                               "date");
1841 }
1842 
1843 /* Returns whether we want to use VGA or not */
1844 static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
1845 {
1846     vga_interface_created = true;
1847     switch (vga_interface_type) {
1848     case VGA_NONE:
1849         return false;
1850     case VGA_DEVICE:
1851         return true;
1852     case VGA_STD:
1853     case VGA_VIRTIO:
1854     case VGA_CIRRUS:
1855         return pci_vga_init(pci_bus) != NULL;
1856     default:
1857         error_setg(errp,
1858                    "Unsupported VGA mode, only -vga std or -vga virtio is supported");
1859         return false;
1860     }
1861 }
1862 
1863 static int spapr_pre_load(void *opaque)
1864 {
1865     int rc;
1866 
1867     rc = spapr_caps_pre_load(opaque);
1868     if (rc) {
1869         return rc;
1870     }
1871 
1872     return 0;
1873 }
1874 
1875 static int spapr_post_load(void *opaque, int version_id)
1876 {
1877     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1878     int err = 0;
1879 
1880     err = spapr_caps_post_migration(spapr);
1881     if (err) {
1882         return err;
1883     }
1884 
1885     /*
1886      * In earlier versions, there was no separate qdev for the PAPR
1887      * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1888      * So when migrating from those versions, poke the incoming offset
1889      * value into the RTC device
1890      */
1891     if (version_id < 3) {
1892         err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
1893         if (err) {
1894             return err;
1895         }
1896     }
1897 
1898     if (kvm_enabled() && spapr->patb_entry) {
1899         PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
1900         bool radix = !!(spapr->patb_entry & PATE1_GR);
1901         bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
1902 
1903         /*
1904          * Update LPCR:HR and UPRT as they may not be set properly in
1905          * the stream
1906          */
1907         spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
1908                             LPCR_HR | LPCR_UPRT);
1909 
1910         err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
1911         if (err) {
1912             error_report("Process table config unsupported by the host");
1913             return -EINVAL;
1914         }
1915     }
1916 
1917     err = spapr_irq_post_load(spapr, version_id);
1918     if (err) {
1919         return err;
1920     }
1921 
1922     return err;
1923 }
1924 
1925 static int spapr_pre_save(void *opaque)
1926 {
1927     int rc;
1928 
1929     rc = spapr_caps_pre_save(opaque);
1930     if (rc) {
1931         return rc;
1932     }
1933 
1934     return 0;
1935 }
1936 
1937 static bool version_before_3(void *opaque, int version_id)
1938 {
1939     return version_id < 3;
1940 }
1941 
1942 static bool spapr_pending_events_needed(void *opaque)
1943 {
1944     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1945     return !QTAILQ_EMPTY(&spapr->pending_events);
1946 }
1947 
1948 static const VMStateDescription vmstate_spapr_event_entry = {
1949     .name = "spapr_event_log_entry",
1950     .version_id = 1,
1951     .minimum_version_id = 1,
1952     .fields = (const VMStateField[]) {
1953         VMSTATE_UINT32(summary, SpaprEventLogEntry),
1954         VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
1955         VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
1956                                      NULL, extended_length),
1957         VMSTATE_END_OF_LIST()
1958     },
1959 };
1960 
1961 static const VMStateDescription vmstate_spapr_pending_events = {
1962     .name = "spapr_pending_events",
1963     .version_id = 1,
1964     .minimum_version_id = 1,
1965     .needed = spapr_pending_events_needed,
1966     .fields = (const VMStateField[]) {
1967         VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
1968                          vmstate_spapr_event_entry, SpaprEventLogEntry, next),
1969         VMSTATE_END_OF_LIST()
1970     },
1971 };
1972 
1973 static bool spapr_ov5_cas_needed(void *opaque)
1974 {
1975     SpaprMachineState *spapr = opaque;
1976     SpaprOptionVector *ov5_mask = spapr_ovec_new();
1977     bool cas_needed;
1978 
1979     /* Prior to the introduction of SpaprOptionVector, we had two option
1980      * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
1981      * Both of these options encode machine topology into the device-tree
1982      * in such a way that the now-booted OS should still be able to interact
1983      * appropriately with QEMU regardless of what options were actually
1984      * negotiated on the source side.
1985      *
1986      * As such, we can avoid migrating the CAS-negotiated options if these
1987      * are the only options available on the current machine/platform.
1988      * Since these are the only options available for pseries-2.7 and
1989      * earlier, this allows us to maintain old->new/new->old migration
1990      * compatibility.
1991      *
1992      * For QEMU 2.8+, there are additional CAS-negotiable options available
1993      * via default pseries-2.8 machines and explicit command-line parameters.
1994      * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
1995      * of the actual CAS-negotiated values to continue working properly. For
1996      * example, availability of memory unplug depends on knowing whether
1997      * OV5_HP_EVT was negotiated via CAS.
1998      *
1999      * Thus, for any cases where the set of available CAS-negotiable
2000      * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
2001      * include the CAS-negotiated options in the migration stream, unless
2002      * they affect boot-time behaviour only.
2003      */
2004     spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
2005     spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
2006     spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
2007 
2008     /* We need extra information if we have any bits outside the mask
2009      * defined above */
2010     cas_needed = !spapr_ovec_subset(spapr->ov5, ov5_mask);
2011 
2012     spapr_ovec_cleanup(ov5_mask);
2013 
2014     return cas_needed;
2015 }
2016 
2017 static const VMStateDescription vmstate_spapr_ov5_cas = {
2018     .name = "spapr_option_vector_ov5_cas",
2019     .version_id = 1,
2020     .minimum_version_id = 1,
2021     .needed = spapr_ov5_cas_needed,
2022     .fields = (const VMStateField[]) {
2023         VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
2024                                  vmstate_spapr_ovec, SpaprOptionVector),
2025         VMSTATE_END_OF_LIST()
2026     },
2027 };
2028 
2029 static bool spapr_patb_entry_needed(void *opaque)
2030 {
2031     SpaprMachineState *spapr = opaque;
2032 
2033     return !!spapr->patb_entry;
2034 }
2035 
2036 static const VMStateDescription vmstate_spapr_patb_entry = {
2037     .name = "spapr_patb_entry",
2038     .version_id = 1,
2039     .minimum_version_id = 1,
2040     .needed = spapr_patb_entry_needed,
2041     .fields = (const VMStateField[]) {
2042         VMSTATE_UINT64(patb_entry, SpaprMachineState),
2043         VMSTATE_END_OF_LIST()
2044     },
2045 };
2046 
2047 static bool spapr_irq_map_needed(void *opaque)
2048 {
2049     SpaprMachineState *spapr = opaque;
2050 
2051     return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
2052 }
2053 
2054 static const VMStateDescription vmstate_spapr_irq_map = {
2055     .name = "spapr_irq_map",
2056     .version_id = 1,
2057     .minimum_version_id = 1,
2058     .needed = spapr_irq_map_needed,
2059     .fields = (const VMStateField[]) {
2060         VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
2061         VMSTATE_END_OF_LIST()
2062     },
2063 };
2064 
2065 static bool spapr_dtb_needed(void *opaque)
2066 {
2067     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
2068 
2069     return smc->update_dt_enabled;
2070 }
2071 
2072 static int spapr_dtb_pre_load(void *opaque)
2073 {
2074     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2075 
2076     g_free(spapr->fdt_blob);
2077     spapr->fdt_blob = NULL;
2078     spapr->fdt_size = 0;
2079 
2080     return 0;
2081 }
2082 
2083 static const VMStateDescription vmstate_spapr_dtb = {
2084     .name = "spapr_dtb",
2085     .version_id = 1,
2086     .minimum_version_id = 1,
2087     .needed = spapr_dtb_needed,
2088     .pre_load = spapr_dtb_pre_load,
2089     .fields = (const VMStateField[]) {
2090         VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
2091         VMSTATE_UINT32(fdt_size, SpaprMachineState),
2092         VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
2093                                      fdt_size),
2094         VMSTATE_END_OF_LIST()
2095     },
2096 };
2097 
2098 static bool spapr_fwnmi_needed(void *opaque)
2099 {
2100     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2101 
2102     return spapr->fwnmi_machine_check_addr != -1;
2103 }
2104 
2105 static int spapr_fwnmi_pre_save(void *opaque)
2106 {
2107     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2108 
2109     /*
2110      * Check if machine check handling is in progress and print a
2111      * warning message.
2112      */
2113     if (spapr->fwnmi_machine_check_interlock != -1) {
2114         warn_report("A machine check is being handled during migration. The "
2115                     "handler may run and log a hardware error on the destination");
2116     }
2117 
2118     return 0;
2119 }
2120 
2121 static const VMStateDescription vmstate_spapr_fwnmi = {
2122     .name = "spapr_fwnmi",
2123     .version_id = 1,
2124     .minimum_version_id = 1,
2125     .needed = spapr_fwnmi_needed,
2126     .pre_save = spapr_fwnmi_pre_save,
2127     .fields = (const VMStateField[]) {
2128         VMSTATE_UINT64(fwnmi_system_reset_addr, SpaprMachineState),
2129         VMSTATE_UINT64(fwnmi_machine_check_addr, SpaprMachineState),
2130         VMSTATE_INT32(fwnmi_machine_check_interlock, SpaprMachineState),
2131         VMSTATE_END_OF_LIST()
2132     },
2133 };
2134 
2135 static const VMStateDescription vmstate_spapr = {
2136     .name = "spapr",
2137     .version_id = 3,
2138     .minimum_version_id = 1,
2139     .pre_load = spapr_pre_load,
2140     .post_load = spapr_post_load,
2141     .pre_save = spapr_pre_save,
2142     .fields = (const VMStateField[]) {
2143         /* used to be @next_irq */
2144         VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),
2145 
2146         /* RTC offset */
2147         VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),
2148 
2149         VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
2150         VMSTATE_END_OF_LIST()
2151     },
2152     .subsections = (const VMStateDescription * const []) {
2153         &vmstate_spapr_ov5_cas,
2154         &vmstate_spapr_patb_entry,
2155         &vmstate_spapr_pending_events,
2156         &vmstate_spapr_cap_htm,
2157         &vmstate_spapr_cap_vsx,
2158         &vmstate_spapr_cap_dfp,
2159         &vmstate_spapr_cap_cfpc,
2160         &vmstate_spapr_cap_sbbc,
2161         &vmstate_spapr_cap_ibs,
2162         &vmstate_spapr_cap_hpt_maxpagesize,
2163         &vmstate_spapr_irq_map,
2164         &vmstate_spapr_cap_nested_kvm_hv,
2165         &vmstate_spapr_dtb,
2166         &vmstate_spapr_cap_large_decr,
2167         &vmstate_spapr_cap_ccf_assist,
2168         &vmstate_spapr_cap_fwnmi,
2169         &vmstate_spapr_fwnmi,
2170         &vmstate_spapr_cap_rpt_invalidate,
2171         &vmstate_spapr_cap_ail_mode_3,
2172         &vmstate_spapr_cap_nested_papr,
2173         &vmstate_spapr_cap_dawr1,
2174         NULL
2175     }
2176 };
2177 
2178 static int htab_save_setup(QEMUFile *f, void *opaque, Error **errp)
2179 {
2180     SpaprMachineState *spapr = opaque;
2181 
2182     /* "Iteration" header */
2183     if (!spapr->htab_shift) {
2184         qemu_put_be32(f, -1);
2185     } else {
2186         qemu_put_be32(f, spapr->htab_shift);
2187     }
2188 
2189     if (spapr->htab) {
2190         spapr->htab_save_index = 0;
2191         spapr->htab_first_pass = true;
2192     } else {
2193         if (spapr->htab_shift) {
2194             assert(kvm_enabled());
2195         }
2196     }
2197 
2198 
2199     return 0;
2200 }
2201 
2202 static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
2203                             int chunkstart, int n_valid, int n_invalid)
2204 {
2205     qemu_put_be32(f, chunkstart);
2206     qemu_put_be16(f, n_valid);
2207     qemu_put_be16(f, n_invalid);
2208     qemu_put_buffer(f, (void *)hpte_get_ptr(spapr, chunkstart),
2209                     HASH_PTE_SIZE_64 * n_valid);
2210 }
2211 
2212 static void htab_save_end_marker(QEMUFile *f)
2213 {
2214     qemu_put_be32(f, 0);
2215     qemu_put_be16(f, 0);
2216     qemu_put_be16(f, 0);
2217 }
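/*
 * Together these helpers define the stream format consumed by
 * htab_load(): a sequence of chunks, each
 *
 *     be32 index | be16 n_valid | be16 n_invalid | n_valid * 16 bytes
 *
 * terminated by an all-zero end marker. Invalid entries travel as a
 * bare count, since their contents need not be transferred.
 */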
2218 
2219 static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
2220                                  int64_t max_ns)
2221 {
2222     bool has_timeout = max_ns != -1;
2223     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2224     int index = spapr->htab_save_index;
2225     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2226 
2227     assert(spapr->htab_first_pass);
2228 
2229     do {
2230         int chunkstart;
2231 
2232         /* Consume invalid HPTEs */
2233         while ((index < htabslots)
2234                && !hpte_is_valid(spapr, index)) {
2235             hpte_set_clean(spapr, index);
2236             index++;
2237         }
2238 
2239         /* Consume valid HPTEs */
2240         chunkstart = index;
2241         while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2242                && hpte_is_valid(spapr, index)) {
2243             hpte_set_clean(spapr, index);
2244             index++;
2245         }
2246 
2247         if (index > chunkstart) {
2248             int n_valid = index - chunkstart;
2249 
2250             htab_save_chunk(f, spapr, chunkstart, n_valid, 0);
2251 
2252             if (has_timeout &&
2253                 (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2254                 break;
2255             }
2256         }
2257     } while ((index < htabslots) && !migration_rate_exceeded(f));
2258 
2259     if (index >= htabslots) {
2260         assert(index == htabslots);
2261         index = 0;
2262         spapr->htab_first_pass = false;
2263     }
2264     spapr->htab_save_index = index;
2265 }
2266 
2267 static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
2268                                 int64_t max_ns)
2269 {
2270     bool final = max_ns < 0;
2271     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2272     int examined = 0, sent = 0;
2273     int index = spapr->htab_save_index;
2274     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2275 
2276     assert(!spapr->htab_first_pass);
2277 
2278     do {
2279         int chunkstart, invalidstart;
2280 
2281         /* Consume non-dirty HPTEs */
2282         while ((index < htabslots)
2283                && !hpte_is_dirty(spapr, index)) {
2284             index++;
2285             examined++;
2286         }
2287 
2288         chunkstart = index;
2289         /* Consume valid dirty HPTEs */
2290         while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2291                && hpte_is_dirty(spapr, index)
2292                && hpte_is_valid(spapr, index)) {
2293             hpte_set_clean(spapr, index);
2294             index++;
2295             examined++;
2296         }
2297 
2298         invalidstart = index;
2299         /* Consume invalid dirty HPTEs */
2300         while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
2301                && hpte_is_dirty(spapr, index)
2302                && !hpte_is_valid(spapr, index)) {
2303             hpte_set_clean(spapr, index);
2304             index++;
2305             examined++;
2306         }
2307 
2308         if (index > chunkstart) {
2309             int n_valid = invalidstart - chunkstart;
2310             int n_invalid = index - invalidstart;
2311 
2312             htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
2313             sent += index - chunkstart;
2314 
2315             if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2316                 break;
2317             }
2318         }
2319 
2320         if (examined >= htabslots) {
2321             break;
2322         }
2323 
2324         if (index >= htabslots) {
2325             assert(index == htabslots);
2326             index = 0;
2327         }
2328     } while ((examined < htabslots) && (!migration_rate_exceeded(f) || final));
2329 
2330     if (index >= htabslots) {
2331         assert(index == htabslots);
2332         index = 0;
2333     }
2334 
2335     spapr->htab_save_index = index;
2336 
2337     return (examined >= htabslots) && (sent == 0) ? 1 : 0;
2338 }
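/*
 * The return value follows the save_live_iterate convention: 0 means
 * more dirty HPTEs may remain, 1 means the whole table was examined
 * and nothing was sent, i.e. this pass is complete.
 */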
2339 
2340 #define MAX_ITERATION_NS    5000000 /* 5 ms */
2341 #define MAX_KVM_BUF_SIZE    2048
2342 
2343 static int htab_save_iterate(QEMUFile *f, void *opaque)
2344 {
2345     SpaprMachineState *spapr = opaque;
2346     int fd;
2347     int rc = 0;
2348 
2349     /* Iteration header */
2350     if (!spapr->htab_shift) {
2351         qemu_put_be32(f, -1);
2352         return 1;
2353     } else {
2354         qemu_put_be32(f, 0);
2355     }
2356 
2357     if (!spapr->htab) {
2358         assert(kvm_enabled());
2359 
2360         fd = get_htab_fd(spapr);
2361         if (fd < 0) {
2362             return fd;
2363         }
2364 
2365         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
2366         if (rc < 0) {
2367             return rc;
2368         }
2369     } else if (spapr->htab_first_pass) {
2370         htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
2371     } else {
2372         rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
2373     }
2374 
2375     htab_save_end_marker(f);
2376 
2377     return rc;
2378 }
2379 
2380 static int htab_save_complete(QEMUFile *f, void *opaque)
2381 {
2382     SpaprMachineState *spapr = opaque;
2383     int fd;
2384 
2385     /* Iteration header */
2386     if (!spapr->htab_shift) {
2387         qemu_put_be32(f, -1);
2388         return 0;
2389     } else {
2390         qemu_put_be32(f, 0);
2391     }
2392 
2393     if (!spapr->htab) {
2394         int rc;
2395 
2396         assert(kvm_enabled());
2397 
2398         fd = get_htab_fd(spapr);
2399         if (fd < 0) {
2400             return fd;
2401         }
2402 
2403         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
2404         if (rc < 0) {
2405             return rc;
2406         }
2407     } else {
2408         if (spapr->htab_first_pass) {
2409             htab_save_first_pass(f, spapr, -1);
2410         }
2411         htab_save_later_pass(f, spapr, -1);
2412     }
2413 
2414     /* End marker */
2415     htab_save_end_marker(f);
2416 
2417     return 0;
2418 }
2419 
2420 static int htab_load(QEMUFile *f, void *opaque, int version_id)
2421 {
2422     SpaprMachineState *spapr = opaque;
2423     uint32_t section_hdr;
2424     int fd = -1;
2425     Error *local_err = NULL;
2426 
2427     if (version_id < 1 || version_id > 1) {
2428         error_report("htab_load() bad version");
2429         return -EINVAL;
2430     }
2431 
2432     section_hdr = qemu_get_be32(f);
2433 
2434     if (section_hdr == -1) {
2435         spapr_free_hpt(spapr);
2436         return 0;
2437     }
2438 
2439     if (section_hdr) {
2440         int ret;
2441 
2442         /* First section gives the htab size */
2443         ret = spapr_reallocate_hpt(spapr, section_hdr, &local_err);
2444         if (ret < 0) {
2445             error_report_err(local_err);
2446             return ret;
2447         }
2448         return 0;
2449     }
2450 
2451     if (!spapr->htab) {
2452         assert(kvm_enabled());
2453 
2454         fd = kvmppc_get_htab_fd(true, 0, &local_err);
2455         if (fd < 0) {
2456             error_report_err(local_err);
2457             return fd;
2458         }
2459     }
2460 
2461     while (true) {
2462         uint32_t index;
2463         uint16_t n_valid, n_invalid;
2464 
2465         index = qemu_get_be32(f);
2466         n_valid = qemu_get_be16(f);
2467         n_invalid = qemu_get_be16(f);
2468 
2469         if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
2470             /* End of Stream */
2471             break;
2472         }
2473 
2474         if ((index + n_valid + n_invalid) >
2475             (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
2476             /* Bad index in stream */
2477             error_report(
2478                 "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)",
2479                 index, n_valid, n_invalid, spapr->htab_shift);
2480             return -EINVAL;
2481         }
2482 
2483         if (spapr->htab) {
2484             if (n_valid) {
2485                 qemu_get_buffer(f, (void *)hpte_get_ptr(spapr, index),
2486                                 HASH_PTE_SIZE_64 * n_valid);
2487             }
2488             if (n_invalid) {
2489                 memset(hpte_get_ptr(spapr, index + n_valid), 0,
2490                        HASH_PTE_SIZE_64 * n_invalid);
2491             }
2492         } else {
2493             int rc;
2494 
2495             assert(fd >= 0);
2496 
2497             rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid,
2498                                         &local_err);
2499             if (rc < 0) {
2500                 error_report_err(local_err);
2501                 return rc;
2502             }
2503         }
2504     }
2505 
2506     if (!spapr->htab) {
2507         assert(fd >= 0);
2508         close(fd);
2509     }
2510 
2511     return 0;
2512 }
2513 
2514 static void htab_save_cleanup(void *opaque)
2515 {
2516     SpaprMachineState *spapr = opaque;
2517 
2518     close_htab_fd(spapr);
2519 }
2520 
2521 static SaveVMHandlers savevm_htab_handlers = {
2522     .save_setup = htab_save_setup,
2523     .save_live_iterate = htab_save_iterate,
2524     .save_complete = htab_save_complete,
2525     .save_cleanup = htab_save_cleanup,
2526     .load_state = htab_load,
2527 };
2528 
2529 static void spapr_boot_set(void *opaque, const char *boot_device,
2530                            Error **errp)
2531 {
2532     SpaprMachineState *spapr = SPAPR_MACHINE(opaque);
2533 
2534     g_free(spapr->boot_device);
2535     spapr->boot_device = g_strdup(boot_device);
2536 }
2537 
2538 static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
2539 {
2540     MachineState *machine = MACHINE(spapr);
2541     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
2542     uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size) / lmb_size;
2543     int i;
2544 
2545     g_assert(!nr_lmbs || machine->device_memory);
2546     for (i = 0; i < nr_lmbs; i++) {
2547         uint64_t addr;
2548 
2549         addr = i * lmb_size + machine->device_memory->base;
2550         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
2551                                addr / lmb_size);
2552     }
2553 }
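/*
 * Example (illustrative sizes): with ram_size = 4 GiB and
 * maxram_size = 8 GiB, nr_lmbs = 4 GiB / 256 MiB = 16 DRCs are
 * created; each DRC index is the LMB's absolute address divided by
 * the 256 MiB block size.
 */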
2554 
2555 /*
2556  * If RAM size, maxmem size and individual node mem sizes aren't aligned
2557  * to SPAPR_MEMORY_BLOCK_SIZE (256 MiB), then refuse to start the guest
2558  * since we can't support such unaligned sizes with DRCONF_MEMORY.
2559  */
2560 static void spapr_validate_node_memory(MachineState *machine, Error **errp)
2561 {
2562     int i;
2563 
2564     if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2565         error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
2566                    " is not aligned to %" PRIu64 " MiB",
2567                    machine->ram_size,
2568                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2569         return;
2570     }
2571 
2572     if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2573         error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
2574                    " is not aligned to %" PRIu64 " MiB",
2575                    machine->maxram_size,
2576                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2577         return;
2578     }
2579 
2580     for (i = 0; i < machine->numa_state->num_nodes; i++) {
2581         if (machine->numa_state->nodes[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
2582             error_setg(errp,
2583                        "Node %d memory size 0x%" PRIx64
2584                        " is not aligned to %" PRIu64 " MiB",
2585                        i, machine->numa_state->nodes[i].node_mem,
2586                        SPAPR_MEMORY_BLOCK_SIZE / MiB);
2587             return;
2588         }
2589     }
2590 }
2591 
2592 /* find cpu slot in machine->possible_cpus by core_id */
2593 static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
2594 {
2595     int index = id / ms->smp.threads;
2596 
2597     if (index >= ms->possible_cpus->len) {
2598         return NULL;
2599     }
2600     if (idx) {
2601         *idx = index;
2602     }
2603     return &ms->possible_cpus->cpus[index];
2604 }
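/*
 * Example: with ms->smp.threads = 8, core_id 16 lands in slot index 2;
 * any id within the same core (16..23) resolves to that slot.
 */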
2605 
2606 static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
2607 {
2608     MachineState *ms = MACHINE(spapr);
2609     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2610     Error *local_err = NULL;
2611     bool vsmt_user = !!spapr->vsmt;
2612     int kvm_smt = kvmppc_smt_threads();
2613     int ret;
2614     unsigned int smp_threads = ms->smp.threads;
2615 
2616     if (tcg_enabled()) {
2617         if (smp_threads > 1 &&
2618             !ppc_type_check_compat(ms->cpu_type, CPU_POWERPC_LOGICAL_2_07, 0,
2619                                    spapr->max_compat_pvr)) {
2620             error_setg(errp, "TCG only supports SMT on POWER8 or newer CPUs");
2621             return;
2622         }
2623 
2624         if (smp_threads > 8) {
2625             error_setg(errp, "TCG cannot support more than 8 threads/core "
2626                        "on a pseries machine");
2627             return;
2628         }
2629     }
2630     if (!is_power_of_2(smp_threads)) {
2631         error_setg(errp, "Cannot support %d threads/core on a pseries "
2632                    "machine because it must be a power of 2", smp_threads);
2633         return;
2634     }
2635 
2636     /* Determine the VSMT mode to use: */
2637     if (vsmt_user) {
2638         if (spapr->vsmt < smp_threads) {
2639             error_setg(errp, "Cannot support VSMT mode %d"
2640                        " because it must be >= threads/core (%d)",
2641                        spapr->vsmt, smp_threads);
2642             return;
2643         }
2644         /* In this case, spapr->vsmt has been set by the command line */
2645     } else if (!smc->smp_threads_vsmt) {
2646         /*
2647          * Default VSMT value is tricky, because we need it to be as
2648          * consistent as possible (for migration), but this requires
2649          * changing it for at least some existing cases.  We pick 8 as
2650          * the value that we'd get with KVM on POWER8, the
2651          * overwhelmingly common case in production systems.
2652          */
2653         spapr->vsmt = MAX(8, smp_threads);
2654     } else {
2655         spapr->vsmt = smp_threads;
2656     }
2657 
2658     /* KVM: If necessary, set the SMT mode: */
2659     if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
2660         ret = kvmppc_set_smt_threads(spapr->vsmt);
2661         if (ret) {
2662             /* Looks like KVM isn't able to change VSMT mode */
2663             error_setg(&local_err,
2664                        "Failed to set KVM's VSMT mode to %d (errno %d)",
2665                        spapr->vsmt, ret);
2666             /* We can live with that if the default one is big enough
2667              * for the number of threads, and a submultiple of the one
2668              * we want.  In this case we'll waste some vcpu ids, but
2669              * behaviour will be correct */
2670             if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
2671                 warn_report_err(local_err);
2672             } else {
2673                 if (!vsmt_user) {
2674                     error_append_hint(&local_err,
2675                                       "On PPC, a VM with %d threads/core"
2676                                       " on a host with %d threads/core"
2677                                       " requires the use of VSMT mode %d.\n",
2678                                       smp_threads, kvm_smt, spapr->vsmt);
2679                 }
2680                 kvmppc_error_append_smt_possible_hint(&local_err);
2681                 error_propagate(errp, local_err);
2682             }
2683         }
2684     }
2685     /* else TCG: nothing to do currently */
2686 }
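/*
 * Illustrative outcome of the defaulting above: with 4 threads/core
 * and no explicit vsmt, machine types with smp_threads_vsmt clear
 * default to vsmt = MAX(8, 4) = 8 (the KVM/POWER8 value), while the
 * others use vsmt = 4; either way VCPU ids are spaced vsmt apart per
 * core.
 */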
2687 
2688 static void spapr_init_cpus(SpaprMachineState *spapr)
2689 {
2690     MachineState *machine = MACHINE(spapr);
2691     MachineClass *mc = MACHINE_GET_CLASS(machine);
2692     const char *type = spapr_get_cpu_core_type(machine->cpu_type);
2693     const CPUArchIdList *possible_cpus;
2694     unsigned int smp_cpus = machine->smp.cpus;
2695     unsigned int smp_threads = machine->smp.threads;
2696     unsigned int max_cpus = machine->smp.max_cpus;
2697     int boot_cores_nr = smp_cpus / smp_threads;
2698     int i;
2699 
2700     possible_cpus = mc->possible_cpu_arch_ids(machine);
2701     if (mc->has_hotpluggable_cpus) {
2702         if (smp_cpus % smp_threads) {
2703             error_report("smp_cpus (%u) must be multiple of threads (%u)",
2704                          smp_cpus, smp_threads);
2705             exit(1);
2706         }
2707         if (max_cpus % smp_threads) {
2708             error_report("max_cpus (%u) must be multiple of threads (%u)",
2709                          max_cpus, smp_threads);
2710             exit(1);
2711         }
2712     } else {
2713         if (max_cpus != smp_cpus) {
2714             error_report("This machine version does not support CPU hotplug");
2715             exit(1);
2716         }
2717         boot_cores_nr = possible_cpus->len;
2718     }
2719 
2720     for (i = 0; i < possible_cpus->len; i++) {
2721         int core_id = i * smp_threads;
2722 
2723         if (mc->has_hotpluggable_cpus) {
2724             spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
2725                                    spapr_vcpu_id(spapr, core_id));
2726         }
2727 
2728         if (i < boot_cores_nr) {
2729             Object *core  = object_new(type);
2730             int nr_threads = smp_threads;
2731 
2732             /* Handle the partially filled core for older machine types */
2733             if ((i + 1) * smp_threads >= smp_cpus) {
2734                 nr_threads = smp_cpus - i * smp_threads;
2735             }
2736 
2737             object_property_set_int(core, "nr-threads", nr_threads,
2738                                     &error_fatal);
2739             object_property_set_int(core, CPU_CORE_PROP_CORE_ID, core_id,
2740                                     &error_fatal);
2741             qdev_realize(DEVICE(core), NULL, &error_fatal);
2742 
2743             object_unref(core);
2744         }
2745     }
2746 }
2747 
2748 static PCIHostState *spapr_create_default_phb(void)
2749 {
2750     DeviceState *dev;
2751 
2752     dev = qdev_new(TYPE_SPAPR_PCI_HOST_BRIDGE);
2753     qdev_prop_set_uint32(dev, "index", 0);
2754     sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
2755 
2756     return PCI_HOST_BRIDGE(dev);
2757 }
2758 
2759 static hwaddr spapr_rma_size(SpaprMachineState *spapr, Error **errp)
2760 {
2761     MachineState *machine = MACHINE(spapr);
2762     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2763     hwaddr rma_size = machine->ram_size;
2764     hwaddr node0_size = spapr_node0_size(machine);
2765 
2766     /* RMA has to fit in the first NUMA node */
2767     rma_size = MIN(rma_size, node0_size);
2768 
2769     /*
2770      * VRMA access is via a special 1TiB SLB mapping, so the RMA can
2771      * never exceed that
2772      */
2773     rma_size = MIN(rma_size, 1 * TiB);
2774 
2775     /*
2776      * Clamp the RMA size based on machine type.  This is for
2777      * migration compatibility with older qemu versions, which limited
2778      * the RMA size for complicated and mostly bad reasons.
2779      */
2780     if (smc->rma_limit) {
2781         rma_size = MIN(rma_size, smc->rma_limit);
2782     }
2783 
2784     if (rma_size < MIN_RMA_SLOF) {
2785         error_setg(errp,
2786                    "pSeries SLOF firmware requires >= %" HWADDR_PRIx
2787                    "MiB guest RMA (Real Mode Area memory)",
2788                    MIN_RMA_SLOF / MiB);
2789         return 0;
2790     }
2791 
2792     return rma_size;
2793 }
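/*
 * Example (illustrative): a 16 GiB guest whose first NUMA node holds
 * 4 GiB ends up with a 4 GiB RMA; the 1 TiB VRMA limit and any
 * machine-type rma_limit can only clamp that value further down.
 */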
2794 
2795 static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
2796 {
2797     MachineState *machine = MACHINE(spapr);
2798     int i;
2799 
2800     for (i = 0; i < machine->ram_slots; i++) {
2801         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);
2802     }
2803 }
2804 
2805 /* pSeries LPAR / sPAPR hardware init */
2806 static void spapr_machine_init(MachineState *machine)
2807 {
2808     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
2809     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2810     MachineClass *mc = MACHINE_GET_CLASS(machine);
2811     const char *bios_default = spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
2812     const char *bios_name = machine->firmware ?: bios_default;
2813     g_autofree char *filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
2814     const char *kernel_filename = machine->kernel_filename;
2815     const char *initrd_filename = machine->initrd_filename;
2816     PCIHostState *phb;
2817     bool has_vga;
2818     int i;
2819     MemoryRegion *sysmem = get_system_memory();
2820     long load_limit, fw_size;
2821     Error *resize_hpt_err = NULL;
2822     NICInfo *nd;
2823 
2824     if (!filename) {
2825         error_report("Could not find LPAR firmware '%s'", bios_name);
2826         exit(1);
2827     }
2828     fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
2829     if (fw_size <= 0) {
2830         error_report("Could not load LPAR firmware '%s'", filename);
2831         exit(1);
2832     }
2833 
2834     /*
2835      * If Secure VM (PEF) support is configured, initialize it.
2836      */
2837     if (machine->cgs) {
2838         confidential_guest_kvm_init(machine->cgs, &error_fatal);
2839     }
2840 
2841     msi_nonbroken = true;
2842 
2843     QLIST_INIT(&spapr->phbs);
2844     QTAILQ_INIT(&spapr->pending_dimm_unplugs);
2845 
2846     /* Determine capabilities to run with */
2847     spapr_caps_init(spapr);
2848 
2849     kvmppc_check_papr_resize_hpt(&resize_hpt_err);
2850     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
2851         /*
2852          * If the user explicitly requested a mode we should either
2853          * supply it, or fail completely (which we do below).  But if
2854          * it's not set explicitly, we reset our mode to something
2855          * that works
2856          */
2857         if (resize_hpt_err) {
2858             spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
2859             error_free(resize_hpt_err);
2860             resize_hpt_err = NULL;
2861         } else {
2862             spapr->resize_hpt = smc->resize_hpt_default;
2863         }
2864     }
2865 
2866     assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);
2867 
2868     if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
2869         /*
2870          * User requested HPT resize, but this host can't supply it.  Bail out
2871          */
2872         error_report_err(resize_hpt_err);
2873         exit(1);
2874     }
2875     error_free(resize_hpt_err);
2876 
2877     spapr->rma_size = spapr_rma_size(spapr, &error_fatal);
2878 
2879     /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
2880     /* Set up a load limit for the ramdisk, leaving room for SLOF and the FDT */
2881 
2882     /*
2883      * VSMT must be set in order to be able to compute VCPU ids, ie to
2884      * call spapr_max_server_number() or spapr_vcpu_id().
2885      */
2886     spapr_set_vsmt_mode(spapr, &error_fatal);
2887 
2888     /* Set up Interrupt Controller before we create the VCPUs */
2889     spapr_irq_init(spapr, &error_fatal);
2890 
2891     /* Set up containers for ibm,client-architecture-support negotiated options
2892      */
2893     spapr->ov5 = spapr_ovec_new();
2894     spapr->ov5_cas = spapr_ovec_new();
2895 
2896     spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
2897     spapr_validate_node_memory(machine, &error_fatal);
2898 
2899     spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
2900 
2901     /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */
2902     if (!smc->pre_6_2_numa_affinity) {
2903         spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY);
2904     }
2905 
2906     /* advertise support for dedicated HP event source to guests */
2907     if (spapr->use_hotplug_event_source) {
2908         spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
2909     }
2910 
2911     /* advertise support for HPT resizing */
2912     if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
2913         spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
2914     }
2915 
2916     /* advertise support for ibm,dynamic-memory-v2 */
2917     spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
2918 
2919     /* advertise XIVE on POWER9 machines */
2920     if (spapr->irq->xive) {
2921         spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
2922     }
2923 
2924     qemu_guest_getrandom_nofail(&spapr->hashpkey_val,
2925                                 sizeof(spapr->hashpkey_val));
2926 
2927     /* init CPUs */
2928     spapr_init_cpus(spapr);
2929 
2930     /* Init numa_assoc_array */
2931     spapr_numa_associativity_init(spapr, machine);
2932 
2933     if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
2934         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
2935                               spapr->max_compat_pvr)) {
2936         spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
2937         /* KVM and TCG always allow GTSE with radix... */
2938         spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
2939     }
2940     /* ... but not with hash (currently). */
2941 
2942     if (kvm_enabled()) {
2943         /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
2944         kvmppc_enable_logical_ci_hcalls();
2945         kvmppc_enable_set_mode_hcall();
2946 
2947         /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
2948         kvmppc_enable_clear_ref_mod_hcalls();
2949 
2950         /* Enable H_PAGE_INIT */
2951         kvmppc_enable_h_page_init();
2952     }
2953 
2954     /* map RAM */
2955     memory_region_add_subregion(sysmem, 0, machine->ram);
2956 
2957     /* initialize hotplug memory address space */
2958     if (machine->ram_size < machine->maxram_size) {
2959         ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
2960         hwaddr device_mem_base;
2961 
2962         /*
2963          * Limit the number of hotpluggable memory slots to half the number of
2964          * slots that KVM supports, leaving the other half for PCI and other
2965          * devices. However, ensure that the number of slots doesn't drop below 32.
2966          */
2967         int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
2968                            SPAPR_MAX_RAM_SLOTS;
2969 
2970         if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
2971             max_memslots = SPAPR_MAX_RAM_SLOTS;
2972         }
2973         if (machine->ram_slots > max_memslots) {
2974             error_report("Specified number of memory slots %"
2975                          PRIu64" exceeds max supported %d",
2976                          machine->ram_slots, max_memslots);
2977             exit(1);
2978         }
2979 
2980         device_mem_base = ROUND_UP(machine->ram_size, SPAPR_DEVICE_MEM_ALIGN);
2981         machine_memory_devices_init(machine, device_mem_base, device_mem_size);
2982     }
2983 
2984     spapr_create_lmb_dr_connectors(spapr);
2985 
2986     if (mc->nvdimm_supported) {
2987         spapr_create_nvdimm_dr_connectors(spapr);
2988     }
2989 
2990     /* Set up RTAS event infrastructure */
2991     spapr_events_init(spapr);
2992 
2993     /* Set up the RTC RTAS interfaces */
2994     spapr_rtc_create(spapr);
2995 
2996     /* Set up VIO bus */
2997     spapr->vio_bus = spapr_vio_bus_init();
2998 
2999     for (i = 0; serial_hd(i); i++) {
3000         spapr_vty_create(spapr->vio_bus, serial_hd(i));
3001     }
3002 
3003     /* We always have at least the nvram device on VIO */
3004     spapr_create_nvram(spapr);
3005 
3006     /*
3007      * Set up hotplug / dynamic-reconfiguration connectors. Top-level
3008      * connectors (described in the root DT node's "ibm,drc-types" property)
3009      * are pre-initialized here. Additional child connectors (such as
3010      * connectors for a PHB's PCI slots) are added as needed during their
3011      * parent's realization.
3012      */
3013     if (smc->dr_phb_enabled) {
3014         for (i = 0; i < SPAPR_MAX_PHBS; i++) {
3015             spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
3016         }
3017     }
3018 
3019     /* Set up PCI */
3020     spapr_pci_rtas_init();
3021 
3022     phb = spapr_create_default_phb();
3023 
3024     while ((nd = qemu_find_nic_info("spapr-vlan", true, "ibmveth"))) {
3025         spapr_vlan_create(spapr->vio_bus, nd);
3026     }
3027 
3028     pci_init_nic_devices(phb->bus, NULL);
3029 
3030     for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
3031         spapr_vscsi_create(spapr->vio_bus);
3032     }
3033 
3034     /* Graphics */
3035     has_vga = spapr_vga_init(phb->bus, &error_fatal);
3036     if (has_vga) {
3037         spapr->want_stdout_path = !machine->enable_graphics;
3038         machine->usb |= defaults_enabled() && !machine->usb_disabled;
3039     } else {
3040         spapr->want_stdout_path = true;
3041     }
3042 
3043     if (machine->usb) {
3044         pci_create_simple(phb->bus, -1, "nec-usb-xhci");
3045 
3046         if (has_vga) {
3047             USBBus *usb_bus;
3048 
3049             usb_bus = USB_BUS(object_resolve_type_unambiguous(TYPE_USB_BUS,
3050                                                               &error_abort));
3051             usb_create_simple(usb_bus, "usb-kbd");
3052             usb_create_simple(usb_bus, "usb-mouse");
3053         }
3054     }
3055 
3056     if (kernel_filename) {
3057         uint64_t loaded_addr = 0;
3058 
3059         spapr->kernel_size = load_elf(kernel_filename, NULL,
3060                                       translate_kernel_address, spapr,
3061                                       NULL, &loaded_addr, NULL, NULL,
3062                                       ELFDATA2MSB, PPC_ELF_MACHINE, 0, 0);
3063         if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
3064             spapr->kernel_size = load_elf(kernel_filename, NULL,
3065                                           translate_kernel_address, spapr,
3066                                           NULL, &loaded_addr, NULL, NULL,
3067                                           ELFDATA2LSB, PPC_ELF_MACHINE, 0, 0);
3068             spapr->kernel_le = spapr->kernel_size > 0;
3069         }
3070         if (spapr->kernel_size < 0) {
3071             error_report("error loading %s: %s", kernel_filename,
3072                          load_elf_strerror(spapr->kernel_size));
3073             exit(1);
3074         }
3075 
3076         if (spapr->kernel_addr != loaded_addr) {
3077             warn_report("spapr: kernel_addr changed from 0x%"PRIx64
3078                         " to 0x%"PRIx64,
3079                         spapr->kernel_addr, loaded_addr);
3080             spapr->kernel_addr = loaded_addr;
3081         }
3082 
3083         /* load initrd */
3084         if (initrd_filename) {
3085             /* Try to locate the initrd in the gap between the kernel
3086              * and the firmware. Add a bit of space just in case.
3087              */
3088             spapr->initrd_base = (spapr->kernel_addr + spapr->kernel_size
3089                                   + 0x1ffff) & ~0xffff;
3090             spapr->initrd_size = load_image_targphys(initrd_filename,
3091                                                      spapr->initrd_base,
3092                                                      load_limit
3093                                                      - spapr->initrd_base);
3094             if (spapr->initrd_size < 0) {
3095                 error_report("could not load initial ram disk '%s'",
3096                              initrd_filename);
3097                 exit(1);
3098             }
3099         }
3100     }
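
    /*
     * Worked example for the initrd placement above: for a 64 KiB-aligned
     * kernel end of 0x2300000, (0x2300000 + 0x1ffff) & ~0xffff gives an
     * initrd_base of 0x2310000; for a kernel end of 0x2300001 it gives
     * 0x2320000. The extra 0x10000 in the constant guarantees at least
     * 64 KiB of slack between the kernel image and the initrd.
     */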
3101 
3102     /* FIXME: Should register things through the MachineState's qdev
3103      * interface; this is a legacy from the sPAPREnvironment structure
3104      * which predated MachineState but had a similar function */
3105     vmstate_register(NULL, 0, &vmstate_spapr, spapr);
3106     register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
3107                          &savevm_htab_handlers, spapr);
3108 
3109     qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine));
3110 
3111     qemu_register_boot_set(spapr_boot_set, spapr);
3112 
3113     /*
3114      * Nothing needs to be done to resume a suspended guest because
3115      * suspending does not change the machine state, so no need for
3116      * a ->wakeup method.
3117      */
3118     qemu_register_wakeup_support();
3119 
3120     if (kvm_enabled()) {
3121         /* to stop and start vmclock */
3122         qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
3123                                          &spapr->tb);
3124 
3125         kvmppc_spapr_enable_inkernel_multitce();
3126     }
3127 
3128     qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
3129     if (spapr->vof) {
3130         spapr->vof->fw_size = fw_size; /* for claim() on itself */
3131         spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
3132     }
3133 
3134     spapr_watchdog_init(spapr);
3135 }
3136 
3137 #define DEFAULT_KVM_TYPE "auto"
3138 static int spapr_kvm_type(MachineState *machine, const char *vm_type)
3139 {
3140     /*
3141      * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to
3142      * accommodate the 'HV' and 'PR' formats that exist in the
3143      * wild. The 'auto' mode is being introduced already as
3144      * lower-case, thus we don't need to bother checking for
3145      * "AUTO".
3146      */
3147     if (!vm_type || !strcmp(vm_type, DEFAULT_KVM_TYPE)) {
3148         return 0;
3149     }
3150 
3151     if (!g_ascii_strcasecmp(vm_type, "hv")) {
3152         return 1;
3153     }
3154 
3155     if (!g_ascii_strcasecmp(vm_type, "pr")) {
3156         return 2;
3157     }
3158 
3159     error_report("Unknown kvm-type specified '%s'", vm_type);
3160     return -1;
3161 }
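
/*
 * Illustrative use of the kvm-type handling above (the accepted values
 * are exactly "auto", "hv" and "pr", case-insensitively for the last
 * two; the rest of the command line is assumed):
 *
 *     qemu-system-ppc64 -machine pseries,accel=kvm,kvm-type=hv ...
 */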
3162 
3163 /*
3164  * Implementation of an interface to adjust firmware path
3165  * for the bootindex property handling.
3166  */
3167 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
3168                                    DeviceState *dev)
3169 {
3170 #define CAST(type, obj, name) \
3171     ((type *)object_dynamic_cast(OBJECT(obj), (name)))
3172     SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
3173     SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
3174     VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
3175     PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3176 
3177     if (d && bus) {
3178         void *spapr = CAST(void, bus->parent, "spapr-vscsi");
3179         VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
3180         USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
3181 
3182         if (spapr) {
3183             /*
3184              * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
3185              * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
3186              * 0x8000 | (target << 8) | (bus << 5) | lun
3187              * (see the "Logical unit addressing format" table in SAM5)
3188              */
3189             unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
3190             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3191                                    (uint64_t)id << 48);
3192         } else if (virtio) {
3193             /*
3194              * We use SRP luns of the form 01000000 | (target << 8) | lun
3195              * in the top 32 bits of the 64-bit LUN
3196              * Note: the quote above is from SLOF and it is wrong,
3197              * the actual binding is:
3198              * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
3199              */
3200             unsigned id = 0x1000000 | (d->id << 16) | d->lun;
3201             if (d->lun >= 256) {
3202                 /* Use the LUN "flat space addressing method" */
3203                 id |= 0x4000;
3204             }
3205             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3206                                    (uint64_t)id << 32);
3207         } else if (usb) {
3208             /*
3209              * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
3210              * in the top 32 bits of the 64-bit LUN
3211              */
3212             unsigned usb_port = atoi(usb->port->path);
3213             unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
3214             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3215                                    (uint64_t)id << 32);
3216         }
3217     }
3218 
3219     /*
3220      * SLOF probes the USB devices, and if it recognizes that the device is a
3221      * storage device, it changes its name to "storage" instead of "usb-host",
3222      * and additionally adds a child node for the SCSI LUN, so the correct
3223      * boot path in SLOF is something like ".../storage@1/disk@xxx" instead.
3224      */
3225     if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
3226         USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
3227         if (usb_device_is_scsi_storage(usbdev)) {
3228             return g_strdup_printf("storage@%s/disk", usbdev->port->path);
3229         }
3230     }
3231 
3232     if (phb) {
3233         /* Replace "pci" with "pci@800000020000000" */
3234         return g_strdup_printf("pci@%"PRIX64, phb->buid);
3235     }
3236 
3237     if (vsc) {
3238         /* Same logic as virtio above */
3239         unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
3240         return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
3241     }
3242 
3243     if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
3244         /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
3245         PCIDevice *pdev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3246         return g_strdup_printf("pci@%x", PCI_SLOT(pdev->devfn));
3247     }
3248 
3249     if (pcidev) {
3250         return spapr_pci_fw_dev_name(pcidev);
3251     }
3252 
3253     return NULL;
3254 }
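
/*
 * Worked example for the spapr-vscsi encoding above: a disk with
 * target (id) 1, channel (bus) 0 and lun 0 yields
 * id = 0x8000 | (1 << 8) | (0 << 5) | 0 = 0x8100, so the firmware
 * path component becomes "disk@8100000000000000" ((uint64_t)0x8100 << 48).
 */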
3255 
3256 static char *spapr_get_kvm_type(Object *obj, Error **errp)
3257 {
3258     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3259 
3260     return g_strdup(spapr->kvm_type);
3261 }
3262 
3263 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
3264 {
3265     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3266 
3267     g_free(spapr->kvm_type);
3268     spapr->kvm_type = g_strdup(value);
3269 }
3270 
3271 static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
3272 {
3273     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3274 
3275     return spapr->use_hotplug_event_source;
3276 }
3277 
3278 static void spapr_set_modern_hotplug_events(Object *obj, bool value,
3279                                             Error **errp)
3280 {
3281     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3282 
3283     spapr->use_hotplug_event_source = value;
3284 }
3285 
3286 static bool spapr_get_msix_emulation(Object *obj, Error **errp)
3287 {
3288     return true;
3289 }
3290 
3291 static char *spapr_get_resize_hpt(Object *obj, Error **errp)
3292 {
3293     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3294 
3295     switch (spapr->resize_hpt) {
3296     case SPAPR_RESIZE_HPT_DEFAULT:
3297         return g_strdup("default");
3298     case SPAPR_RESIZE_HPT_DISABLED:
3299         return g_strdup("disabled");
3300     case SPAPR_RESIZE_HPT_ENABLED:
3301         return g_strdup("enabled");
3302     case SPAPR_RESIZE_HPT_REQUIRED:
3303         return g_strdup("required");
3304     }
3305     g_assert_not_reached();
3306 }
3307 
3308 static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
3309 {
3310     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3311 
3312     if (strcmp(value, "default") == 0) {
3313         spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
3314     } else if (strcmp(value, "disabled") == 0) {
3315         spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
3316     } else if (strcmp(value, "enabled") == 0) {
3317         spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
3318     } else if (strcmp(value, "required") == 0) {
3319         spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
3320     } else {
3321         error_setg(errp, "Bad value for \"resize-hpt\" property");
3322     }
3323 }
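
/*
 * Illustrative use of the resize-hpt property parsed above (the four
 * accepted strings are "default", "disabled", "enabled" and "required";
 * the rest of the command line is assumed):
 *
 *     qemu-system-ppc64 -machine pseries,resize-hpt=required ...
 */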
3324 
3325 static bool spapr_get_vof(Object *obj, Error **errp)
3326 {
3327     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3328 
3329     return spapr->vof != NULL;
3330 }
3331 
3332 static void spapr_set_vof(Object *obj, bool value, Error **errp)
3333 {
3334     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3335 
3336     if (spapr->vof) {
3337         vof_cleanup(spapr->vof);
3338         g_free(spapr->vof);
3339         spapr->vof = NULL;
3340     }
3341     if (!value) {
3342         return;
3343     }
3344     spapr->vof = g_malloc0(sizeof(*spapr->vof));
3345 }
3346 
3347 static char *spapr_get_ic_mode(Object *obj, Error **errp)
3348 {
3349     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3350 
3351     if (spapr->irq == &spapr_irq_xics_legacy) {
3352         return g_strdup("legacy");
3353     } else if (spapr->irq == &spapr_irq_xics) {
3354         return g_strdup("xics");
3355     } else if (spapr->irq == &spapr_irq_xive) {
3356         return g_strdup("xive");
3357     } else if (spapr->irq == &spapr_irq_dual) {
3358         return g_strdup("dual");
3359     }
3360     g_assert_not_reached();
3361 }
3362 
3363 static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
3364 {
3365     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3366 
3367     if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
3368         error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
3369         return;
3370     }
3371 
3372     /* The legacy IRQ backend cannot be set */
3373     if (strcmp(value, "xics") == 0) {
3374         spapr->irq = &spapr_irq_xics;
3375     } else if (strcmp(value, "xive") == 0) {
3376         spapr->irq = &spapr_irq_xive;
3377     } else if (strcmp(value, "dual") == 0) {
3378         spapr->irq = &spapr_irq_dual;
3379     } else {
3380         error_setg(errp, "Bad value for \"ic-mode\" property");
3381     }
3382 }
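
/*
 * Illustrative use of the ic-mode property parsed above ("legacy" is
 * report-only and cannot be requested here):
 *
 *     qemu-system-ppc64 -machine pseries,ic-mode=xive ...
 */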
3383 
3384 static char *spapr_get_host_model(Object *obj, Error **errp)
3385 {
3386     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3387 
3388     return g_strdup(spapr->host_model);
3389 }
3390 
3391 static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
3392 {
3393     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3394 
3395     g_free(spapr->host_model);
3396     spapr->host_model = g_strdup(value);
3397 }
3398 
3399 static char *spapr_get_host_serial(Object *obj, Error **errp)
3400 {
3401     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3402 
3403     return g_strdup(spapr->host_serial);
3404 }
3405 
3406 static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
3407 {
3408     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3409 
3410     g_free(spapr->host_serial);
3411     spapr->host_serial = g_strdup(value);
3412 }
3413 
3414 static void spapr_instance_init(Object *obj)
3415 {
3416     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3417     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3418     MachineState *ms = MACHINE(spapr);
3419     MachineClass *mc = MACHINE_GET_CLASS(ms);
3420 
3421     /*
3422      * NVDIMM support went live in 5.1 without considering that, in
3423      * other archs, the user needs to enable NVDIMM support with the
3424      * 'nvdimm' machine option and the default behavior is NVDIMM
3425      * support disabled. It is too late to roll back to the standard
3426      * behavior without breaking 5.1 guests.
3427      */
3428     if (mc->nvdimm_supported) {
3429         ms->nvdimms_state->is_enabled = true;
3430     }
3431 
3432     spapr->htab_fd = -1;
3433     spapr->use_hotplug_event_source = true;
3434     spapr->kvm_type = g_strdup(DEFAULT_KVM_TYPE);
3435     object_property_add_str(obj, "kvm-type",
3436                             spapr_get_kvm_type, spapr_set_kvm_type);
3437     object_property_set_description(obj, "kvm-type",
3438                                     "Specifies the KVM virtualization mode (auto,"
3439                                     " hv, pr). Defaults to 'auto'. This mode will use"
3440                                     " any available KVM module loaded in the host,"
3441                                     " where kvm_hv takes precedence if both kvm_hv and"
3442                                     " kvm_pr are loaded.");
3443     object_property_add_bool(obj, "modern-hotplug-events",
3444                             spapr_get_modern_hotplug_events,
3445                             spapr_set_modern_hotplug_events);
3446     object_property_set_description(obj, "modern-hotplug-events",
3447                                     "Use dedicated hotplug event mechanism in"
3448                                     " place of standard EPOW events when possible"
3449                                     " (required for memory hot-unplug support)");
3450     ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
3451                             "Maximum permitted CPU compatibility mode");
3452 
3453     object_property_add_str(obj, "resize-hpt",
3454                             spapr_get_resize_hpt, spapr_set_resize_hpt);
3455     object_property_set_description(obj, "resize-hpt",
3456                                     "Resizing of the Hash Page Table (enabled, disabled, required)");
3457     object_property_add_uint32_ptr(obj, "vsmt",
3458                                    &spapr->vsmt, OBJ_PROP_FLAG_READWRITE);
3459     object_property_set_description(obj, "vsmt",
3460                                     "Virtual SMT: KVM behaves as if this were"
3461                                     " the host's SMT mode");
3462 
3463     object_property_add_bool(obj, "vfio-no-msix-emulation",
3464                              spapr_get_msix_emulation, NULL);
3465 
3466     object_property_add_uint64_ptr(obj, "kernel-addr",
3467                                    &spapr->kernel_addr, OBJ_PROP_FLAG_READWRITE);
3468     object_property_set_description(obj, "kernel-addr",
3469                                     stringify(KERNEL_LOAD_ADDR)
3470                                     " for -kernel is the default");
3471     spapr->kernel_addr = KERNEL_LOAD_ADDR;
3472 
3473     object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
3474     object_property_set_description(obj, "x-vof",
3475                                     "Enable Virtual Open Firmware (experimental)");
3476 
3477     /* The machine class defines the default interrupt controller mode */
3478     spapr->irq = smc->irq;
3479     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
3480                             spapr_set_ic_mode);
3481     object_property_set_description(obj, "ic-mode",
3482                  "Specifies the interrupt controller mode (xics, xive, dual)");
3483 
3484     object_property_add_str(obj, "host-model",
3485         spapr_get_host_model, spapr_set_host_model);
3486     object_property_set_description(obj, "host-model",
3487         "Host model to advertise in guest device tree");
3488     object_property_add_str(obj, "host-serial",
3489         spapr_get_host_serial, spapr_set_host_serial);
3490     object_property_set_description(obj, "host-serial",
3491         "Host serial number to advertise in guest device tree");
3492 }
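
/*
 * Illustrative command line exercising some of the machine properties
 * registered above (the property values are made-up examples):
 *
 *     qemu-system-ppc64 -machine pseries,vsmt=8,modern-hotplug-events=off ...
 */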
3493 
3494 static void spapr_machine_finalizefn(Object *obj)
3495 {
3496     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3497 
3498     g_free(spapr->kvm_type);
3499 }
3500 
3501 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
3502 {
3503     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
3504     CPUPPCState *env = cpu_env(cs);
3505 
3506     cpu_synchronize_state(cs);
3507     /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */
3508     if (spapr->fwnmi_system_reset_addr != -1) {
3509         uint64_t rtas_addr, addr;
3510 
3511         /* get rtas addr from fdt */
3512         rtas_addr = spapr_get_rtas_addr();
3513         if (!rtas_addr) {
3514             qemu_system_guest_panicked(NULL);
3515             return;
3516         }
3517 
3518         addr = rtas_addr + RTAS_ERROR_LOG_MAX + cs->cpu_index * sizeof(uint64_t)*2;
3519         stq_be_phys(&address_space_memory, addr, env->gpr[3]);
3520         stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0);
3521         env->gpr[3] = addr;
3522     }
3523     ppc_cpu_do_system_reset(cs);
3524     if (spapr->fwnmi_system_reset_addr != -1) {
3525         env->nip = spapr->fwnmi_system_reset_addr;
3526     }
3527 }
3528 
3529 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
3530 {
3531     CPUState *cs;
3532 
3533     CPU_FOREACH(cs) {
3534         async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
3535     }
3536 }
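
/*
 * The path above backs the monitor "nmi" command, e.g.:
 *
 *     (qemu) nmi
 *
 * Every vCPU gets a system reset; when the guest has enabled FWNMI, the
 * reset vectors to spapr->fwnmi_system_reset_addr instead of 0x100.
 */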
3537 
3538 int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3539                           void *fdt, int *fdt_start_offset, Error **errp)
3540 {
3541     uint64_t addr;
3542     uint32_t node;
3543 
3544     addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
3545     node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
3546                                     &error_abort);
3547     *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr,
3548                                              SPAPR_MEMORY_BLOCK_SIZE);
3549     return 0;
3550 }
3551 
3552 static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
3553                            bool dedicated_hp_event_source)
3554 {
3555     SpaprDrc *drc;
3556     uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
3557     int i;
3558     uint64_t addr = addr_start;
3559     bool hotplugged = spapr_drc_hotplugged(dev);
3560 
3561     for (i = 0; i < nr_lmbs; i++) {
3562         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3563                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3564         g_assert(drc);
3565 
3566         /*
3567          * memory_device_get_free_addr() provided a range of free addresses
3568          * that doesn't overlap with any existing mapping at pre-plug. The
3569          * corresponding LMB DRCs are thus assumed to be all attachable.
3570          */
3571         spapr_drc_attach(drc, dev);
3572         if (!hotplugged) {
3573             spapr_drc_reset(drc);
3574         }
3575         addr += SPAPR_MEMORY_BLOCK_SIZE;
3576     }
3577     /* Send a hotplug notification to the
3578      * guest only when the memory was hotplugged.
3579      */
3580     if (hotplugged) {
3581         if (dedicated_hp_event_source) {
3582             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3583                                   addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3584             g_assert(drc);
3585             spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3586                                                    nr_lmbs,
3587                                                    spapr_drc_index(drc));
3588         } else {
3589             spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
3590                                            nr_lmbs);
3591         }
3592     }
3593 }
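
/*
 * Worked example for spapr_add_lmbs(): assuming the usual 256 MiB
 * SPAPR_MEMORY_BLOCK_SIZE, a 1 GiB DIMM at addr_start 0x100000000 is
 * split into nr_lmbs = 4 LMBs, whose DRCs are looked up by the indexes
 * 0x100000000 / 0x10000000 = 16, then 17, 18 and 19.
 */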
3594 
3595 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3596 {
3597     SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
3598     PCDIMMDevice *dimm = PC_DIMM(dev);
3599     uint64_t size, addr;
3600     int64_t slot;
3601     bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3602 
3603     size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
3604 
3605     pc_dimm_plug(dimm, MACHINE(ms));
3606 
3607     if (!is_nvdimm) {
3608         addr = object_property_get_uint(OBJECT(dimm),
3609                                         PC_DIMM_ADDR_PROP, &error_abort);
3610         spapr_add_lmbs(dev, addr, size,
3611                        spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT));
3612     } else {
3613         slot = object_property_get_int(OBJECT(dimm),
3614                                        PC_DIMM_SLOT_PROP, &error_abort);
3615         /* We should have a valid slot number at this point */
3616         g_assert(slot >= 0);
3617         spapr_add_nvdimm(dev, slot);
3618     }
3619 }
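
/*
 * The plug path above is what a monitor-driven memory hotplug ends up
 * calling. An illustrative HMP sequence (the ids are made up):
 *
 *     (qemu) object_add memory-backend-ram,id=mem1,size=1G
 *     (qemu) device_add pc-dimm,id=dimm1,memdev=mem1
 */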
3620 
3621 static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3622                                   Error **errp)
3623 {
3624     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3625     bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3626     PCDIMMDevice *dimm = PC_DIMM(dev);
3627     Error *local_err = NULL;
3628     uint64_t size;
3629     Object *memdev;
3630     hwaddr pagesize;
3631 
3632     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
3633     if (local_err) {
3634         error_propagate(errp, local_err);
3635         return;
3636     }
3637 
3638     if (is_nvdimm) {
3639         if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) {
3640             return;
3641         }
3642     } else if (size % SPAPR_MEMORY_BLOCK_SIZE) {
3643         error_setg(errp, "Hotplugged memory size must be a multiple of "
3644                    "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
3645         return;
3646     }
3647 
3648     memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
3649                                       &error_abort);
3650     pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
3651     if (!spapr_check_pagesize(spapr, pagesize, errp)) {
3652         return;
3653     }
3654 
3655     pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), errp);
3656 }
3657 
3658 struct SpaprDimmState {
3659     PCDIMMDevice *dimm;
3660     uint32_t nr_lmbs;
3661     QTAILQ_ENTRY(SpaprDimmState) next;
3662 };
3663 
3664 static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
3665                                                        PCDIMMDevice *dimm)
3666 {
3667     SpaprDimmState *dimm_state = NULL;
3668 
3669     QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
3670         if (dimm_state->dimm == dimm) {
3671             break;
3672         }
3673     }
3674     return dimm_state;
3675 }
3676 
3677 static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
3678                                                       uint32_t nr_lmbs,
3679                                                       PCDIMMDevice *dimm)
3680 {
3681     SpaprDimmState *ds = NULL;
3682 
3683     /*
3684      * If this request is for a DIMM whose removal had failed earlier
3685      * (due to the guest's refusal to remove the LMBs), we would have this
3686      * DIMM already in the pending_dimm_unplugs list. In that
3687      * case, don't add it again.
3688      */
3689     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3690     if (!ds) {
3691         ds = g_new0(SpaprDimmState, 1);
3692         ds->nr_lmbs = nr_lmbs;
3693         ds->dimm = dimm;
3694         QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
3695     }
3696     return ds;
3697 }
3698 
3699 static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
3700                                               SpaprDimmState *dimm_state)
3701 {
3702     QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
3703     g_free(dimm_state);
3704 }
3705 
3706 static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
3707                                                         PCDIMMDevice *dimm)
3708 {
3709     SpaprDrc *drc;
3710     uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
3711                                                   &error_abort);
3712     uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3713     uint32_t avail_lmbs = 0;
3714     uint64_t addr_start, addr;
3715     int i;
3716 
3717     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3718                                           &error_abort);
3719 
3720     addr = addr_start;
3721     for (i = 0; i < nr_lmbs; i++) {
3722         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3723                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3724         g_assert(drc);
3725         if (drc->dev) {
3726             avail_lmbs++;
3727         }
3728         addr += SPAPR_MEMORY_BLOCK_SIZE;
3729     }
3730 
3731     return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
3732 }
3733 
3734 void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev)
3735 {
3736     SpaprDimmState *ds;
3737     PCDIMMDevice *dimm;
3738     SpaprDrc *drc;
3739     uint32_t nr_lmbs;
3740     uint64_t size, addr_start, addr;
3741     int i;
3742 
3743     if (!dev) {
3744         return;
3745     }
3746 
3747     dimm = PC_DIMM(dev);
3748     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3749 
3750     /*
3751      * 'ds == NULL' would mean that the DIMM doesn't have a pending
3752      * unplug state, but one of its DRCs is marked as unplug_requested.
3753      * This is bad and weird enough to g_assert() out.
3754      */
3755     g_assert(ds);
3756 
3757     spapr_pending_dimm_unplugs_remove(spapr, ds);
3758 
3759     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3760     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3761 
3762     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3763                                           &error_abort);
3764 
3765     addr = addr_start;
3766     for (i = 0; i < nr_lmbs; i++) {
3767         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3768                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3769         g_assert(drc);
3770 
3771         drc->unplug_requested = false;
3772         addr += SPAPR_MEMORY_BLOCK_SIZE;
3773     }
3774 
3775     /*
3776      * Tell QAPI that something happened and the memory
3777      * hotunplug wasn't successful.
3778      */
3779     qapi_event_send_device_unplug_guest_error(dev->id,
3780                                               dev->canonical_path);
3781 }
3782 
3783 /* Callback to be called during DRC release. */
3784 void spapr_lmb_release(DeviceState *dev)
3785 {
3786     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3787     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
3788     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3789 
3790     /* This information will get lost if a migration occurs
3791      * during the unplug process. In that case, recover it. */
3792     if (ds == NULL) {
3793         ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
3794         g_assert(ds);
3795         /* The DRC being examined by the caller at least must be counted */
3796         g_assert(ds->nr_lmbs);
3797     }
3798 
3799     if (--ds->nr_lmbs) {
3800         return;
3801     }
3802 
3803     /*
3804      * Now that all the LMBs have been removed by the guest, call the
3805      * unplug handler chain. This can never fail.
3806      */
3807     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3808     object_unparent(OBJECT(dev));
3809 }
3810 
3811 static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3812 {
3813     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3814     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3815 
3816     /* We really shouldn't get this far without anything to unplug */
3817     g_assert(ds);
3818 
3819     pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
3820     qdev_unrealize(dev);
3821     spapr_pending_dimm_unplugs_remove(spapr, ds);
3822 }
3823 
3824 static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
3825                                         DeviceState *dev, Error **errp)
3826 {
3827     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3828     PCDIMMDevice *dimm = PC_DIMM(dev);
3829     uint32_t nr_lmbs;
3830     uint64_t size, addr_start, addr;
3831     int i;
3832     SpaprDrc *drc;
3833 
3834     if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
3835         error_setg(errp, "nvdimm device hot unplug is not supported yet.");
3836         return;
3837     }
3838 
3839     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3840     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3841 
3842     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3843                                           &error_abort);
3844 
3845     /*
3846      * An existing pending dimm state for this DIMM means that there is an
3847      * unplug operation in progress, waiting for the spapr_lmb_release
3848      * callback to complete the job (BQL can't cover that far). In this case,
3849      * bail out to avoid detaching DRCs that were already released.
3850      */
3851     if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3852         error_setg(errp, "Memory unplug already in progress for device %s",
3853                    dev->id);
3854         return;
3855     }
3856 
3857     spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3858 
3859     addr = addr_start;
3860     for (i = 0; i < nr_lmbs; i++) {
3861         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3862                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3863         g_assert(drc);
3864 
3865         spapr_drc_unplug_request(drc);
3866         addr += SPAPR_MEMORY_BLOCK_SIZE;
3867     }
3868 
3869     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3870                           addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3871     spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3872                                               nr_lmbs, spapr_drc_index(drc));
3873 }
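
/*
 * The unplug-request path above is what a monitor-driven removal ends
 * up calling, e.g. (reusing the made-up id from the plug example):
 *
 *     (qemu) device_del dimm1
 *
 * The guest then releases the LMBs one by one; spapr_lmb_release()
 * finishes the job, or spapr_memory_unplug_rollback() undoes it.
 */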
3874 
3875 /* Callback to be called during DRC release. */
3876 void spapr_core_release(DeviceState *dev)
3877 {
3878     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3879 
3880     /* Call the unplug handler chain. This can never fail. */
3881     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3882     object_unparent(OBJECT(dev));
3883 }
3884 
3885 static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3886 {
3887     MachineState *ms = MACHINE(hotplug_dev);
3888     CPUCore *cc = CPU_CORE(dev);
3889     CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3890 
3891     assert(core_slot);
3892     core_slot->cpu = NULL;
3893     qdev_unrealize(dev);
3894 }
3895 
3896 static
3897 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3898                                Error **errp)
3899 {
3900     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3901     int index;
3902     SpaprDrc *drc;
3903     CPUCore *cc = CPU_CORE(dev);
3904 
3905     if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3906         error_setg(errp, "Unable to find CPU core with core-id: %d",
3907                    cc->core_id);
3908         return;
3909     }
3910     if (index == 0) {
3911         error_setg(errp, "Boot CPU core may not be unplugged");
3912         return;
3913     }
3914 
3915     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3916                           spapr_vcpu_id(spapr, cc->core_id));
3917     g_assert(drc);
3918 
3919     if (!spapr_drc_unplug_requested(drc)) {
3920         spapr_drc_unplug_request(drc);
3921     }
3922 
3923     /*
3924      * spapr_hotplug_req_remove_by_index is left unguarded, out of the
3925      * "!spapr_drc_unplug_requested" check, to allow for multiple IRQ
3926      * pulses removing the same CPU. Otherwise, after a failed hotunplug
3927      * attempt (e.g. when the kernel refuses to remove the last online
3928      * CPU), we will never attempt it again because unplug_requested
3929      * will still be 'true' in that case.
3930      */
3931     spapr_hotplug_req_remove_by_index(drc);
3932 }
3933 
3934 int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3935                            void *fdt, int *fdt_start_offset, Error **errp)
3936 {
3937     SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3938     CPUState *cs = CPU(core->threads[0]);
3939     PowerPCCPU *cpu = POWERPC_CPU(cs);
3940     DeviceClass *dc = DEVICE_GET_CLASS(cs);
3941     int id = spapr_get_vcpu_id(cpu);
3942     g_autofree char *nodename = NULL;
3943     int offset;
3944 
3945     nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3946     offset = fdt_add_subnode(fdt, 0, nodename);
3947 
3948     spapr_dt_cpu(cs, fdt, offset, spapr);
3949 
3950     /*
3951      * spapr_dt_cpu() does not fill the 'name' property in the
3952      * CPU node. The function is called during the boot process, before
3953      * and after CAS, and overwriting the 'name' property written
3954      * by SLOF is not allowed.
3955      *
3956      * Write it manually after spapr_dt_cpu(). This makes the hotplugged
3957      * CPUs more compatible with the coldplugged ones, which have
3958      * the 'name' property. The Linux kernel also relies on this
3959      * property to identify CPU nodes.
3960      */
3961     _FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
3962 
3963     *fdt_start_offset = offset;
3964     return 0;
3965 }
3966 
3967 static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3968 {
3969     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3970     MachineClass *mc = MACHINE_GET_CLASS(spapr);
3971     SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
3972     CPUCore *cc = CPU_CORE(dev);
3973     SpaprDrc *drc;
3974     CPUArchId *core_slot;
3975     int index;
3976     bool hotplugged = spapr_drc_hotplugged(dev);
3977     int i;
3978 
3979     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3980     g_assert(core_slot); /* Already checked in spapr_core_pre_plug() */
3981 
3982     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3983                           spapr_vcpu_id(spapr, cc->core_id));
3984 
3985     g_assert(drc || !mc->has_hotpluggable_cpus);
3986 
3987     if (drc) {
3988         /*
3989          * spapr_core_pre_plug() already guarantees that this is a brand new
3990          * core being plugged into a free slot. Nothing should already
3991          * be attached to the corresponding DRC.
3992          */
3993         spapr_drc_attach(drc, dev);
3994 
3995         if (hotplugged) {
3996             /*
3997              * Send hotplug notification interrupt to the guest only
3998              * in case of hotplugged CPUs.
3999              */
4000             spapr_hotplug_req_add_by_index(drc);
4001         } else {
4002             spapr_drc_reset(drc);
4003         }
4004     }
4005 
4006     core_slot->cpu = CPU(dev);
4007 
4008     /*
4009      * Set compatibility mode to match the boot CPU, which was either set
4010      * by the machine reset code or by CAS. This really shouldn't fail at
4011      * this point.
4012      */
4013     if (hotplugged) {
4014         for (i = 0; i < cc->nr_threads; i++) {
4015             ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr,
4016                            &error_abort);
4017         }
4018     }
4019 
4020 }
4021 
4022 static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4023                                 Error **errp)
4024 {
4025     MachineState *machine = MACHINE(OBJECT(hotplug_dev));
4026     MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
4027     CPUCore *cc = CPU_CORE(dev);
4028     const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
4029     const char *type = object_get_typename(OBJECT(dev));
4030     CPUArchId *core_slot;
4031     int index;
4032     unsigned int smp_threads = machine->smp.threads;
4033 
4034     if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
4035         error_setg(errp, "CPU hotplug not supported for this machine");
4036         return;
4037     }
4038 
4039     if (strcmp(base_core_type, type)) {
4040         error_setg(errp, "CPU core type should be %s", base_core_type);
4041         return;
4042     }
4043 
4044     if (cc->core_id % smp_threads) {
4045         error_setg(errp, "invalid core id %d", cc->core_id);
4046         return;
4047     }
4048 
4049     /*
4050      * In general we should have homogeneous threads-per-core, but old
4051      * (pre-hotplug-support) machine types allow the last core to have
4052      * reduced threads as a compatibility hack for when we allowed a
4053      * total vcpu count that is not a multiple of threads-per-core.
4054      */
4055     if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
4056         error_setg(errp, "invalid nr-threads %d, must be %d", cc->nr_threads,
4057                    smp_threads);
4058         return;
4059     }
4060 
4061     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
4062     if (!core_slot) {
4063         error_setg(errp, "core id %d out of range", cc->core_id);
4064         return;
4065     }
4066 
4067     if (core_slot->cpu) {
4068         error_setg(errp, "core %d already populated", cc->core_id);
4069         return;
4070     }
4071 
4072     numa_cpu_pre_plug(core_slot, dev, errp);
4073 }
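
/*
 * Worked example for the checks above: with -smp 8,cores=2,threads=4,
 * maxcpus=16, the valid core-id values are the multiples of 4 (0, 4, 8
 * and 12); core-id 6 fails the "core_id % smp_threads" test, and
 * core-id 16 has no possible_cpus slot.
 */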
4074 
4075 int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
4076                           void *fdt, int *fdt_start_offset, Error **errp)
4077 {
4078     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
4079     int intc_phandle;
4080 
4081     intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
4082     if (intc_phandle <= 0) {
4083         return -1;
4084     }
4085 
4086     if (spapr_dt_phb(spapr, sphb, intc_phandle, fdt, fdt_start_offset)) {
4087         error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
4088         return -1;
4089     }
4090 
4091     /* Generally SLOF creates these; for hotplug it's up to QEMU */
4092     _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
4093 
4094     return 0;
4095 }
4096 
4097 static bool spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4098                                Error **errp)
4099 {
4100     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4101     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4102     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4103     const unsigned windows_supported = spapr_phb_windows_supported(sphb);
4104     SpaprDrc *drc;
4105 
4106     if (dev->hotplugged && !smc->dr_phb_enabled) {
4107         error_setg(errp, "PHB hotplug not supported for this machine");
4108         return false;
4109     }
4110 
4111     if (sphb->index == (uint32_t)-1) {
4112         error_setg(errp, "\"index\" for PAPR PHB is mandatory");
4113         return false;
4114     }
4115 
4116     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4117     if (drc && drc->dev) {
4118         error_setg(errp, "PHB %d already attached", sphb->index);
4119         return false;
4120     }
4121 
4122     /*
4123      * This will check that sphb->index doesn't exceed the maximum number of
4124      * PHBs for the current machine type.
4125      */
4126     return
4127         smc->phb_placement(spapr, sphb->index,
4128                            &sphb->buid, &sphb->io_win_addr,
4129                            &sphb->mem_win_addr, &sphb->mem64_win_addr,
4130                            windows_supported, sphb->dma_liobn,
4131                            errp);
4132 }
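
/*
 * Illustrative PHB hotplug exercising the pre-plug checks above ("index"
 * is the mandatory property; the id is made up):
 *
 *     (qemu) device_add spapr-pci-host-bridge,index=1,id=phb1
 */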
4133 
4134 static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4135 {
4136     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4137     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4138     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4139     SpaprDrc *drc;
4140     bool hotplugged = spapr_drc_hotplugged(dev);
4141 
4142     if (!smc->dr_phb_enabled) {
4143         return;
4144     }
4145 
4146     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4147     /* hotplug hooks should check it's enabled before getting this far */
4148     assert(drc);
4149 
4150     /* spapr_phb_pre_plug() already checked the DRC is attachable */
4151     spapr_drc_attach(drc, dev);
4152 
4153     if (hotplugged) {
4154         spapr_hotplug_req_add_by_index(drc);
4155     } else {
4156         spapr_drc_reset(drc);
4157     }
4158 }
4159 
4160 void spapr_phb_release(DeviceState *dev)
4161 {
4162     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
4163 
4164     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
4165     object_unparent(OBJECT(dev));
4166 }
4167 
4168 static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4169 {
4170     qdev_unrealize(dev);
4171 }
4172 
4173 static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
4174                                      DeviceState *dev, Error **errp)
4175 {
4176     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4177     SpaprDrc *drc;
4178 
4179     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4180     assert(drc);
4181 
4182     if (!spapr_drc_unplug_requested(drc)) {
4183         spapr_drc_unplug_request(drc);
4184         spapr_hotplug_req_remove_by_index(drc);
4185     } else {
4186         error_setg(errp,
4187                    "PCI Host Bridge unplug already in progress for device %s",
4188                    dev->id);
4189     }
4190 }
4191 
4192 static
4193 bool spapr_tpm_proxy_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4194                               Error **errp)
4195 {
4196     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4197 
4198     if (spapr->tpm_proxy != NULL) {
4199         error_setg(errp, "Only one TPM proxy can be specified for this machine");
4200         return false;
4201     }
4202 
4203     return true;
4204 }
4205 
4206 static void spapr_tpm_proxy_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4207 {
4208     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4209     SpaprTpmProxy *tpm_proxy = SPAPR_TPM_PROXY(dev);
4210 
4211     /* Already checked in spapr_tpm_proxy_pre_plug() */
4212     g_assert(spapr->tpm_proxy == NULL);
4213 
4214     spapr->tpm_proxy = tpm_proxy;
4215 }
4216 
4217 static void spapr_tpm_proxy_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4218 {
4219     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4220 
4221     qdev_unrealize(dev);
4222     object_unparent(OBJECT(dev));
4223     spapr->tpm_proxy = NULL;
4224 }
4225 
4226 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
4227                                       DeviceState *dev, Error **errp)
4228 {
4229     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4230         spapr_memory_plug(hotplug_dev, dev);
4231     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4232         spapr_core_plug(hotplug_dev, dev);
4233     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4234         spapr_phb_plug(hotplug_dev, dev);
4235     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4236         spapr_tpm_proxy_plug(hotplug_dev, dev);
4237     }
4238 }
4239 
4240 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
4241                                         DeviceState *dev, Error **errp)
4242 {
4243     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4244         spapr_memory_unplug(hotplug_dev, dev);
4245     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4246         spapr_core_unplug(hotplug_dev, dev);
4247     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4248         spapr_phb_unplug(hotplug_dev, dev);
4249     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4250         spapr_tpm_proxy_unplug(hotplug_dev, dev);
4251     }
4252 }
4253 
4254 bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr)
4255 {
4256     return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) ||
4257         /*
4258          * CAS will process all pending unplug requests.
4259          *
4260          * HACK: a guest could theoretically have cleared all bits in OV5,
4261          * but none of the guests we care for do.
4262          */
4263         spapr_ovec_empty(spapr->ov5_cas);
4264 }
4265 
4266 static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
4267                                                 DeviceState *dev, Error **errp)
4268 {
4269     SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
4270     MachineClass *mc = MACHINE_GET_CLASS(sms);
4271     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4272 
4273     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4274         if (spapr_memory_hot_unplug_supported(sms)) {
4275             spapr_memory_unplug_request(hotplug_dev, dev, errp);
4276         } else {
4277             error_setg(errp, "Memory hot unplug not supported for this guest");
4278         }
4279     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4280         if (!mc->has_hotpluggable_cpus) {
4281             error_setg(errp, "CPU hot unplug not supported on this machine");
4282             return;
4283         }
4284         spapr_core_unplug_request(hotplug_dev, dev, errp);
4285     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4286         if (!smc->dr_phb_enabled) {
4287             error_setg(errp, "PHB hot unplug not supported on this machine");
4288             return;
4289         }
4290         spapr_phb_unplug_request(hotplug_dev, dev, errp);
4291     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4292         spapr_tpm_proxy_unplug(hotplug_dev, dev);
4293     }
4294 }
4295 
4296 static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
4297                                           DeviceState *dev, Error **errp)
4298 {
4299     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4300         spapr_memory_pre_plug(hotplug_dev, dev, errp);
4301     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4302         spapr_core_pre_plug(hotplug_dev, dev, errp);
4303     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4304         spapr_phb_pre_plug(hotplug_dev, dev, errp);
4305     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4306         spapr_tpm_proxy_pre_plug(hotplug_dev, dev, errp);
4307     }
4308 }
4309 
spapr_get_hotplug_handler(MachineState * machine,DeviceState * dev)4310 static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
4311                                                  DeviceState *dev)
4312 {
4313     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
4314         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
4315         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE) ||
4316         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4317         return HOTPLUG_HANDLER(machine);
4318     }
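    /*
     * PCI devices are not handled by the machine itself: they plug
     * into their root bus's PHB, which acts as their hotplug handler.
     */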
    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        PCIDevice *pcidev = PCI_DEVICE(dev);
        PCIBus *root = pci_device_root_bus(pcidev);
        SpaprPhbState *phb =
            (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent),
                                                 TYPE_SPAPR_PCI_HOST_BRIDGE);

        if (phb) {
            return HOTPLUG_HANDLER(phb);
        }
    }
    return NULL;
}

static CpuInstanceProperties
spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
{
    CPUArchId *core_slot;
    MachineClass *mc = MACHINE_GET_CLASS(machine);

    /* make sure possible_cpus is initialized */
    mc->possible_cpu_arch_ids(machine);
    /* get the CPU core slot containing the thread that matches cpu_index */
    core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
    assert(core_slot);
    return core_slot->props;
}

static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
{
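    /*
     * Assign whole virtual sockets (ms->smp.cores cores each) to NUMA
     * nodes round-robin: core slot idx belongs to socket
     * idx / smp.cores, and sockets wrap around the available nodes.
     */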
    return idx / ms->smp.cores % ms->numa_state->num_nodes;
}

static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
{
    int i;
    unsigned int smp_threads = machine->smp.threads;
    unsigned int smp_cpus = machine->smp.cpus;
    const char *core_type;
    int spapr_max_cores = machine->smp.max_cpus / smp_threads;
    MachineClass *mc = MACHINE_GET_CLASS(machine);

    if (!mc->has_hotpluggable_cpus) {
        spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
    }
    if (machine->possible_cpus) {
        assert(machine->possible_cpus->len == spapr_max_cores);
        return machine->possible_cpus;
    }

    core_type = spapr_get_cpu_core_type(machine->cpu_type);
    if (!core_type) {
        error_report("Unable to find sPAPR CPU Core definition");
        exit(1);
    }

    machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
                             sizeof(CPUArchId) * spapr_max_cores);
    machine->possible_cpus->len = spapr_max_cores;
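    /* A core's arch_id is the id of its first thread (i * smp_threads) */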
    for (i = 0; i < machine->possible_cpus->len; i++) {
        int core_id = i * smp_threads;

        machine->possible_cpus->cpus[i].type = core_type;
        machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
        machine->possible_cpus->cpus[i].arch_id = core_id;
        machine->possible_cpus->cpus[i].props.has_core_id = true;
        machine->possible_cpus->cpus[i].props.core_id = core_id;
    }
    return machine->possible_cpus;
}

static bool spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
                                uint64_t *buid, hwaddr *pio,
                                hwaddr *mmio32, hwaddr *mmio64,
                                unsigned n_dma, uint32_t *liobns, Error **errp)
{
    /*
     * New-style PHB window placement.
     *
     * Goal: give each PHB a large (1TiB), naturally aligned 64-bit
     * MMIO window, in addition to a 2GiB 32-bit MMIO window and a
     * 64kiB PIO window.
     *
     * Some guest kernels can't work with MMIO windows above 1<<46
     * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB.
     *
     * 32TiB..(32TiB+1984kiB) contains the 64kiB PIO windows for each
     * PHB stacked together.  (32TiB+2GiB)..(32TiB+64GiB) contains the
     * 2GiB 32-bit MMIO windows for each PHB.  Then 33TiB..64TiB has
     * the 1TiB 64-bit MMIO windows for each PHB.
     */
    const uint64_t base_buid = 0x800000020000000ULL;
    int i;

    /* Sanity check natural alignments */
    QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
    QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
    QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
    QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
    /* Sanity check bounds */
    QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
                      SPAPR_PCI_MEM32_WIN_SIZE);
    QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
                      SPAPR_PCI_MEM64_WIN_SIZE);

    if (index >= SPAPR_MAX_PHBS) {
        error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
                   SPAPR_MAX_PHBS - 1);
        return false;
    }

    *buid = base_buid + index;
    for (i = 0; i < n_dma; ++i) {
        liobns[i] = SPAPR_PCI_LIOBN(index, i);
    }

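    /*
     * Slot 0 of the 32-bit MMIO area holds the stacked PIO windows,
     * and slot 0 of the 64-bit area holds all the 32-bit windows,
     * hence the (index + 1) offsets below.
     */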
    *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
    *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
    *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
    return true;
}

static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(dev);

    return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
}

static void spapr_ics_resend(XICSFabric *dev)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(dev);

    ics_resend(spapr->ics);
}

static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
{
    PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);

    return cpu ? spapr_cpu_state(cpu)->icp : NULL;
}

static void spapr_pic_print_info(InterruptStatsProvider *obj, GString *buf)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(obj);

    spapr_irq_print_info(spapr, buf);
    g_string_append_printf(buf, "irqchip: %s\n",
                           kvm_irqchip_in_kernel() ? "in-kernel" : "emulated");
}

/*
 * This is a XIVE-only operation
 */
static bool spapr_match_nvt(XiveFabric *xfb, uint8_t format,
                            uint8_t nvt_blk, uint32_t nvt_idx,
                            bool crowd, bool cam_ignore, uint8_t priority,
                            uint32_t logic_serv, XiveTCTXMatch *match)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(xfb);
    XivePresenter *xptr = XIVE_PRESENTER(spapr->active_intc);
    XivePresenterClass *xpc = XIVE_PRESENTER_GET_CLASS(xptr);

    /*
     * When we implement the save and restore of the thread interrupt
     * contexts in the enter/exit CPU handlers of the machine and the
     * escalations in QEMU, we should be able to handle non-dispatched
     * vCPUs.
     *
     * Until this is done, the sPAPR machine should always find at
     * least one matching context.
     */
    if (!xpc->match_nvt(xptr, format, nvt_blk, nvt_idx, crowd, cam_ignore,
                           priority, logic_serv, match)) {
        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: NVT %x/%x is not dispatched\n",
                      nvt_blk, nvt_idx);
        return false;
    }

    return true;
}

int spapr_get_vcpu_id(PowerPCCPU *cpu)
{
    return cpu->vcpu_id;
}

bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
    MachineState *ms = MACHINE(spapr);
    int vcpu_id;

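    /*
     * vcpu ids are spaced spapr->vsmt threads apart, so a cpu_index
     * well below max_cpus can still map to an id KVM cannot handle.
     */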
    vcpu_id = spapr_vcpu_id(spapr, cpu_index);

    if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
        error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
        error_append_hint(errp, "Adjust the number of cpus to %d "
                          "or try to raise the number of threads per core\n",
                          vcpu_id * ms->smp.threads / spapr->vsmt);
        return false;
    }

    cpu->vcpu_id = vcpu_id;
    return true;
}

PowerPCCPU *spapr_find_cpu(int vcpu_id)
{
    CPUState *cs;

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);

        if (spapr_get_vcpu_id(cpu) == vcpu_id) {
            return cpu;
        }
    }

    return NULL;
}

static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
{
    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);

    return spapr_cpu->in_nested;
}

static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
{
    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);

    /* These are only called by TCG; KVM maintains dispatch state itself */

    spapr_cpu->prod = false;
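    /*
     * Per PAPR, the VPA dispatch counter is even while the vCPU is
     * dispatched and odd while it is preempted; being dispatched here
     * should bring it to an even value, so repair the parity if the
     * stored value disagrees.
     */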
    if (spapr_cpu->vpa_addr) {
        CPUState *cs = CPU(cpu);
        uint32_t dispatch;

        dispatch = ldl_be_phys(cs->as,
                               spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
        dispatch++;
        if ((dispatch & 1) != 0) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "VPA: incorrect dispatch counter value for "
                          "dispatched partition %u, correcting.\n", dispatch);
            dispatch++;
        }
        stl_be_phys(cs->as,
                    spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
    }
}

static void spapr_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
{
    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);

    if (spapr_cpu->vpa_addr) {
        CPUState *cs = CPU(cpu);
        uint32_t dispatch;

        dispatch = ldl_be_phys(cs->as,
                               spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
        dispatch++;
        if ((dispatch & 1) != 1) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "VPA: incorrect dispatch counter value for "
                          "preempted partition %u, correcting.\n", dispatch);
            dispatch++;
        }
        stl_be_phys(cs->as,
                    spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
    }
}

static void spapr_machine_class_init(ObjectClass *oc, const void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
    FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
    NMIClass *nc = NMI_CLASS(oc);
    HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
    PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
    XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
    InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
    XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
    VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);

    mc->desc = "pSeries Logical Partition (PAPR compliant)";
    mc->ignore_boot_device_suffixes = true;

    /*
     * We set up the default / latest behaviour here.  The class_init
     * functions for the specific versioned machine types can override
     * these details for backwards compatibility.
     */
    mc->init = spapr_machine_init;
    mc->reset = spapr_machine_reset;
    mc->block_default_type = IF_SCSI;

    /*
     * While KVM determines the maximum number of CPUs in kvm_init()
     * using kvm_max_vcpus(), in TCG the limit is restricted by the
     * range of CPU IPIs available.
     */
    mc->max_cpus = SPAPR_IRQ_NR_IPIS;

    mc->no_parallel = 1;
    mc->default_boot_order = "";
    mc->default_ram_size = 512 * MiB;
    mc->default_ram_id = "ppc_spapr.ram";
    mc->default_display = "std";
    mc->kvm_type = spapr_kvm_type;
    machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
    mc->pci_allow_0_address = true;
    assert(!mc->get_hotplug_handler);
    mc->get_hotplug_handler = spapr_get_hotplug_handler;
    hc->pre_plug = spapr_machine_device_pre_plug;
    hc->plug = spapr_machine_device_plug;
    mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
    mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
    mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
    hc->unplug_request = spapr_machine_device_unplug_request;
    hc->unplug = spapr_machine_device_unplug;

    smc->update_dt_enabled = true;
    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power10_v2.0");
    mc->has_hotpluggable_cpus = true;
    mc->nvdimm_supported = true;
    smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
    fwc->get_dev_path = spapr_get_fw_dev_path;
    nc->nmi_monitor_handler = spapr_nmi;
    smc->phb_placement = spapr_phb_placement;
    vhc->cpu_in_nested = spapr_cpu_in_nested;
    vhc->deliver_hv_excp = spapr_exit_nested;
    vhc->hypercall = emulate_spapr_hypercall;
    vhc->hpt_mask = spapr_hpt_mask;
    vhc->map_hptes = spapr_map_hptes;
    vhc->unmap_hptes = spapr_unmap_hptes;
    vhc->hpte_set_c = spapr_hpte_set_c;
    vhc->hpte_set_r = spapr_hpte_set_r;
    vhc->get_pate = spapr_get_pate;
    vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
    vhc->cpu_exec_enter = spapr_cpu_exec_enter;
    vhc->cpu_exec_exit = spapr_cpu_exec_exit;
    xic->ics_get = spapr_ics_get;
    xic->ics_resend = spapr_ics_resend;
    xic->icp_get = spapr_icp_get;
    ispc->print_info = spapr_pic_print_info;
    /*
     * Force NUMA node memory size to be a multiple of
     * SPAPR_MEMORY_BLOCK_SIZE (256MiB), since that's the granularity
     * in which LMBs are represented and hot-added.
     */
    mc->numa_mem_align_shift = 28;
    mc->auto_enable_numa = true;

    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
    smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_NESTED_PAPR] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_DAWR1] = SPAPR_CAP_ON;

    /*
     * This cap specifies whether AIL mode 3 is supported for the
     * H_SET_MODE hypercall (address translation mode resource). The
     * default is modified by default_caps_with_cpu().
     */
    smc->default_caps.caps[SPAPR_CAP_AIL_MODE_3] = SPAPR_CAP_ON;
    spapr_caps_add_properties(smc);
    smc->irq = &spapr_irq_dual;
    smc->dr_phb_enabled = true;
    smc->linux_pci_probe = true;
    smc->smp_threads_vsmt = true;
    smc->nr_xirqs = SPAPR_NR_XIRQS;
    xfc->match_nvt = spapr_match_nvt;
    vmc->client_architecture_support = spapr_vof_client_architecture_support;
    vmc->quiesce = spapr_vof_quiesce;
    vmc->setprop = spapr_vof_setprop;
}

static const TypeInfo spapr_machine_info = {
    .name          = TYPE_SPAPR_MACHINE,
    .parent        = TYPE_MACHINE,
    .abstract      = true,
    .instance_size = sizeof(SpaprMachineState),
    .instance_init = spapr_instance_init,
    .instance_finalize = spapr_machine_finalizefn,
    .class_size    = sizeof(SpaprMachineClass),
    .class_init    = spapr_machine_class_init,
    .interfaces = (const InterfaceInfo[]) {
        { TYPE_FW_PATH_PROVIDER },
        { TYPE_NMI },
        { TYPE_HOTPLUG_HANDLER },
        { TYPE_PPC_VIRTUAL_HYPERVISOR },
        { TYPE_XICS_FABRIC },
        { TYPE_INTERRUPT_STATS_PROVIDER },
        { TYPE_XIVE_FABRIC },
        { TYPE_VOF_MACHINE_IF },
        { }
    },
};

static void spapr_machine_latest_class_options(MachineClass *mc)
{
    mc->alias = "pseries";
    mc->is_default = true;
}

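/*
 * Generates the class_init function, TypeInfo and type registration
 * boilerplate for a versioned pseries-X.Y machine type; the latest
 * version additionally gets the "pseries" alias and becomes the
 * default machine.
 */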
#define DEFINE_SPAPR_MACHINE_IMPL(latest, ...)                       \
    static void MACHINE_VER_SYM(class_init, spapr, __VA_ARGS__)(     \
        ObjectClass *oc,                                             \
        const void *data)                                            \
    {                                                                \
        MachineClass *mc = MACHINE_CLASS(oc);                        \
        MACHINE_VER_SYM(class_options, spapr, __VA_ARGS__)(mc);      \
        MACHINE_VER_DEPRECATION(__VA_ARGS__);                        \
        if (latest) {                                                \
            spapr_machine_latest_class_options(mc);                  \
        }                                                            \
    }                                                                \
    static const TypeInfo MACHINE_VER_SYM(info, spapr, __VA_ARGS__) = \
    {                                                                \
        .name = MACHINE_VER_TYPE_NAME("pseries", __VA_ARGS__),       \
        .parent = TYPE_SPAPR_MACHINE,                                \
        .class_init = MACHINE_VER_SYM(class_init, spapr, __VA_ARGS__), \
    };                                                               \
    static void MACHINE_VER_SYM(register, spapr, __VA_ARGS__)(void)  \
    {                                                                \
        MACHINE_VER_DELETION(__VA_ARGS__);                           \
        type_register_static(&MACHINE_VER_SYM(info, spapr, __VA_ARGS__));   \
    }                                                                \
    type_init(MACHINE_VER_SYM(register, spapr, __VA_ARGS__))

#define DEFINE_SPAPR_MACHINE_AS_LATEST(major, minor) \
    DEFINE_SPAPR_MACHINE_IMPL(true, major, minor)
#define DEFINE_SPAPR_MACHINE(major, minor) \
    DEFINE_SPAPR_MACHINE_IMPL(false, major, minor)

/*
 * pseries-10.1
 */
static void spapr_machine_10_1_class_options(MachineClass *mc)
{
    /* Defaults for the latest behaviour inherited from the base class */
}

DEFINE_SPAPR_MACHINE_AS_LATEST(10, 1);

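/*
 * Each older versioned machine below first applies the class options
 * of the next newer version and then layers its own compat properties
 * on top, so a behavioural difference only needs to be recorded once.
 */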
/*
 * pseries-10.0
 */
static void spapr_machine_10_0_class_options(MachineClass *mc)
{
    spapr_machine_10_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_10_0, hw_compat_10_0_len);
}

DEFINE_SPAPR_MACHINE(10, 0);

/*
 * pseries-9.2
 */
static void spapr_machine_9_2_class_options(MachineClass *mc)
{
    spapr_machine_10_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_9_2, hw_compat_9_2_len);
}

DEFINE_SPAPR_MACHINE(9, 2);

/*
 * pseries-9.1
 */
static void spapr_machine_9_1_class_options(MachineClass *mc)
{
    spapr_machine_9_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_9_1, hw_compat_9_1_len);
}

DEFINE_SPAPR_MACHINE(9, 1);

/*
 * pseries-9.0
 */
static void spapr_machine_9_0_class_options(MachineClass *mc)
{
    spapr_machine_9_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_9_0, hw_compat_9_0_len);
}

DEFINE_SPAPR_MACHINE(9, 0);

/*
 * pseries-8.2
 */
static void spapr_machine_8_2_class_options(MachineClass *mc)
{
    spapr_machine_9_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_8_2, hw_compat_8_2_len);
    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.2");
}

DEFINE_SPAPR_MACHINE(8, 2);

/*
 * pseries-8.1
 */
static void spapr_machine_8_1_class_options(MachineClass *mc)
{
    spapr_machine_8_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_8_1, hw_compat_8_1_len);
}

DEFINE_SPAPR_MACHINE(8, 1);

/*
 * pseries-8.0
 */
static void spapr_machine_8_0_class_options(MachineClass *mc)
{
    spapr_machine_8_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_8_0, hw_compat_8_0_len);
}

DEFINE_SPAPR_MACHINE(8, 0);

/*
 * pseries-7.2
 */
static void spapr_machine_7_2_class_options(MachineClass *mc)
{
    spapr_machine_8_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_2, hw_compat_7_2_len);
}

DEFINE_SPAPR_MACHINE(7, 2);

/*
 * pseries-7.1
 */
static void spapr_machine_7_1_class_options(MachineClass *mc)
{
    spapr_machine_7_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_1, hw_compat_7_1_len);
}

DEFINE_SPAPR_MACHINE(7, 1);

/*
 * pseries-7.0
 */
static void spapr_machine_7_0_class_options(MachineClass *mc)
{
    spapr_machine_7_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_7_0, hw_compat_7_0_len);
}

DEFINE_SPAPR_MACHINE(7, 0);

/*
 * pseries-6.2
 */
static void spapr_machine_6_2_class_options(MachineClass *mc)
{
    spapr_machine_7_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_2, hw_compat_6_2_len);
}

DEFINE_SPAPR_MACHINE(6, 2);

/*
 * pseries-6.1
 */
static void spapr_machine_6_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_6_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
    smc->pre_6_2_numa_affinity = true;
    mc->smp_props.prefer_sockets = true;
}

DEFINE_SPAPR_MACHINE(6, 1);

/*
 * pseries-6.0
 */
static void spapr_machine_6_0_class_options(MachineClass *mc)
{
    spapr_machine_6_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
}

DEFINE_SPAPR_MACHINE(6, 0);

/*
 * pseries-5.2
 */
static void spapr_machine_5_2_class_options(MachineClass *mc)
{
    spapr_machine_6_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
}

DEFINE_SPAPR_MACHINE(5, 2);

/*
 * pseries-5.1
 */
static void spapr_machine_5_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_5_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len);
    smc->pre_5_2_numa_associativity = true;
}

DEFINE_SPAPR_MACHINE(5, 1);

/*
 * pseries-5.0
 */
static void spapr_machine_5_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
    };

    spapr_machine_5_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->numa_mem_supported = true;
    smc->pre_5_1_assoc_refpoints = true;
}

DEFINE_SPAPR_MACHINE(5, 0);

/*
 * pseries-4.2
 */
static void spapr_machine_4_2_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_5_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len);
    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_OFF;
    smc->rma_limit = 16 * GiB;
    mc->nvdimm_supported = false;
}

DEFINE_SPAPR_MACHINE(4, 2);

/*
 * pseries-4.1
 */
static void spapr_machine_4_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        /* Only allow 4kiB and 64kiB IOMMU pagesizes */
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" },
    };

    spapr_machine_4_2_class_options(mc);
    smc->linux_pci_probe = false;
    smc->smp_threads_vsmt = false;
    compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(4, 1);

/*
 * pseries-4.0
 */
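/*
 * Now a plain pass-through to the default placement; kept as a
 * separate hook for the pre-4.1 machines (it formerly also set up the
 * NVLink2 windows, which have since been removed from QEMU).
 */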
static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
                              uint64_t *buid, hwaddr *pio,
                              hwaddr *mmio32, hwaddr *mmio64,
                              unsigned n_dma, uint32_t *liobns, Error **errp)
{
    if (!spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma,
                             liobns, errp)) {
        return false;
    }
    return true;
}

static void spapr_machine_4_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_4_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
    smc->phb_placement = phb_placement_4_0;
    smc->irq = &spapr_irq_xics;
    smc->pre_4_1_migration = true;
}

DEFINE_SPAPR_MACHINE(4, 0);

/*
 * pseries-3.1
 */
static void spapr_machine_3_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_4_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);

    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
    smc->update_dt_enabled = false;
    smc->dr_phb_enabled = false;
    smc->broken_host_serial_model = true;
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
}

DEFINE_SPAPR_MACHINE(3, 1);

/*
 * pseries-3.0
 */

static void spapr_machine_3_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_3_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);

    smc->legacy_irq_allocation = true;
    smc->nr_xirqs = 0x400;
    smc->irq = &spapr_irq_xics_legacy;
}

DEFINE_SPAPR_MACHINE(3, 0);

static void spapr_machine_register_types(void)
{
    type_register_static(&spapr_machine_info);
}

type_init(spapr_machine_register_types)