xref: /openbmc/qemu/hw/ppc/spapr.c (revision d5938f29fea29581725426f203a74da746ca03e7)
1 /*
2  * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
3  *
4  * Copyright (c) 2004-2007 Fabrice Bellard
5  * Copyright (c) 2007 Jocelyn Mayer
6  * Copyright (c) 2010 David Gibson, IBM Corporation.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 
27 #include "qemu/osdep.h"
28 #include "qemu-common.h"
29 #include "qapi/error.h"
30 #include "qapi/visitor.h"
31 #include "sysemu/sysemu.h"
32 #include "sysemu/hostmem.h"
33 #include "sysemu/numa.h"
34 #include "sysemu/qtest.h"
35 #include "sysemu/reset.h"
36 #include "qemu/log.h"
37 #include "hw/fw-path-provider.h"
38 #include "elf.h"
39 #include "net/net.h"
40 #include "sysemu/device_tree.h"
41 #include "sysemu/cpus.h"
42 #include "sysemu/hw_accel.h"
43 #include "kvm_ppc.h"
44 #include "migration/misc.h"
45 #include "migration/qemu-file-types.h"
46 #include "migration/global_state.h"
47 #include "migration/register.h"
48 #include "mmu-hash64.h"
49 #include "mmu-book3s-v3.h"
50 #include "cpu-models.h"
51 #include "qom/cpu.h"
52 
53 #include "hw/boards.h"
54 #include "hw/ppc/ppc.h"
55 #include "hw/loader.h"
56 
57 #include "hw/ppc/fdt.h"
58 #include "hw/ppc/spapr.h"
59 #include "hw/ppc/spapr_vio.h"
60 #include "hw/qdev-properties.h"
61 #include "hw/pci-host/spapr.h"
62 #include "hw/pci/msi.h"
63 
64 #include "hw/pci/pci.h"
65 #include "hw/scsi/scsi.h"
66 #include "hw/virtio/virtio-scsi.h"
67 #include "hw/virtio/vhost-scsi-common.h"
68 
69 #include "exec/address-spaces.h"
70 #include "exec/ram_addr.h"
71 #include "hw/usb.h"
72 #include "qemu/config-file.h"
73 #include "qemu/error-report.h"
74 #include "trace.h"
75 #include "hw/nmi.h"
76 #include "hw/intc/intc.h"
77 
78 #include "qemu/cutils.h"
79 #include "hw/ppc/spapr_cpu_core.h"
80 #include "hw/mem/memory-device.h"
81 
82 #include <libfdt.h>
83 
/* SLOF memory layout:
 *
 * The SLOF raw image is loaded at 0, copies its romfs right below the flat
 * device-tree, then positions SLOF itself 31M below that.
 *
 * So we set FW_OVERHEAD to 40MB, which should account for all of that
 * and more.
 *
 * We load our kernel at 4M, leaving space for the SLOF initial image.
 */
#define FDT_MAX_SIZE            0x100000   /* 1M */
#define RTAS_MAX_SIZE           0x10000    /* 64K */
#define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
#define FW_MAX_SIZE             0x400000   /* 4M */
#define FW_FILE_NAME            "slof.bin"
#define FW_OVERHEAD             0x2800000  /* 40M, see layout note above */
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE /* kernel goes right after SLOF */

/* NOTE(review): unit not visible in this part of the file - presumably MiB */
#define MIN_RMA_SLOF            128UL

/* NOTE(review): presumably the interrupt controller's phandle - confirm */
#define PHANDLE_INTC            0x00001111
105 
106 /* These two functions implement the VCPU id numbering: one to compute them
107  * all and one to identify thread 0 of a VCORE. Any change to the first one
108  * is likely to have an impact on the second one, so let's keep them close.
109  */
110 static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
111 {
112     MachineState *ms = MACHINE(spapr);
113     unsigned int smp_threads = ms->smp.threads;
114 
115     assert(spapr->vsmt);
116     return
117         (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
118 }
119 static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
120                                       PowerPCCPU *cpu)
121 {
122     assert(spapr->vsmt);
123     return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
124 }
125 
/*
 * .needed callback for the dummy "icp/server" vmstate entries.
 *
 * These entries stand in for ICPState objects that older QEMUs left
 * unused and that newer QEMUs do not create at all. Either way nothing
 * must be put on the wire for them, so the answer is always "no".
 * @opaque is ignored.
 */
static bool pre_2_10_vmstate_dummy_icp_needed(void *opaque)
{
    return false;
}
134 
/*
 * Placeholder vmstate description registered under the "icp/server" name.
 * Its .needed callback always returns false, and its fields only skip
 * 4 + 1 + 1 bytes without storing them anywhere.
 */
static const VMStateDescription pre_2_10_vmstate_dummy_icp = {
    .name = "icp/server",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = pre_2_10_vmstate_dummy_icp_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UNUSED(4), /* uint32_t xirr */
        VMSTATE_UNUSED(1), /* uint8_t pending_priority */
        VMSTATE_UNUSED(1), /* uint8_t mfrr */
        VMSTATE_END_OF_LIST()
    },
};
147 
148 static void pre_2_10_vmstate_register_dummy_icp(int i)
149 {
150     vmstate_register(NULL, i, &pre_2_10_vmstate_dummy_icp,
151                      (void *)(uintptr_t) i);
152 }
153 
154 static void pre_2_10_vmstate_unregister_dummy_icp(int i)
155 {
156     vmstate_unregister(NULL, &pre_2_10_vmstate_dummy_icp,
157                        (void *)(uintptr_t) i);
158 }
159 
160 int spapr_max_server_number(SpaprMachineState *spapr)
161 {
162     MachineState *ms = MACHINE(spapr);
163 
164     assert(spapr->vsmt);
165     return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads);
166 }
167 
168 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
169                                   int smt_threads)
170 {
171     int i, ret = 0;
172     uint32_t servers_prop[smt_threads];
173     uint32_t gservers_prop[smt_threads * 2];
174     int index = spapr_get_vcpu_id(cpu);
175 
176     if (cpu->compat_pvr) {
177         ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
178         if (ret < 0) {
179             return ret;
180         }
181     }
182 
183     /* Build interrupt servers and gservers properties */
184     for (i = 0; i < smt_threads; i++) {
185         servers_prop[i] = cpu_to_be32(index + i);
186         /* Hack, direct the group queues back to cpu 0 */
187         gservers_prop[i*2] = cpu_to_be32(index + i);
188         gservers_prop[i*2 + 1] = 0;
189     }
190     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
191                       servers_prop, sizeof(servers_prop));
192     if (ret < 0) {
193         return ret;
194     }
195     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
196                       gservers_prop, sizeof(gservers_prop));
197 
198     return ret;
199 }
200 
201 static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu)
202 {
203     int index = spapr_get_vcpu_id(cpu);
204     uint32_t associativity[] = {cpu_to_be32(0x5),
205                                 cpu_to_be32(0x0),
206                                 cpu_to_be32(0x0),
207                                 cpu_to_be32(0x0),
208                                 cpu_to_be32(cpu->node_id),
209                                 cpu_to_be32(index)};
210 
211     /* Advertise NUMA via ibm,associativity */
212     return fdt_setprop(fdt, offset, "ibm,associativity", associativity,
213                           sizeof(associativity));
214 }
215 
216 /* Populate the "ibm,pa-features" property */
217 static void spapr_populate_pa_features(SpaprMachineState *spapr,
218                                        PowerPCCPU *cpu,
219                                        void *fdt, int offset,
220                                        bool legacy_guest)
221 {
222     uint8_t pa_features_206[] = { 6, 0,
223         0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
224     uint8_t pa_features_207[] = { 24, 0,
225         0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
226         0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
227         0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
228         0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
229     uint8_t pa_features_300[] = { 66, 0,
230         /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
231         /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, SSO, 5: LE|CFAR|EB|LSQ */
232         0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /* 0 - 5 */
233         /* 6: DS207 */
234         0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
235         /* 16: Vector */
236         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
237         /* 18: Vec. Scalar, 20: Vec. XOR, 22: HTM */
238         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
239         /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
240         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
241         /* 30: MMR, 32: LE atomic, 34: EBB + ext EBB */
242         0x80, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
243         /* 36: SPR SO, 38: Copy/Paste, 40: Radix MMU */
244         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 36 - 41 */
245         /* 42: PM, 44: PC RA, 46: SC vec'd */
246         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
247         /* 48: SIMD, 50: QP BFP, 52: String */
248         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
249         /* 54: DecFP, 56: DecI, 58: SHA */
250         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
251         /* 60: NM atomic, 62: RNG */
252         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
253     };
254     uint8_t *pa_features = NULL;
255     size_t pa_size;
256 
257     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
258         pa_features = pa_features_206;
259         pa_size = sizeof(pa_features_206);
260     }
261     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
262         pa_features = pa_features_207;
263         pa_size = sizeof(pa_features_207);
264     }
265     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
266         pa_features = pa_features_300;
267         pa_size = sizeof(pa_features_300);
268     }
269     if (!pa_features) {
270         return;
271     }
272 
273     if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
274         /*
275          * Note: we keep CI large pages off by default because a 64K capable
276          * guest provisioned with large pages might otherwise try to map a qemu
277          * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
278          * even if that qemu runs on a 4k host.
279          * We dd this bit back here if we are confident this is not an issue
280          */
281         pa_features[3] |= 0x20;
282     }
283     if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
284         pa_features[24] |= 0x80;    /* Transactional memory support */
285     }
286     if (legacy_guest && pa_size > 40) {
287         /* Workaround for broken kernels that attempt (guest) radix
288          * mode when they can't handle it, if they see the radix bit set
289          * in pa-features. So hide it from them. */
290         pa_features[40 + 2] &= ~0x80; /* Radix MMU */
291     }
292 
293     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
294 }
295 
296 static int spapr_fixup_cpu_dt(void *fdt, SpaprMachineState *spapr)
297 {
298     MachineState *ms = MACHINE(spapr);
299     int ret = 0, offset, cpus_offset;
300     CPUState *cs;
301     char cpu_model[32];
302     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
303 
304     CPU_FOREACH(cs) {
305         PowerPCCPU *cpu = POWERPC_CPU(cs);
306         DeviceClass *dc = DEVICE_GET_CLASS(cs);
307         int index = spapr_get_vcpu_id(cpu);
308         int compat_smt = MIN(ms->smp.threads, ppc_compat_max_vthreads(cpu));
309 
310         if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
311             continue;
312         }
313 
314         snprintf(cpu_model, 32, "%s@%x", dc->fw_name, index);
315 
316         cpus_offset = fdt_path_offset(fdt, "/cpus");
317         if (cpus_offset < 0) {
318             cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
319             if (cpus_offset < 0) {
320                 return cpus_offset;
321             }
322         }
323         offset = fdt_subnode_offset(fdt, cpus_offset, cpu_model);
324         if (offset < 0) {
325             offset = fdt_add_subnode(fdt, cpus_offset, cpu_model);
326             if (offset < 0) {
327                 return offset;
328             }
329         }
330 
331         ret = fdt_setprop(fdt, offset, "ibm,pft-size",
332                           pft_size_prop, sizeof(pft_size_prop));
333         if (ret < 0) {
334             return ret;
335         }
336 
337         if (nb_numa_nodes > 1) {
338             ret = spapr_fixup_cpu_numa_dt(fdt, offset, cpu);
339             if (ret < 0) {
340                 return ret;
341             }
342         }
343 
344         ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt);
345         if (ret < 0) {
346             return ret;
347         }
348 
349         spapr_populate_pa_features(spapr, cpu, fdt, offset,
350                                    spapr->cas_legacy_guest_workaround);
351     }
352     return ret;
353 }
354 
355 static hwaddr spapr_node0_size(MachineState *machine)
356 {
357     if (nb_numa_nodes) {
358         int i;
359         for (i = 0; i < nb_numa_nodes; ++i) {
360             if (numa_info[i].node_mem) {
361                 return MIN(pow2floor(numa_info[i].node_mem),
362                            machine->ram_size);
363             }
364         }
365     }
366     return machine->ram_size;
367 }
368 
369 static void add_str(GString *s, const gchar *s1)
370 {
371     g_string_append_len(s, s1, strlen(s1) + 1);
372 }
373 
374 static int spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
375                                        hwaddr size)
376 {
377     uint32_t associativity[] = {
378         cpu_to_be32(0x4), /* length */
379         cpu_to_be32(0x0), cpu_to_be32(0x0),
380         cpu_to_be32(0x0), cpu_to_be32(nodeid)
381     };
382     char mem_name[32];
383     uint64_t mem_reg_property[2];
384     int off;
385 
386     mem_reg_property[0] = cpu_to_be64(start);
387     mem_reg_property[1] = cpu_to_be64(size);
388 
389     sprintf(mem_name, "memory@" TARGET_FMT_lx, start);
390     off = fdt_add_subnode(fdt, 0, mem_name);
391     _FDT(off);
392     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
393     _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
394                       sizeof(mem_reg_property))));
395     _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
396                       sizeof(associativity))));
397     return off;
398 }
399 
400 static int spapr_populate_memory(SpaprMachineState *spapr, void *fdt)
401 {
402     MachineState *machine = MACHINE(spapr);
403     hwaddr mem_start, node_size;
404     int i, nb_nodes = nb_numa_nodes;
405     NodeInfo *nodes = numa_info;
406     NodeInfo ramnode;
407 
408     /* No NUMA nodes, assume there is just one node with whole RAM */
409     if (!nb_numa_nodes) {
410         nb_nodes = 1;
411         ramnode.node_mem = machine->ram_size;
412         nodes = &ramnode;
413     }
414 
415     for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
416         if (!nodes[i].node_mem) {
417             continue;
418         }
419         if (mem_start >= machine->ram_size) {
420             node_size = 0;
421         } else {
422             node_size = nodes[i].node_mem;
423             if (node_size > machine->ram_size - mem_start) {
424                 node_size = machine->ram_size - mem_start;
425             }
426         }
427         if (!mem_start) {
428             /* spapr_machine_init() checks for rma_size <= node0_size
429              * already */
430             spapr_populate_memory_node(fdt, i, 0, spapr->rma_size);
431             mem_start += spapr->rma_size;
432             node_size -= spapr->rma_size;
433         }
434         for ( ; node_size; ) {
435             hwaddr sizetmp = pow2floor(node_size);
436 
437             /* mem_start != 0 here */
438             if (ctzl(mem_start) < ctzl(sizetmp)) {
439                 sizetmp = 1ULL << ctzl(mem_start);
440             }
441 
442             spapr_populate_memory_node(fdt, i, mem_start, sizetmp);
443             node_size -= sizetmp;
444             mem_start += sizetmp;
445         }
446     }
447 
448     return 0;
449 }
450 
451 static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
452                                   SpaprMachineState *spapr)
453 {
454     MachineState *ms = MACHINE(spapr);
455     PowerPCCPU *cpu = POWERPC_CPU(cs);
456     CPUPPCState *env = &cpu->env;
457     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
458     int index = spapr_get_vcpu_id(cpu);
459     uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
460                        0xffffffff, 0xffffffff};
461     uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
462         : SPAPR_TIMEBASE_FREQ;
463     uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
464     uint32_t page_sizes_prop[64];
465     size_t page_sizes_prop_size;
466     unsigned int smp_threads = ms->smp.threads;
467     uint32_t vcpus_per_socket = smp_threads * ms->smp.cores;
468     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
469     int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
470     SpaprDrc *drc;
471     int drc_index;
472     uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
473     int i;
474 
475     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index);
476     if (drc) {
477         drc_index = spapr_drc_index(drc);
478         _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
479     }
480 
481     _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
482     _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
483 
484     _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
485     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
486                            env->dcache_line_size)));
487     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
488                            env->dcache_line_size)));
489     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
490                            env->icache_line_size)));
491     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
492                            env->icache_line_size)));
493 
494     if (pcc->l1_dcache_size) {
495         _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
496                                pcc->l1_dcache_size)));
497     } else {
498         warn_report("Unknown L1 dcache size for cpu");
499     }
500     if (pcc->l1_icache_size) {
501         _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
502                                pcc->l1_icache_size)));
503     } else {
504         warn_report("Unknown L1 icache size for cpu");
505     }
506 
507     _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
508     _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
509     _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
510     _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
511     _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
512     _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
513 
514     if (env->spr_cb[SPR_PURR].oea_read) {
515         _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
516     }
517     if (env->spr_cb[SPR_SPURR].oea_read) {
518         _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
519     }
520 
521     if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
522         _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
523                           segs, sizeof(segs))));
524     }
525 
526     /* Advertise VSX (vector extensions) if available
527      *   1               == VMX / Altivec available
528      *   2               == VSX available
529      *
530      * Only CPUs for which we create core types in spapr_cpu_core.c
531      * are possible, and all of those have VMX */
532     if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
533         _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
534     } else {
535         _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
536     }
537 
538     /* Advertise DFP (Decimal Floating Point) if available
539      *   0 / no property == no DFP
540      *   1               == DFP available */
541     if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
542         _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
543     }
544 
545     page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
546                                                       sizeof(page_sizes_prop));
547     if (page_sizes_prop_size) {
548         _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
549                           page_sizes_prop, page_sizes_prop_size)));
550     }
551 
552     spapr_populate_pa_features(spapr, cpu, fdt, offset, false);
553 
554     _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
555                            cs->cpu_index / vcpus_per_socket)));
556 
557     _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
558                       pft_size_prop, sizeof(pft_size_prop))));
559 
560     if (nb_numa_nodes > 1) {
561         _FDT(spapr_fixup_cpu_numa_dt(fdt, offset, cpu));
562     }
563 
564     _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));
565 
566     if (pcc->radix_page_info) {
567         for (i = 0; i < pcc->radix_page_info->count; i++) {
568             radix_AP_encodings[i] =
569                 cpu_to_be32(pcc->radix_page_info->entries[i]);
570         }
571         _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
572                           radix_AP_encodings,
573                           pcc->radix_page_info->count *
574                           sizeof(radix_AP_encodings[0]))));
575     }
576 
577     /*
578      * We set this property to let the guest know that it can use the large
579      * decrementer and its width in bits.
580      */
581     if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF)
582         _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
583                               pcc->lrg_decr_bits)));
584 }
585 
586 static void spapr_populate_cpus_dt_node(void *fdt, SpaprMachineState *spapr)
587 {
588     CPUState **rev;
589     CPUState *cs;
590     int n_cpus;
591     int cpus_offset;
592     char *nodename;
593     int i;
594 
595     cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
596     _FDT(cpus_offset);
597     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
598     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));
599 
600     /*
601      * We walk the CPUs in reverse order to ensure that CPU DT nodes
602      * created by fdt_add_subnode() end up in the right order in FDT
603      * for the guest kernel the enumerate the CPUs correctly.
604      *
605      * The CPU list cannot be traversed in reverse order, so we need
606      * to do extra work.
607      */
608     n_cpus = 0;
609     rev = NULL;
610     CPU_FOREACH(cs) {
611         rev = g_renew(CPUState *, rev, n_cpus + 1);
612         rev[n_cpus++] = cs;
613     }
614 
615     for (i = n_cpus - 1; i >= 0; i--) {
616         CPUState *cs = rev[i];
617         PowerPCCPU *cpu = POWERPC_CPU(cs);
618         int index = spapr_get_vcpu_id(cpu);
619         DeviceClass *dc = DEVICE_GET_CLASS(cs);
620         int offset;
621 
622         if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
623             continue;
624         }
625 
626         nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
627         offset = fdt_add_subnode(fdt, cpus_offset, nodename);
628         g_free(nodename);
629         _FDT(offset);
630         spapr_populate_cpu_dt(cs, fdt, offset, spapr);
631     }
632 
633     g_free(rev);
634 }
635 
/*
 * Add the "/ibm,platform-facilities" node and its "ibm,random-v1" child
 * to @fdt.
 *
 * Returns 0 on success and -1 if a node could not be created or any
 * property could not be set.
 */
static int spapr_rng_populate_dt(void *fdt)
{
    int facilities, rng;
    int err;

    facilities = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
    if (facilities <= 0) {
        return -1;
    }

    /* Accumulate failures; any non-zero bit means something went wrong */
    err = fdt_setprop_string(fdt, facilities, "device_type",
                             "ibm,platform-facilities");
    err |= fdt_setprop_cell(fdt, facilities, "#address-cells", 0x1);
    err |= fdt_setprop_cell(fdt, facilities, "#size-cells", 0x0);

    rng = fdt_add_subnode(fdt, facilities, "ibm,random-v1");
    if (rng <= 0) {
        return -1;
    }
    err |= fdt_setprop_string(fdt, rng, "compatible", "ibm,random");

    if (err) {
        return -1;
    }
    return 0;
}
658 
659 static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
660 {
661     MemoryDeviceInfoList *info;
662 
663     for (info = list; info; info = info->next) {
664         MemoryDeviceInfo *value = info->value;
665 
666         if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
667             PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;
668 
669             if (addr >= pcdimm_info->addr &&
670                 addr < (pcdimm_info->addr + pcdimm_info->size)) {
671                 return pcdimm_info->node;
672             }
673         }
674     }
675 
676     return -1;
677 }
678 
/*
 * One entry of the "ibm,dynamic-memory-v2" property. The struct is packed
 * because entries are copied byte-for-byte into the property buffer;
 * spapr_get_drconf_cell() stores every field in big-endian order.
 */
struct sPAPRDrconfCellV2 {
     uint32_t seq_lmbs;   /* number of consecutive LMBs this entry covers */
     uint64_t base_addr;  /* address of the first LMB */
     uint32_t drc_index;  /* DRC index of the first LMB */
     uint32_t aa_index;   /* NUMA node of the LMBs, or -1 */
     uint32_t flags;      /* SPAPR_LMB_FLAGS_* bits */
} QEMU_PACKED;
686 
/*
 * Queue element wrapping one sPAPRDrconfCellV2 so that cells can be
 * collected on a QSIMPLEQ before the property buffer is sized.
 */
typedef struct DrconfCellQueue {
    struct sPAPRDrconfCellV2 cell;
    QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
} DrconfCellQueue;
691 
692 static DrconfCellQueue *
693 spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
694                       uint32_t drc_index, uint32_t aa_index,
695                       uint32_t flags)
696 {
697     DrconfCellQueue *elem;
698 
699     elem = g_malloc0(sizeof(*elem));
700     elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
701     elem->cell.base_addr = cpu_to_be64(base_addr);
702     elem->cell.drc_index = cpu_to_be32(drc_index);
703     elem->cell.aa_index = cpu_to_be32(aa_index);
704     elem->cell.flags = cpu_to_be32(flags);
705 
706     return elem;
707 }
708 
709 /* ibm,dynamic-memory-v2 */
710 static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt,
711                                    int offset, MemoryDeviceInfoList *dimms)
712 {
713     MachineState *machine = MACHINE(spapr);
714     uint8_t *int_buf, *cur_index;
715     int ret;
716     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
717     uint64_t addr, cur_addr, size;
718     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
719     uint64_t mem_end = machine->device_memory->base +
720                        memory_region_size(&machine->device_memory->mr);
721     uint32_t node, buf_len, nr_entries = 0;
722     SpaprDrc *drc;
723     DrconfCellQueue *elem, *next;
724     MemoryDeviceInfoList *info;
725     QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
726         = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
727 
728     /* Entry to cover RAM and the gap area */
729     elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
730                                  SPAPR_LMB_FLAGS_RESERVED |
731                                  SPAPR_LMB_FLAGS_DRC_INVALID);
732     QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
733     nr_entries++;
734 
735     cur_addr = machine->device_memory->base;
736     for (info = dimms; info; info = info->next) {
737         PCDIMMDeviceInfo *di = info->value->u.dimm.data;
738 
739         addr = di->addr;
740         size = di->size;
741         node = di->node;
742 
743         /* Entry for hot-pluggable area */
744         if (cur_addr < addr) {
745             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
746             g_assert(drc);
747             elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
748                                          cur_addr, spapr_drc_index(drc), -1, 0);
749             QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
750             nr_entries++;
751         }
752 
753         /* Entry for DIMM */
754         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
755         g_assert(drc);
756         elem = spapr_get_drconf_cell(size / lmb_size, addr,
757                                      spapr_drc_index(drc), node,
758                                      SPAPR_LMB_FLAGS_ASSIGNED);
759         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
760         nr_entries++;
761         cur_addr = addr + size;
762     }
763 
764     /* Entry for remaining hotpluggable area */
765     if (cur_addr < mem_end) {
766         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
767         g_assert(drc);
768         elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
769                                      cur_addr, spapr_drc_index(drc), -1, 0);
770         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
771         nr_entries++;
772     }
773 
774     buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
775     int_buf = cur_index = g_malloc0(buf_len);
776     *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
777     cur_index += sizeof(nr_entries);
778 
779     QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
780         memcpy(cur_index, &elem->cell, sizeof(elem->cell));
781         cur_index += sizeof(elem->cell);
782         QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
783         g_free(elem);
784     }
785 
786     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
787     g_free(int_buf);
788     if (ret < 0) {
789         return -1;
790     }
791     return 0;
792 }
793 
794 /* ibm,dynamic-memory */
795 static int spapr_populate_drmem_v1(SpaprMachineState *spapr, void *fdt,
796                                    int offset, MemoryDeviceInfoList *dimms)
797 {
798     MachineState *machine = MACHINE(spapr);
799     int i, ret;
800     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
801     uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
802     uint32_t nr_lmbs = (machine->device_memory->base +
803                        memory_region_size(&machine->device_memory->mr)) /
804                        lmb_size;
805     uint32_t *int_buf, *cur_index, buf_len;
806 
807     /*
808      * Allocate enough buffer size to fit in ibm,dynamic-memory
809      */
810     buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
811     cur_index = int_buf = g_malloc0(buf_len);
812     int_buf[0] = cpu_to_be32(nr_lmbs);
813     cur_index++;
814     for (i = 0; i < nr_lmbs; i++) {
815         uint64_t addr = i * lmb_size;
816         uint32_t *dynamic_memory = cur_index;
817 
818         if (i >= device_lmb_start) {
819             SpaprDrc *drc;
820 
821             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
822             g_assert(drc);
823 
824             dynamic_memory[0] = cpu_to_be32(addr >> 32);
825             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
826             dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
827             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
828             dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
829             if (memory_region_present(get_system_memory(), addr)) {
830                 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
831             } else {
832                 dynamic_memory[5] = cpu_to_be32(0);
833             }
834         } else {
835             /*
836              * LMB information for RMA, boot time RAM and gap b/n RAM and
837              * device memory region -- all these are marked as reserved
838              * and as having no valid DRC.
839              */
840             dynamic_memory[0] = cpu_to_be32(addr >> 32);
841             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
842             dynamic_memory[2] = cpu_to_be32(0);
843             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
844             dynamic_memory[4] = cpu_to_be32(-1);
845             dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
846                                             SPAPR_LMB_FLAGS_DRC_INVALID);
847         }
848 
849         cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
850     }
851     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
852     g_free(int_buf);
853     if (ret < 0) {
854         return -1;
855     }
856     return 0;
857 }
858 
859 /*
860  * Adds ibm,dynamic-reconfiguration-memory node.
861  * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
862  * of this device tree node.
863  */
864 static int spapr_populate_drconf_memory(SpaprMachineState *spapr, void *fdt)
865 {
866     MachineState *machine = MACHINE(spapr);
867     int ret, i, offset;
868     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
869     uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
870     uint32_t *int_buf, *cur_index, buf_len;
871     int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
872     MemoryDeviceInfoList *dimms = NULL;
873 
874     /*
875      * Don't create the node if there is no device memory
876      */
877     if (machine->ram_size == machine->maxram_size) {
878         return 0;
879     }
880 
881     offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
882 
883     ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
884                     sizeof(prop_lmb_size));
885     if (ret < 0) {
886         return ret;
887     }
888 
889     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
890     if (ret < 0) {
891         return ret;
892     }
893 
894     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
895     if (ret < 0) {
896         return ret;
897     }
898 
899     /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
900     dimms = qmp_memory_device_list();
901     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
902         ret = spapr_populate_drmem_v2(spapr, fdt, offset, dimms);
903     } else {
904         ret = spapr_populate_drmem_v1(spapr, fdt, offset, dimms);
905     }
906     qapi_free_MemoryDeviceInfoList(dimms);
907 
908     if (ret < 0) {
909         return ret;
910     }
911 
912     /* ibm,associativity-lookup-arrays */
913     buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t);
914     cur_index = int_buf = g_malloc0(buf_len);
915     int_buf[0] = cpu_to_be32(nr_nodes);
916     int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */
917     cur_index += 2;
918     for (i = 0; i < nr_nodes; i++) {
919         uint32_t associativity[] = {
920             cpu_to_be32(0x0),
921             cpu_to_be32(0x0),
922             cpu_to_be32(0x0),
923             cpu_to_be32(i)
924         };
925         memcpy(cur_index, associativity, sizeof(associativity));
926         cur_index += 4;
927     }
928     ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
929             (cur_index - int_buf) * sizeof(uint32_t));
930     g_free(int_buf);
931 
932     return ret;
933 }
934 
935 static int spapr_dt_cas_updates(SpaprMachineState *spapr, void *fdt,
936                                 SpaprOptionVector *ov5_updates)
937 {
938     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
939     int ret = 0, offset;
940 
941     /* Generate ibm,dynamic-reconfiguration-memory node if required */
942     if (spapr_ovec_test(ov5_updates, OV5_DRCONF_MEMORY)) {
943         g_assert(smc->dr_lmb_enabled);
944         ret = spapr_populate_drconf_memory(spapr, fdt);
945         if (ret) {
946             goto out;
947         }
948     }
949 
950     offset = fdt_path_offset(fdt, "/chosen");
951     if (offset < 0) {
952         offset = fdt_add_subnode(fdt, 0, "chosen");
953         if (offset < 0) {
954             return offset;
955         }
956     }
957     ret = spapr_ovec_populate_dt(fdt, offset, spapr->ov5_cas,
958                                  "ibm,architecture-vec-5");
959 
960 out:
961     return ret;
962 }
963 
964 static bool spapr_hotplugged_dev_before_cas(void)
965 {
966     Object *drc_container, *obj;
967     ObjectProperty *prop;
968     ObjectPropertyIterator iter;
969 
970     drc_container = container_get(object_get_root(), "/dr-connector");
971     object_property_iter_init(&iter, drc_container);
972     while ((prop = object_property_iter_next(&iter))) {
973         if (!strstart(prop->type, "link<", NULL)) {
974             continue;
975         }
976         obj = object_property_get_link(drc_container, prop->name, NULL);
977         if (spapr_drc_needed(obj)) {
978             return true;
979         }
980     }
981     return false;
982 }
983 
984 int spapr_h_cas_compose_response(SpaprMachineState *spapr,
985                                  target_ulong addr, target_ulong size,
986                                  SpaprOptionVector *ov5_updates)
987 {
988     void *fdt, *fdt_skel;
989     SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 };
990 
991     if (spapr_hotplugged_dev_before_cas()) {
992         return 1;
993     }
994 
995     if (size < sizeof(hdr) || size > FW_MAX_SIZE) {
996         error_report("SLOF provided an unexpected CAS buffer size "
997                      TARGET_FMT_lu " (min: %zu, max: %u)",
998                      size, sizeof(hdr), FW_MAX_SIZE);
999         exit(EXIT_FAILURE);
1000     }
1001 
1002     size -= sizeof(hdr);
1003 
1004     /* Create skeleton */
1005     fdt_skel = g_malloc0(size);
1006     _FDT((fdt_create(fdt_skel, size)));
1007     _FDT((fdt_finish_reservemap(fdt_skel)));
1008     _FDT((fdt_begin_node(fdt_skel, "")));
1009     _FDT((fdt_end_node(fdt_skel)));
1010     _FDT((fdt_finish(fdt_skel)));
1011     fdt = g_malloc0(size);
1012     _FDT((fdt_open_into(fdt_skel, fdt, size)));
1013     g_free(fdt_skel);
1014 
1015     /* Fixup cpu nodes */
1016     _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
1017 
1018     if (spapr_dt_cas_updates(spapr, fdt, ov5_updates)) {
1019         return -1;
1020     }
1021 
1022     /* Pack resulting tree */
1023     _FDT((fdt_pack(fdt)));
1024 
1025     if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
1026         trace_spapr_cas_failed(size);
1027         return -1;
1028     }
1029 
1030     cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
1031     cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
1032     trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
1033     g_free(fdt);
1034 
1035     return 0;
1036 }
1037 
1038 static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
1039 {
1040     MachineState *ms = MACHINE(spapr);
1041     int rtas;
1042     GString *hypertas = g_string_sized_new(256);
1043     GString *qemu_hypertas = g_string_sized_new(256);
1044     uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
1045     uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
1046         memory_region_size(&MACHINE(spapr)->device_memory->mr);
1047     uint32_t lrdr_capacity[] = {
1048         cpu_to_be32(max_device_addr >> 32),
1049         cpu_to_be32(max_device_addr & 0xffffffff),
1050         0, cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE),
1051         cpu_to_be32(ms->smp.max_cpus / ms->smp.threads),
1052     };
1053     uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0);
1054     uint32_t maxdomains[] = {
1055         cpu_to_be32(4),
1056         maxdomain,
1057         maxdomain,
1058         maxdomain,
1059         cpu_to_be32(spapr->gpu_numa_id),
1060     };
1061 
1062     _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));
1063 
1064     /* hypertas */
1065     add_str(hypertas, "hcall-pft");
1066     add_str(hypertas, "hcall-term");
1067     add_str(hypertas, "hcall-dabr");
1068     add_str(hypertas, "hcall-interrupt");
1069     add_str(hypertas, "hcall-tce");
1070     add_str(hypertas, "hcall-vio");
1071     add_str(hypertas, "hcall-splpar");
1072     add_str(hypertas, "hcall-bulk");
1073     add_str(hypertas, "hcall-set-mode");
1074     add_str(hypertas, "hcall-sprg0");
1075     add_str(hypertas, "hcall-copy");
1076     add_str(hypertas, "hcall-debug");
1077     add_str(hypertas, "hcall-vphn");
1078     add_str(qemu_hypertas, "hcall-memop1");
1079 
1080     if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
1081         add_str(hypertas, "hcall-multi-tce");
1082     }
1083 
1084     if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
1085         add_str(hypertas, "hcall-hpt-resize");
1086     }
1087 
1088     _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
1089                      hypertas->str, hypertas->len));
1090     g_string_free(hypertas, TRUE);
1091     _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
1092                      qemu_hypertas->str, qemu_hypertas->len));
1093     g_string_free(qemu_hypertas, TRUE);
1094 
1095     _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
1096                      refpoints, sizeof(refpoints)));
1097 
1098     _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
1099                      maxdomains, sizeof(maxdomains)));
1100 
1101     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
1102                           RTAS_ERROR_LOG_MAX));
1103     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
1104                           RTAS_EVENT_SCAN_RATE));
1105 
1106     g_assert(msi_nonbroken);
1107     _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));
1108 
1109     /*
1110      * According to PAPR, rtas ibm,os-term does not guarantee a return
1111      * back to the guest cpu.
1112      *
1113      * While an additional ibm,extended-os-term property indicates
1114      * that rtas call return will always occur. Set this property.
1115      */
1116     _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));
1117 
1118     _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
1119                      lrdr_capacity, sizeof(lrdr_capacity)));
1120 
1121     spapr_dt_rtas_tokens(fdt, rtas);
1122 }
1123 
1124 /*
1125  * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
1126  * and the XIVE features that the guest may request and thus the valid
1127  * values for bytes 23..26 of option vector 5:
1128  */
1129 static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
1130                                           int chosen)
1131 {
1132     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
1133 
1134     char val[2 * 4] = {
1135         23, spapr->irq->ov5, /* Xive mode. */
1136         24, 0x00, /* Hash/Radix, filled in below. */
1137         25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
1138         26, 0x40, /* Radix options: GTSE == yes. */
1139     };
1140 
1141     if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
1142                           first_ppc_cpu->compat_pvr)) {
1143         /*
1144          * If we're in a pre POWER9 compat mode then the guest should
1145          * do hash and use the legacy interrupt mode
1146          */
1147         val[1] = 0x00; /* XICS */
1148         val[3] = 0x00; /* Hash */
1149     } else if (kvm_enabled()) {
1150         if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
1151             val[3] = 0x80; /* OV5_MMU_BOTH */
1152         } else if (kvmppc_has_cap_mmu_radix()) {
1153             val[3] = 0x40; /* OV5_MMU_RADIX_300 */
1154         } else {
1155             val[3] = 0x00; /* Hash */
1156         }
1157     } else {
1158         /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
1159         val[3] = 0xC0;
1160     }
1161     _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
1162                      val, sizeof(val)));
1163 }
1164 
1165 static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt)
1166 {
1167     MachineState *machine = MACHINE(spapr);
1168     int chosen;
1169     const char *boot_device = machine->boot_order;
1170     char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
1171     size_t cb = 0;
1172     char *bootlist = get_boot_devices_list(&cb);
1173 
1174     _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
1175 
1176     _FDT(fdt_setprop_string(fdt, chosen, "bootargs", machine->kernel_cmdline));
1177     _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
1178                           spapr->initrd_base));
1179     _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
1180                           spapr->initrd_base + spapr->initrd_size));
1181 
1182     if (spapr->kernel_size) {
1183         uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
1184                               cpu_to_be64(spapr->kernel_size) };
1185 
1186         _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
1187                          &kprop, sizeof(kprop)));
1188         if (spapr->kernel_le) {
1189             _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
1190         }
1191     }
1192     if (boot_menu) {
1193         _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", boot_menu)));
1194     }
1195     _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
1196     _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
1197     _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));
1198 
1199     if (cb && bootlist) {
1200         int i;
1201 
1202         for (i = 0; i < cb; i++) {
1203             if (bootlist[i] == '\n') {
1204                 bootlist[i] = ' ';
1205             }
1206         }
1207         _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
1208     }
1209 
1210     if (boot_device && strlen(boot_device)) {
1211         _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
1212     }
1213 
1214     if (!spapr->has_graphics && stdout_path) {
1215         /*
1216          * "linux,stdout-path" and "stdout" properties are deprecated by linux
1217          * kernel. New platforms should only use the "stdout-path" property. Set
1218          * the new property and continue using older property to remain
1219          * compatible with the existing firmware.
1220          */
1221         _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
1222         _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
1223     }
1224 
1225     spapr_dt_ov5_platform_support(spapr, fdt, chosen);
1226 
1227     g_free(stdout_path);
1228     g_free(bootlist);
1229 }
1230 
1231 static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
1232 {
1233     /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
1234      * KVM to work under pHyp with some guest co-operation */
1235     int hypervisor;
1236     uint8_t hypercall[16];
1237 
1238     _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
1239     /* indicate KVM hypercall interface */
1240     _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
1241     if (kvmppc_has_cap_fixup_hcalls()) {
1242         /*
1243          * Older KVM versions with older guest kernels were broken
1244          * with the magic page, don't allow the guest to map it.
1245          */
1246         if (!kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
1247                                   sizeof(hypercall))) {
1248             _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
1249                              hypercall, sizeof(hypercall)));
1250         }
1251     }
1252 }
1253 
1254 static void *spapr_build_fdt(SpaprMachineState *spapr)
1255 {
1256     MachineState *machine = MACHINE(spapr);
1257     MachineClass *mc = MACHINE_GET_CLASS(machine);
1258     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
1259     int ret;
1260     void *fdt;
1261     SpaprPhbState *phb;
1262     char *buf;
1263 
1264     fdt = g_malloc0(FDT_MAX_SIZE);
1265     _FDT((fdt_create_empty_tree(fdt, FDT_MAX_SIZE)));
1266 
1267     /* Root node */
1268     _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
1269     _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
1270     _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));
1271 
1272     /* Guest UUID & Name*/
1273     buf = qemu_uuid_unparse_strdup(&qemu_uuid);
1274     _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
1275     if (qemu_uuid_set) {
1276         _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
1277     }
1278     g_free(buf);
1279 
1280     if (qemu_get_vm_name()) {
1281         _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
1282                                 qemu_get_vm_name()));
1283     }
1284 
1285     /* Host Model & Serial Number */
1286     if (spapr->host_model) {
1287         _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
1288     } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
1289         _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
1290         g_free(buf);
1291     }
1292 
1293     if (spapr->host_serial) {
1294         _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
1295     } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
1296         _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
1297         g_free(buf);
1298     }
1299 
1300     _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
1301     _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));
1302 
1303     /* /interrupt controller */
1304     spapr->irq->dt_populate(spapr, spapr_max_server_number(spapr), fdt,
1305                           PHANDLE_INTC);
1306 
1307     ret = spapr_populate_memory(spapr, fdt);
1308     if (ret < 0) {
1309         error_report("couldn't setup memory nodes in fdt");
1310         exit(1);
1311     }
1312 
1313     /* /vdevice */
1314     spapr_dt_vdevice(spapr->vio_bus, fdt);
1315 
1316     if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
1317         ret = spapr_rng_populate_dt(fdt);
1318         if (ret < 0) {
1319             error_report("could not set up rng device in the fdt");
1320             exit(1);
1321         }
1322     }
1323 
1324     QLIST_FOREACH(phb, &spapr->phbs, list) {
1325         ret = spapr_dt_phb(phb, PHANDLE_INTC, fdt, spapr->irq->nr_msis, NULL);
1326         if (ret < 0) {
1327             error_report("couldn't setup PCI devices in fdt");
1328             exit(1);
1329         }
1330     }
1331 
1332     /* cpus */
1333     spapr_populate_cpus_dt_node(fdt, spapr);
1334 
1335     if (smc->dr_lmb_enabled) {
1336         _FDT(spapr_dt_drc(fdt, 0, NULL, SPAPR_DR_CONNECTOR_TYPE_LMB));
1337     }
1338 
1339     if (mc->has_hotpluggable_cpus) {
1340         int offset = fdt_path_offset(fdt, "/cpus");
1341         ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU);
1342         if (ret < 0) {
1343             error_report("Couldn't set up CPU DR device tree properties");
1344             exit(1);
1345         }
1346     }
1347 
1348     /* /event-sources */
1349     spapr_dt_events(spapr, fdt);
1350 
1351     /* /rtas */
1352     spapr_dt_rtas(spapr, fdt);
1353 
1354     /* /chosen */
1355     spapr_dt_chosen(spapr, fdt);
1356 
1357     /* /hypervisor */
1358     if (kvm_enabled()) {
1359         spapr_dt_hypervisor(spapr, fdt);
1360     }
1361 
1362     /* Build memory reserve map */
1363     if (spapr->kernel_size) {
1364         _FDT((fdt_add_mem_rsv(fdt, KERNEL_LOAD_ADDR, spapr->kernel_size)));
1365     }
1366     if (spapr->initrd_size) {
1367         _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base, spapr->initrd_size)));
1368     }
1369 
1370     /* ibm,client-architecture-support updates */
1371     ret = spapr_dt_cas_updates(spapr, fdt, spapr->ov5_cas);
1372     if (ret < 0) {
1373         error_report("couldn't setup CAS properties fdt");
1374         exit(1);
1375     }
1376 
1377     if (smc->dr_phb_enabled) {
1378         ret = spapr_dt_drc(fdt, 0, NULL, SPAPR_DR_CONNECTOR_TYPE_PHB);
1379         if (ret < 0) {
1380             error_report("Couldn't set up PHB DR device tree properties");
1381             exit(1);
1382         }
1383     }
1384 
1385     return fdt;
1386 }
1387 
1388 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
1389 {
1390     return (addr & 0x0fffffff) + KERNEL_LOAD_ADDR;
1391 }
1392 
1393 static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
1394                                     PowerPCCPU *cpu)
1395 {
1396     CPUPPCState *env = &cpu->env;
1397 
1398     /* The TCG path should also be holding the BQL at this point */
1399     g_assert(qemu_mutex_iothread_locked());
1400 
1401     if (msr_pr) {
1402         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
1403         env->gpr[3] = H_PRIVILEGE;
1404     } else {
1405         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
1406     }
1407 }
1408 
/*
 * Argument bundle handed to do_lpcr_sync() through run_on_cpu(): the
 * LPCR bits selected by @mask are cleared and then @value is ORed in.
 */
struct LPCRSyncState {
    target_ulong value; /* bits to set after the mask has been applied */
    target_ulong mask;  /* bits to clear before @value is ORed in */
};
1413 
1414 static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
1415 {
1416     struct LPCRSyncState *s = arg.host_ptr;
1417     PowerPCCPU *cpu = POWERPC_CPU(cs);
1418     CPUPPCState *env = &cpu->env;
1419     target_ulong lpcr;
1420 
1421     cpu_synchronize_state(cs);
1422     lpcr = env->spr[SPR_LPCR];
1423     lpcr &= ~s->mask;
1424     lpcr |= s->value;
1425     ppc_store_lpcr(cpu, lpcr);
1426 }
1427 
1428 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
1429 {
1430     CPUState *cs;
1431     struct LPCRSyncState s = {
1432         .value = value,
1433         .mask = mask
1434     };
1435     CPU_FOREACH(cs) {
1436         run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
1437     }
1438 }
1439 
1440 static void spapr_get_pate(PPCVirtualHypervisor *vhyp, ppc_v3_pate_t *entry)
1441 {
1442     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1443 
1444     /* Copy PATE1:GR into PATE0:HR */
1445     entry->dw0 = spapr->patb_entry & PATE0_HR;
1446     entry->dw1 = spapr->patb_entry;
1447 }
1448 
/*
 * Helpers for hash page table entries stored in a flat buffer.
 *
 * HPTE(_table, _i)  - pointer to entry _i, each entry being two
 *                     uint64_t words
 * HPTE_VALID(_hpte) - non-zero if HPTE64_V_VALID is set in the first word
 * HPTE_DIRTY(_hpte) - non-zero if HPTE64_V_HPTE_DIRTY is set in the
 *                     first word
 * CLEAN_HPTE(_hpte) - clear HPTE64_V_HPTE_DIRTY in the first word
 * DIRTY_HPTE(_hpte) - set HPTE64_V_HPTE_DIRTY in the first word
 *
 * The first word is accessed in place; tswap64() is applied to the
 * value read or to the flag mask.
 */
#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
#define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))
1454 
1455 /*
1456  * Get the fd to access the kernel htab, re-opening it if necessary
1457  */
1458 static int get_htab_fd(SpaprMachineState *spapr)
1459 {
1460     Error *local_err = NULL;
1461 
1462     if (spapr->htab_fd >= 0) {
1463         return spapr->htab_fd;
1464     }
1465 
1466     spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
1467     if (spapr->htab_fd < 0) {
1468         error_report_err(local_err);
1469     }
1470 
1471     return spapr->htab_fd;
1472 }
1473 
1474 void close_htab_fd(SpaprMachineState *spapr)
1475 {
1476     if (spapr->htab_fd >= 0) {
1477         close(spapr->htab_fd);
1478     }
1479     spapr->htab_fd = -1;
1480 }
1481 
1482 static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
1483 {
1484     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1485 
1486     return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
1487 }
1488 
1489 static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
1490 {
1491     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1492 
1493     assert(kvm_enabled());
1494 
1495     if (!spapr->htab) {
1496         return 0;
1497     }
1498 
1499     return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
1500 }
1501 
1502 static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
1503                                                 hwaddr ptex, int n)
1504 {
1505     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1506     hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
1507 
1508     if (!spapr->htab) {
1509         /*
1510          * HTAB is controlled by KVM. Fetch into temporary buffer
1511          */
1512         ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
1513         kvmppc_read_hptes(hptes, ptex, n);
1514         return hptes;
1515     }
1516 
1517     /*
1518      * HTAB is controlled by QEMU. Just point to the internally
1519      * accessible PTEG.
1520      */
1521     return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
1522 }
1523 
1524 static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
1525                               const ppc_hash_pte64_t *hptes,
1526                               hwaddr ptex, int n)
1527 {
1528     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1529 
1530     if (!spapr->htab) {
1531         g_free((void *)hptes);
1532     }
1533 
1534     /* Nothing to do for qemu managed HPT */
1535 }
1536 
1537 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
1538                       uint64_t pte0, uint64_t pte1)
1539 {
1540     SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
1541     hwaddr offset = ptex * HASH_PTE_SIZE_64;
1542 
1543     if (!spapr->htab) {
1544         kvmppc_write_hpte(ptex, pte0, pte1);
1545     } else {
1546         if (pte0 & HPTE64_V_VALID) {
1547             stq_p(spapr->htab + offset + HASH_PTE_SIZE_64 / 2, pte1);
1548             /*
1549              * When setting valid, we write PTE1 first. This ensures
1550              * proper synchronization with the reading code in
1551              * ppc_hash64_pteg_search()
1552              */
1553             smp_wmb();
1554             stq_p(spapr->htab + offset, pte0);
1555         } else {
1556             stq_p(spapr->htab + offset, pte0);
1557             /*
1558              * When clearing it we set PTE0 first. This ensures proper
1559              * synchronization with the reading code in
1560              * ppc_hash64_pteg_search()
1561              */
1562             smp_wmb();
1563             stq_p(spapr->htab + offset + HASH_PTE_SIZE_64 / 2, pte1);
1564         }
1565     }
1566 }
1567 
1568 static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1569                              uint64_t pte1)
1570 {
1571     hwaddr offset = ptex * HASH_PTE_SIZE_64 + 15;
1572     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1573 
1574     if (!spapr->htab) {
1575         /* There should always be a hash table when this is called */
1576         error_report("spapr_hpte_set_c called with no hash table !");
1577         return;
1578     }
1579 
1580     /* The HW performs a non-atomic byte update */
1581     stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
1582 }
1583 
1584 static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1585                              uint64_t pte1)
1586 {
1587     hwaddr offset = ptex * HASH_PTE_SIZE_64 + 14;
1588     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1589 
1590     if (!spapr->htab) {
1591         /* There should always be a hash table when this is called */
1592         error_report("spapr_hpte_set_r called with no hash table !");
1593         return;
1594     }
1595 
1596     /* The HW performs a non-atomic byte update */
1597     stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
1598 }
1599 
/*
 * Return the hash table order (log2 of its size in bytes) to use for
 * @ramsize bytes of RAM.
 *
 * The target is a table of 1/128 the size of RAM (rounded up).  The
 * PAPR recommendation is actually 1/64 of RAM size, but that's much
 * more than is needed for Linux guests.  The result is clamped to the
 * architected range of 18 to 46.
 */
int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
{
    int order = ctz64(pow2ceil(ramsize)) - 7;

    if (order < 18) {
        return 18; /* Minimum architected size */
    }
    if (order > 46) {
        return 46; /* Maximum architected size */
    }
    return order;
}
1612 
1613 void spapr_free_hpt(SpaprMachineState *spapr)
1614 {
1615     g_free(spapr->htab);
1616     spapr->htab = NULL;
1617     spapr->htab_shift = 0;
1618     close_htab_fd(spapr);
1619 }
1620 
1621 void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift,
1622                           Error **errp)
1623 {
1624     long rc;
1625 
1626     /* Clean up any HPT info from a previous boot */
1627     spapr_free_hpt(spapr);
1628 
1629     rc = kvmppc_reset_htab(shift);
1630     if (rc < 0) {
1631         /* kernel-side HPT needed, but couldn't allocate one */
1632         error_setg_errno(errp, errno,
1633                          "Failed to allocate KVM HPT of order %d (try smaller maxmem?)",
1634                          shift);
1635         /* This is almost certainly fatal, but if the caller really
1636          * wants to carry on with shift == 0, it's welcome to try */
1637     } else if (rc > 0) {
1638         /* kernel-side HPT allocated */
1639         if (rc != shift) {
1640             error_setg(errp,
1641                        "Requested order %d HPT, but kernel allocated order %ld (try smaller maxmem?)",
1642                        shift, rc);
1643         }
1644 
1645         spapr->htab_shift = shift;
1646         spapr->htab = NULL;
1647     } else {
1648         /* kernel-side HPT not needed, allocate in userspace instead */
1649         size_t size = 1ULL << shift;
1650         int i;
1651 
1652         spapr->htab = qemu_memalign(size, size);
1653         if (!spapr->htab) {
1654             error_setg_errno(errp, errno,
1655                              "Could not allocate HPT of order %d", shift);
1656             return;
1657         }
1658 
1659         memset(spapr->htab, 0, size);
1660         spapr->htab_shift = shift;
1661 
1662         for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
1663             DIRTY_HPTE(HPTE(spapr->htab, i));
1664         }
1665     }
1666     /* We're setting up a hash table, so that means we're not radix */
1667     spapr->patb_entry = 0;
1668     spapr_set_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
1669 }
1670 
1671 void spapr_setup_hpt_and_vrma(SpaprMachineState *spapr)
1672 {
1673     int hpt_shift;
1674 
1675     if ((spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED)
1676         || (spapr->cas_reboot
1677             && !spapr_ovec_test(spapr->ov5_cas, OV5_HPT_RESIZE))) {
1678         hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
1679     } else {
1680         uint64_t current_ram_size;
1681 
1682         current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
1683         hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
1684     }
1685     spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
1686 
1687     if (spapr->vrma_adjust) {
1688         spapr->rma_size = kvmppc_rma_size(spapr_node0_size(MACHINE(spapr)),
1689                                           spapr->htab_shift);
1690     }
1691 }
1692 
1693 static int spapr_reset_drcs(Object *child, void *opaque)
1694 {
1695     SpaprDrc *drc =
1696         (SpaprDrc *) object_dynamic_cast(child,
1697                                                  TYPE_SPAPR_DR_CONNECTOR);
1698 
1699     if (drc) {
1700         spapr_drc_reset(drc);
1701     }
1702 
1703     return 0;
1704 }
1705 
/*
 * Machine-level reset handler.
 *
 * In order, this: applies the sPAPR capabilities; selects either radix
 * mode without an HPT (KVM with radix support and a compatible CPU) or
 * sets up the HPT and VRMA; resets all devices; drops the CAS-negotiated
 * option vector unless the reset was triggered by CAS; resets interrupt
 * state; resets the DRCs; clears pending events; rebuilds the device
 * tree, loads RTAS and writes the FDT into guest memory; and finally
 * sets the entry state of the first CPU.
 */
static void spapr_machine_reset(MachineState *machine)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    PowerPCCPU *first_ppc_cpu;
    uint32_t rtas_limit;
    hwaddr rtas_addr, fdt_addr;
    void *fdt;
    int rc;

    spapr_caps_apply(spapr);

    first_ppc_cpu = POWERPC_CPU(first_cpu);
    if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                              spapr->max_compat_pvr)) {
        /*
         * If using KVM with radix mode available, VCPUs can be started
         * without a HPT because KVM will start them in radix mode.
         * Set the GR bit in PATE so that we know there is no HPT.
         */
        spapr->patb_entry = PATE1_GR;
        spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
    } else {
        spapr_setup_hpt_and_vrma(spapr);
    }

    /*
     * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
     * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
     * called from vPHB reset handler so we initialize the counter here.
     * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
     * must be equally distant from any other node.
     * The final value of spapr->gpu_numa_id is going to be written to
     * max-associativity-domains in spapr_build_fdt().
     */
    spapr->gpu_numa_id = MAX(1, nb_numa_nodes);
    qemu_devices_reset();

    /*
     * If this reset wasn't generated by CAS, we should reset our
     * negotiated options and start from scratch
     */
    if (!spapr->cas_reboot) {
        spapr_ovec_cleanup(spapr->ov5_cas);
        spapr->ov5_cas = spapr_ovec_new();

        ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
    }

    if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
        spapr_irq_msi_reset(spapr);
    }

    /*
     * This is fixing some of the default configuration of the XIVE
     * devices. To be called after the reset of the machine devices.
     */
    spapr_irq_reset(spapr, &error_fatal);

    /*
     * There is no CAS under qtest. Simulate one to please the code that
     * depends on spapr->ov5_cas. This is especially needed to test device
     * unplug, so we do that before resetting the DRCs.
     */
    if (qtest_enabled()) {
        spapr_ovec_cleanup(spapr->ov5_cas);
        spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
    }

    /* DRC reset may cause a device to be unplugged. This will cause troubles
     * if this device is used by another device (eg, a running vhost backend
     * will crash QEMU if the DIMM holding the vring goes away). To avoid such
     * situations, we reset DRCs after all devices have been reset.
     */
    object_child_foreach_recursive(object_get_root(), spapr_reset_drcs, NULL);

    spapr_clear_pending_events(spapr);

    /*
     * We place the device tree and RTAS just below either the top of the RMA,
     * or just below 2GB, whichever is lower, so that it can be
     * processed with 32-bit real mode code if necessary
     */
    rtas_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR);
    rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    fdt_addr = rtas_addr - FDT_MAX_SIZE;

    fdt = spapr_build_fdt(spapr);

    spapr_load_rtas(spapr, fdt, rtas_addr);

    rc = fdt_pack(fdt);

    /* Should only fail if we've built a corrupted tree */
    assert(rc == 0);

    /* The FDT slot below RTAS is FDT_MAX_SIZE bytes; refuse to overflow it */
    if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
        error_report("FDT too big ! 0x%x bytes (max is 0x%x)",
                     fdt_totalsize(fdt), FDT_MAX_SIZE);
        exit(1);
    }

    /* Load the fdt */
    qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
    cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
    /* Replace the previously kept blob with the newly built one */
    g_free(spapr->fdt_blob);
    spapr->fdt_size = fdt_totalsize(fdt);
    spapr->fdt_initial_size = spapr->fdt_size;
    spapr->fdt_blob = fdt;

    /* Set up the entry state */
    spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
    first_ppc_cpu->env.gpr[5] = 0;

    /* The CAS-reboot flag only applies to the reset that just completed */
    spapr->cas_reboot = false;
}
1822 
1823 static void spapr_create_nvram(SpaprMachineState *spapr)
1824 {
1825     DeviceState *dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
1826     DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1827 
1828     if (dinfo) {
1829         qdev_prop_set_drive(dev, "drive", blk_by_legacy_dinfo(dinfo),
1830                             &error_fatal);
1831     }
1832 
1833     qdev_init_nofail(dev);
1834 
1835     spapr->nvram = (struct SpaprNvram *)dev;
1836 }
1837 
1838 static void spapr_rtc_create(SpaprMachineState *spapr)
1839 {
1840     object_initialize_child(OBJECT(spapr), "rtc",
1841                             &spapr->rtc, sizeof(spapr->rtc), TYPE_SPAPR_RTC,
1842                             &error_fatal, NULL);
1843     object_property_set_bool(OBJECT(&spapr->rtc), true, "realized",
1844                               &error_fatal);
1845     object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
1846                               "date", &error_fatal);
1847 }
1848 
1849 /* Returns whether we want to use VGA or not */
1850 static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
1851 {
1852     switch (vga_interface_type) {
1853     case VGA_NONE:
1854         return false;
1855     case VGA_DEVICE:
1856         return true;
1857     case VGA_STD:
1858     case VGA_VIRTIO:
1859     case VGA_CIRRUS:
1860         return pci_vga_init(pci_bus) != NULL;
1861     default:
1862         error_setg(errp,
1863                    "Unsupported VGA mode, only -vga std or -vga virtio is supported");
1864         return false;
1865     }
1866 }
1867 
/*
 * pre_load hook of the "spapr" VMState: delegates to the capabilities
 * pre-load handler and passes its return code through unchanged.
 */
static int spapr_pre_load(void *opaque)
{
    return spapr_caps_pre_load(opaque);
}
1879 
1880 static int spapr_post_load(void *opaque, int version_id)
1881 {
1882     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1883     int err = 0;
1884 
1885     err = spapr_caps_post_migration(spapr);
1886     if (err) {
1887         return err;
1888     }
1889 
1890     /*
1891      * In earlier versions, there was no separate qdev for the PAPR
1892      * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1893      * So when migrating from those versions, poke the incoming offset
1894      * value into the RTC device
1895      */
1896     if (version_id < 3) {
1897         err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
1898         if (err) {
1899             return err;
1900         }
1901     }
1902 
1903     if (kvm_enabled() && spapr->patb_entry) {
1904         PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
1905         bool radix = !!(spapr->patb_entry & PATE1_GR);
1906         bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
1907 
1908         /*
1909          * Update LPCR:HR and UPRT as they may not be set properly in
1910          * the stream
1911          */
1912         spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
1913                             LPCR_HR | LPCR_UPRT);
1914 
1915         err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
1916         if (err) {
1917             error_report("Process table config unsupported by the host");
1918             return -EINVAL;
1919         }
1920     }
1921 
1922     err = spapr_irq_post_load(spapr, version_id);
1923     if (err) {
1924         return err;
1925     }
1926 
1927     return err;
1928 }
1929 
/*
 * pre_save hook of the "spapr" VMState: delegates to the capabilities
 * pre-save handler and passes its return code through unchanged.
 */
static int spapr_pre_save(void *opaque)
{
    return spapr_caps_pre_save(opaque);
}
1941 
/*
 * VMState field test: true when the incoming stream version is 1 or 2
 * (or anything below 3).  @opaque is unused.
 */
static bool version_before_3(void *opaque, int version_id)
{
    return version_id <= 2;
}
1946 
1947 static bool spapr_pending_events_needed(void *opaque)
1948 {
1949     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1950     return !QTAILQ_EMPTY(&spapr->pending_events);
1951 }
1952 
/*
 * Migration layout of a single SpaprEventLogEntry: the summary word, the
 * length of the extended log, then the extended log buffer itself, whose
 * size is taken from the extended_length field.
 */
static const VMStateDescription vmstate_spapr_event_entry = {
    .name = "spapr_event_log_entry",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(summary, SpaprEventLogEntry),
        VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
        VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
                                     NULL, extended_length),
        VMSTATE_END_OF_LIST()
    },
};
1965 
/*
 * Optional subsection carrying the machine's pending event queue, one
 * vmstate_spapr_event_entry per element.  Only included when
 * spapr_pending_events_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_pending_events = {
    .name = "spapr_pending_events",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_pending_events_needed,
    .fields = (VMStateField[]) {
        VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
                         vmstate_spapr_event_entry, SpaprEventLogEntry, next),
        VMSTATE_END_OF_LIST()
    },
};
1977 
1978 static bool spapr_ov5_cas_needed(void *opaque)
1979 {
1980     SpaprMachineState *spapr = opaque;
1981     SpaprOptionVector *ov5_mask = spapr_ovec_new();
1982     SpaprOptionVector *ov5_legacy = spapr_ovec_new();
1983     SpaprOptionVector *ov5_removed = spapr_ovec_new();
1984     bool cas_needed;
1985 
1986     /* Prior to the introduction of SpaprOptionVector, we had two option
1987      * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
1988      * Both of these options encode machine topology into the device-tree
1989      * in such a way that the now-booted OS should still be able to interact
1990      * appropriately with QEMU regardless of what options were actually
1991      * negotiatied on the source side.
1992      *
1993      * As such, we can avoid migrating the CAS-negotiated options if these
1994      * are the only options available on the current machine/platform.
1995      * Since these are the only options available for pseries-2.7 and
1996      * earlier, this allows us to maintain old->new/new->old migration
1997      * compatibility.
1998      *
1999      * For QEMU 2.8+, there are additional CAS-negotiatable options available
2000      * via default pseries-2.8 machines and explicit command-line parameters.
2001      * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
2002      * of the actual CAS-negotiated values to continue working properly. For
2003      * example, availability of memory unplug depends on knowing whether
2004      * OV5_HP_EVT was negotiated via CAS.
2005      *
2006      * Thus, for any cases where the set of available CAS-negotiatable
2007      * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
2008      * include the CAS-negotiated options in the migration stream, unless
2009      * if they affect boot time behaviour only.
2010      */
2011     spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
2012     spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
2013     spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
2014 
2015     /* spapr_ovec_diff returns true if bits were removed. we avoid using
2016      * the mask itself since in the future it's possible "legacy" bits may be
2017      * removed via machine options, which could generate a false positive
2018      * that breaks migration.
2019      */
2020     spapr_ovec_intersect(ov5_legacy, spapr->ov5, ov5_mask);
2021     cas_needed = spapr_ovec_diff(ov5_removed, spapr->ov5, ov5_legacy);
2022 
2023     spapr_ovec_cleanup(ov5_mask);
2024     spapr_ovec_cleanup(ov5_legacy);
2025     spapr_ovec_cleanup(ov5_removed);
2026 
2027     return cas_needed;
2028 }
2029 
/*
 * Optional subsection carrying the CAS-negotiated option vector
 * (spapr->ov5_cas).  Only included when spapr_ov5_cas_needed() returns
 * true.
 */
static const VMStateDescription vmstate_spapr_ov5_cas = {
    .name = "spapr_option_vector_ov5_cas",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_ov5_cas_needed,
    .fields = (VMStateField[]) {
        VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
                                 vmstate_spapr_ovec, SpaprOptionVector),
        VMSTATE_END_OF_LIST()
    },
};
2041 
2042 static bool spapr_patb_entry_needed(void *opaque)
2043 {
2044     SpaprMachineState *spapr = opaque;
2045 
2046     return !!spapr->patb_entry;
2047 }
2048 
/*
 * Optional subsection carrying spapr->patb_entry as a 64-bit value.
 * Only included when spapr_patb_entry_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_patb_entry = {
    .name = "spapr_patb_entry",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_patb_entry_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(patb_entry, SpaprMachineState),
        VMSTATE_END_OF_LIST()
    },
};
2059 
2060 static bool spapr_irq_map_needed(void *opaque)
2061 {
2062     SpaprMachineState *spapr = opaque;
2063 
2064     return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
2065 }
2066 
/*
 * Optional subsection carrying the irq_map bitmap, whose length in bits
 * is given by irq_map_nr.  Only included when spapr_irq_map_needed()
 * returns true.
 */
static const VMStateDescription vmstate_spapr_irq_map = {
    .name = "spapr_irq_map",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_irq_map_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
        VMSTATE_END_OF_LIST()
    },
};
2077 
2078 static bool spapr_dtb_needed(void *opaque)
2079 {
2080     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
2081 
2082     return smc->update_dt_enabled;
2083 }
2084 
2085 static int spapr_dtb_pre_load(void *opaque)
2086 {
2087     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2088 
2089     g_free(spapr->fdt_blob);
2090     spapr->fdt_blob = NULL;
2091     spapr->fdt_size = 0;
2092 
2093     return 0;
2094 }
2095 
/*
 * Optional subsection carrying the machine's flattened device tree: its
 * initial size, its current size, and the blob itself (sized by
 * fdt_size).  spapr_dtb_pre_load() drops the local blob before loading.
 * Only included when spapr_dtb_needed() returns true.
 */
static const VMStateDescription vmstate_spapr_dtb = {
    .name = "spapr_dtb",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_dtb_needed,
    .pre_load = spapr_dtb_pre_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
        VMSTATE_UINT32(fdt_size, SpaprMachineState),
        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
                                     fdt_size),
        VMSTATE_END_OF_LIST()
    },
};
2110 
/*
 * Top-level migration description of the sPAPR machine state (current
 * version 3, oldest accepted version 1).
 *
 * The main section holds only legacy fields and the timebase; everything
 * else travels in the optional subsections listed below.
 */
static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 3,
    .minimum_version_id = 1,
    .pre_load = spapr_pre_load,
    .post_load = spapr_post_load,
    .pre_save = spapr_pre_save,
    .fields = (VMStateField[]) {
        /* used to be @next_irq */
        VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),

        /* RTC offset */
        /* Only present in streams older than version 3 */
        VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),

        VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_spapr_ov5_cas,
        &vmstate_spapr_patb_entry,
        &vmstate_spapr_pending_events,
        &vmstate_spapr_cap_htm,
        &vmstate_spapr_cap_vsx,
        &vmstate_spapr_cap_dfp,
        &vmstate_spapr_cap_cfpc,
        &vmstate_spapr_cap_sbbc,
        &vmstate_spapr_cap_ibs,
        &vmstate_spapr_cap_hpt_maxpagesize,
        &vmstate_spapr_irq_map,
        &vmstate_spapr_cap_nested_kvm_hv,
        &vmstate_spapr_dtb,
        &vmstate_spapr_cap_large_decr,
        &vmstate_spapr_cap_ccf_assist,
        NULL
    }
};
2147 
2148 static int htab_save_setup(QEMUFile *f, void *opaque)
2149 {
2150     SpaprMachineState *spapr = opaque;
2151 
2152     /* "Iteration" header */
2153     if (!spapr->htab_shift) {
2154         qemu_put_be32(f, -1);
2155     } else {
2156         qemu_put_be32(f, spapr->htab_shift);
2157     }
2158 
2159     if (spapr->htab) {
2160         spapr->htab_save_index = 0;
2161         spapr->htab_first_pass = true;
2162     } else {
2163         if (spapr->htab_shift) {
2164             assert(kvm_enabled());
2165         }
2166     }
2167 
2168 
2169     return 0;
2170 }
2171 
2172 static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
2173                             int chunkstart, int n_valid, int n_invalid)
2174 {
2175     qemu_put_be32(f, chunkstart);
2176     qemu_put_be16(f, n_valid);
2177     qemu_put_be16(f, n_invalid);
2178     qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
2179                     HASH_PTE_SIZE_64 * n_valid);
2180 }
2181 
/*
 * Terminate a sequence of HPT chunks with an all-zero chunk header
 * (index 0, no valid entries, no invalid entries), which htab_load()
 * treats as end of stream.
 */
static void htab_save_end_marker(QEMUFile *f)
{
    qemu_put_be32(f, 0);
    qemu_put_be16(f, 0);
    qemu_put_be16(f, 0);
}
2188 
/*
 * First pass over a QEMU-held hash page table: send every valid HPTE.
 *
 * Scanning resumes at spapr->htab_save_index.  Runs of invalid entries
 * are skipped, runs of valid entries are sent as chunks, and CLEAN_HPTE
 * is applied to every entry visited.  The pass stops when the table end
 * is reached, when the file rate limit kicks in, or - unless @max_ns is
 * -1 - once more than @max_ns nanoseconds have elapsed after sending a
 * chunk.  On reaching the end, the cursor is rewound and
 * htab_first_pass is cleared.
 */
static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
                                 int64_t max_ns)
{
    bool has_timeout = max_ns != -1;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(spapr->htab_first_pass);

    do {
        int chunkstart;

        /* Consume invalid HPTEs */
        while ((index < htabslots)
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        /* Consume valid HPTEs */
        /* A chunk's entry count is written as 16 bits, hence USHRT_MAX */
        chunkstart = index;
        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
        }

        if (index > chunkstart) {
            int n_valid = index - chunkstart;

            htab_save_chunk(f, spapr, chunkstart, n_valid, 0);

            if (has_timeout &&
                (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }
    } while ((index < htabslots) && !qemu_file_rate_limit(f));

    /* Whole table visited: rewind and hand over to the later passes */
    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
        spapr->htab_first_pass = false;
    }
    spapr->htab_save_index = index;
}
2236 
/*
 * Later passes over a QEMU-held hash page table: send dirty HPTEs only.
 *
 * Scanning resumes at spapr->htab_save_index and wraps around the table.
 * Each chunk consists of a run of dirty valid entries (sent with their
 * contents) followed by a run of dirty invalid entries (sent as a count
 * only); CLEAN_HPTE is applied to every entry put in a chunk.
 *
 * A negative @max_ns selects the "final" mode, in which neither the time
 * budget nor the file rate limit stops the scan.  Otherwise the pass
 * stops once more than @max_ns nanoseconds have elapsed after sending a
 * chunk, or when the rate limit kicks in.  In either mode the scan ends
 * once htabslots entries have been examined.
 *
 * Returns 1 if at least htabslots entries were examined and nothing had
 * to be sent, 0 otherwise.
 */
static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
                                int64_t max_ns)
{
    bool final = max_ns < 0;
    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
    int examined = 0, sent = 0;
    int index = spapr->htab_save_index;
    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    assert(!spapr->htab_first_pass);

    do {
        int chunkstart, invalidstart;

        /* Consume non-dirty HPTEs */
        while ((index < htabslots)
               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
            index++;
            examined++;
        }

        chunkstart = index;
        /* Consume valid dirty HPTEs */
        /* Chunk counts are written as 16 bits, hence the USHRT_MAX caps */
        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        invalidstart = index;
        /* Consume invalid dirty HPTEs */
        while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
               && HPTE_DIRTY(HPTE(spapr->htab, index))
               && !HPTE_VALID(HPTE(spapr->htab, index))) {
            CLEAN_HPTE(HPTE(spapr->htab, index));
            index++;
            examined++;
        }

        if (index > chunkstart) {
            int n_valid = invalidstart - chunkstart;
            int n_invalid = index - invalidstart;

            htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
            sent += index - chunkstart;

            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
                break;
            }
        }

        /* Stop before wrapping once the whole table has been looked at */
        if (examined >= htabslots) {
            break;
        }

        /* Wrap around to the start of the table */
        if (index >= htabslots) {
            assert(index == htabslots);
            index = 0;
        }
    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));

    /* The loop may have been left with the cursor sitting at the end */
    if (index >= htabslots) {
        assert(index == htabslots);
        index = 0;
    }

    spapr->htab_save_index = index;

    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
}
2309 
/* Time budget passed to the HPT save helpers for one live iteration */
#define MAX_ITERATION_NS    5000000 /* 5 ms */
/* Buffer size argument passed to kvmppc_save_htab() */
#define MAX_KVM_BUF_SIZE    2048
2312 
2313 static int htab_save_iterate(QEMUFile *f, void *opaque)
2314 {
2315     SpaprMachineState *spapr = opaque;
2316     int fd;
2317     int rc = 0;
2318 
2319     /* Iteration header */
2320     if (!spapr->htab_shift) {
2321         qemu_put_be32(f, -1);
2322         return 1;
2323     } else {
2324         qemu_put_be32(f, 0);
2325     }
2326 
2327     if (!spapr->htab) {
2328         assert(kvm_enabled());
2329 
2330         fd = get_htab_fd(spapr);
2331         if (fd < 0) {
2332             return fd;
2333         }
2334 
2335         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
2336         if (rc < 0) {
2337             return rc;
2338         }
2339     } else  if (spapr->htab_first_pass) {
2340         htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
2341     } else {
2342         rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
2343     }
2344 
2345     htab_save_end_marker(f);
2346 
2347     return rc;
2348 }
2349 
2350 static int htab_save_complete(QEMUFile *f, void *opaque)
2351 {
2352     SpaprMachineState *spapr = opaque;
2353     int fd;
2354 
2355     /* Iteration header */
2356     if (!spapr->htab_shift) {
2357         qemu_put_be32(f, -1);
2358         return 0;
2359     } else {
2360         qemu_put_be32(f, 0);
2361     }
2362 
2363     if (!spapr->htab) {
2364         int rc;
2365 
2366         assert(kvm_enabled());
2367 
2368         fd = get_htab_fd(spapr);
2369         if (fd < 0) {
2370             return fd;
2371         }
2372 
2373         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
2374         if (rc < 0) {
2375             return rc;
2376         }
2377     } else {
2378         if (spapr->htab_first_pass) {
2379             htab_save_first_pass(f, spapr, -1);
2380         }
2381         htab_save_later_pass(f, spapr, -1);
2382     }
2383 
2384     /* End marker */
2385     htab_save_end_marker(f);
2386 
2387     return 0;
2388 }
2389 
2390 static int htab_load(QEMUFile *f, void *opaque, int version_id)
2391 {
2392     SpaprMachineState *spapr = opaque;
2393     uint32_t section_hdr;
2394     int fd = -1;
2395     Error *local_err = NULL;
2396 
2397     if (version_id < 1 || version_id > 1) {
2398         error_report("htab_load() bad version");
2399         return -EINVAL;
2400     }
2401 
2402     section_hdr = qemu_get_be32(f);
2403 
2404     if (section_hdr == -1) {
2405         spapr_free_hpt(spapr);
2406         return 0;
2407     }
2408 
2409     if (section_hdr) {
2410         /* First section gives the htab size */
2411         spapr_reallocate_hpt(spapr, section_hdr, &local_err);
2412         if (local_err) {
2413             error_report_err(local_err);
2414             return -EINVAL;
2415         }
2416         return 0;
2417     }
2418 
2419     if (!spapr->htab) {
2420         assert(kvm_enabled());
2421 
2422         fd = kvmppc_get_htab_fd(true, 0, &local_err);
2423         if (fd < 0) {
2424             error_report_err(local_err);
2425             return fd;
2426         }
2427     }
2428 
2429     while (true) {
2430         uint32_t index;
2431         uint16_t n_valid, n_invalid;
2432 
2433         index = qemu_get_be32(f);
2434         n_valid = qemu_get_be16(f);
2435         n_invalid = qemu_get_be16(f);
2436 
2437         if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
2438             /* End of Stream */
2439             break;
2440         }
2441 
2442         if ((index + n_valid + n_invalid) >
2443             (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
2444             /* Bad index in stream */
2445             error_report(
2446                 "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)",
2447                 index, n_valid, n_invalid, spapr->htab_shift);
2448             return -EINVAL;
2449         }
2450 
2451         if (spapr->htab) {
2452             if (n_valid) {
2453                 qemu_get_buffer(f, HPTE(spapr->htab, index),
2454                                 HASH_PTE_SIZE_64 * n_valid);
2455             }
2456             if (n_invalid) {
2457                 memset(HPTE(spapr->htab, index + n_valid), 0,
2458                        HASH_PTE_SIZE_64 * n_invalid);
2459             }
2460         } else {
2461             int rc;
2462 
2463             assert(fd >= 0);
2464 
2465             rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
2466             if (rc < 0) {
2467                 return rc;
2468             }
2469         }
2470     }
2471 
2472     if (!spapr->htab) {
2473         assert(fd >= 0);
2474         close(fd);
2475     }
2476 
2477     return 0;
2478 }
2479 
2480 static void htab_save_cleanup(void *opaque)
2481 {
2482     SpaprMachineState *spapr = opaque;
2483 
2484     close_htab_fd(spapr);
2485 }
2486 
/*
 * Live-migration handlers for the hash page table: setup writes the
 * stream header, iterate/complete send HPT entries, cleanup closes the
 * HPT fd, and load_state consumes the stream on the destination.
 */
static SaveVMHandlers savevm_htab_handlers = {
    .save_setup = htab_save_setup,
    .save_live_iterate = htab_save_iterate,
    .save_live_complete_precopy = htab_save_complete,
    .save_cleanup = htab_save_cleanup,
    .load_state = htab_load,
};
2494 
2495 static void spapr_boot_set(void *opaque, const char *boot_device,
2496                            Error **errp)
2497 {
2498     MachineState *machine = MACHINE(opaque);
2499     machine->boot_order = g_strdup(boot_device);
2500 }
2501 
2502 static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
2503 {
2504     MachineState *machine = MACHINE(spapr);
2505     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
2506     uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size)/lmb_size;
2507     int i;
2508 
2509     for (i = 0; i < nr_lmbs; i++) {
2510         uint64_t addr;
2511 
2512         addr = i * lmb_size + machine->device_memory->base;
2513         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
2514                                addr / lmb_size);
2515     }
2516 }
2517 
2518 /*
2519  * If RAM size, maxmem size and individual node mem sizes aren't aligned
2520  * to SPAPR_MEMORY_BLOCK_SIZE(256MB), then refuse to start the guest
2521  * since we can't support such unaligned sizes with DRCONF_MEMORY.
2522  */
2523 static void spapr_validate_node_memory(MachineState *machine, Error **errp)
2524 {
2525     int i;
2526 
2527     if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2528         error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
2529                    " is not aligned to %" PRIu64 " MiB",
2530                    machine->ram_size,
2531                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2532         return;
2533     }
2534 
2535     if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2536         error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
2537                    " is not aligned to %" PRIu64 " MiB",
2538                    machine->ram_size,
2539                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2540         return;
2541     }
2542 
2543     for (i = 0; i < nb_numa_nodes; i++) {
2544         if (numa_info[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
2545             error_setg(errp,
2546                        "Node %d memory size 0x%" PRIx64
2547                        " is not aligned to %" PRIu64 " MiB",
2548                        i, numa_info[i].node_mem,
2549                        SPAPR_MEMORY_BLOCK_SIZE / MiB);
2550             return;
2551         }
2552     }
2553 }
2554 
2555 /* find cpu slot in machine->possible_cpus by core_id */
2556 static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
2557 {
2558     int index = id / ms->smp.threads;
2559 
2560     if (index >= ms->possible_cpus->len) {
2561         return NULL;
2562     }
2563     if (idx) {
2564         *idx = index;
2565     }
2566     return &ms->possible_cpus->cpus[index];
2567 }
2568 
/*
 * Validate the threads-per-core setting and settle spapr->vsmt.
 *
 * More than one thread per core is rejected when KVM is not enabled, and
 * so is any thread count that is not a power of 2.  A non-zero
 * spapr->vsmt on entry is treated as user-supplied and must be at least
 * the number of threads per core; otherwise vsmt defaults to
 * MAX(8, threads/core).  Under KVM, when the chosen vsmt differs from
 * what kvmppc_smt_threads() reports, KVM is asked to switch; a failure
 * to do so is downgraded to a warning if KVM's own value is still
 * usable (see below).
 *
 * Any resulting error is propagated through @errp.
 */
static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
{
    MachineState *ms = MACHINE(spapr);
    Error *local_err = NULL;
    bool vsmt_user = !!spapr->vsmt;
    int kvm_smt = kvmppc_smt_threads();
    int ret;
    unsigned int smp_threads = ms->smp.threads;

    if (!kvm_enabled() && (smp_threads > 1)) {
        error_setg(&local_err, "TCG cannot support more than 1 thread/core "
                     "on a pseries machine");
        goto out;
    }
    if (!is_power_of_2(smp_threads)) {
        error_setg(&local_err, "Cannot support %d threads/core on a pseries "
                     "machine because it must be a power of 2", smp_threads);
        goto out;
    }

    /* Determine the VSMT mode to use: */
    if (vsmt_user) {
        if (spapr->vsmt < smp_threads) {
            error_setg(&local_err, "Cannot support VSMT mode %d"
                         " because it must be >= threads/core (%d)",
                         spapr->vsmt, smp_threads);
            goto out;
        }
        /* In this case, spapr->vsmt has been set by the command line */
    } else {
        /*
         * Default VSMT value is tricky, because we need it to be as
         * consistent as possible (for migration), but this requires
         * changing it for at least some existing cases.  We pick 8 as
         * the value that we'd get with KVM on POWER8, the
         * overwhelmingly common case in production systems.
         */
        spapr->vsmt = MAX(8, smp_threads);
    }

    /* KVM: If necessary, set the SMT mode: */
    if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
        ret = kvmppc_set_smt_threads(spapr->vsmt);
        if (ret) {
            /* Looks like KVM isn't able to change VSMT mode */
            error_setg(&local_err,
                       "Failed to set KVM's VSMT mode to %d (errno %d)",
                       spapr->vsmt, ret);
            /* We can live with that if the default one is big enough
             * for the number of threads, and a submultiple of the one
             * we want.  In this case we'll waste some vcpu ids, but
             * behaviour will be correct */
            if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
                /* Report as a warning only; nothing is propagated below */
                warn_report_err(local_err);
                local_err = NULL;
                goto out;
            } else {
                if (!vsmt_user) {
                    error_append_hint(&local_err,
                                      "On PPC, a VM with %d threads/core"
                                      " on a host with %d threads/core"
                                      " requires the use of VSMT mode %d.\n",
                                      smp_threads, kvm_smt, spapr->vsmt);
                }
                kvmppc_hint_smt_possible(&local_err);
                goto out;
            }
        }
    }
    /* else TCG: nothing to do currently */
out:
    error_propagate(errp, local_err);
}
2642 
2643 static void spapr_init_cpus(SpaprMachineState *spapr)
2644 {
2645     MachineState *machine = MACHINE(spapr);
2646     MachineClass *mc = MACHINE_GET_CLASS(machine);
2647     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2648     const char *type = spapr_get_cpu_core_type(machine->cpu_type);
2649     const CPUArchIdList *possible_cpus;
2650     unsigned int smp_cpus = machine->smp.cpus;
2651     unsigned int smp_threads = machine->smp.threads;
2652     unsigned int max_cpus = machine->smp.max_cpus;
2653     int boot_cores_nr = smp_cpus / smp_threads;
2654     int i;
2655 
2656     possible_cpus = mc->possible_cpu_arch_ids(machine);
2657     if (mc->has_hotpluggable_cpus) {
2658         if (smp_cpus % smp_threads) {
2659             error_report("smp_cpus (%u) must be multiple of threads (%u)",
2660                          smp_cpus, smp_threads);
2661             exit(1);
2662         }
2663         if (max_cpus % smp_threads) {
2664             error_report("max_cpus (%u) must be multiple of threads (%u)",
2665                          max_cpus, smp_threads);
2666             exit(1);
2667         }
2668     } else {
2669         if (max_cpus != smp_cpus) {
2670             error_report("This machine version does not support CPU hotplug");
2671             exit(1);
2672         }
2673         boot_cores_nr = possible_cpus->len;
2674     }
2675 
2676     if (smc->pre_2_10_has_unused_icps) {
2677         int i;
2678 
2679         for (i = 0; i < spapr_max_server_number(spapr); i++) {
2680             /* Dummy entries get deregistered when real ICPState objects
2681              * are registered during CPU core hotplug.
2682              */
2683             pre_2_10_vmstate_register_dummy_icp(i);
2684         }
2685     }
2686 
2687     for (i = 0; i < possible_cpus->len; i++) {
2688         int core_id = i * smp_threads;
2689 
2690         if (mc->has_hotpluggable_cpus) {
2691             spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
2692                                    spapr_vcpu_id(spapr, core_id));
2693         }
2694 
2695         if (i < boot_cores_nr) {
2696             Object *core  = object_new(type);
2697             int nr_threads = smp_threads;
2698 
2699             /* Handle the partially filled core for older machine types */
2700             if ((i + 1) * smp_threads >= smp_cpus) {
2701                 nr_threads = smp_cpus - i * smp_threads;
2702             }
2703 
2704             object_property_set_int(core, nr_threads, "nr-threads",
2705                                     &error_fatal);
2706             object_property_set_int(core, core_id, CPU_CORE_PROP_CORE_ID,
2707                                     &error_fatal);
2708             object_property_set_bool(core, true, "realized", &error_fatal);
2709 
2710             object_unref(core);
2711         }
2712     }
2713 }
2714 
2715 static PCIHostState *spapr_create_default_phb(void)
2716 {
2717     DeviceState *dev;
2718 
2719     dev = qdev_create(NULL, TYPE_SPAPR_PCI_HOST_BRIDGE);
2720     qdev_prop_set_uint32(dev, "index", 0);
2721     qdev_init_nofail(dev);
2722 
2723     return PCI_HOST_BRIDGE(dev);
2724 }
2725 
/*
 * pSeries LPAR / sPAPR hardware init
 *
 * Machine init hook.  In order, it: determines capabilities and the HPT
 * resize policy, sizes the RMA, sets the VSMT mode, initializes the
 * interrupt controller, fills in the option vector 5 bits, creates the
 * CPUs, allocates RAM and the hotplug memory region, loads the RTAS blob,
 * creates the VIO bus and its devices, the default PHB, NICs, vSCSI
 * adapters, graphics and USB, loads the kernel/initrd (if given) and the
 * firmware, and finally registers migration state and misc. handlers.
 *
 * Every unrecoverable problem is reported and terminates QEMU, either
 * through exit(1) or through &error_fatal.
 */
static void spapr_machine_init(MachineState *machine)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    const char *kernel_filename = machine->kernel_filename;
    const char *initrd_filename = machine->initrd_filename;
    PCIHostState *phb;
    int i;
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *ram = g_new(MemoryRegion, 1);
    hwaddr node0_size = spapr_node0_size(machine);
    long load_limit, fw_size;
    char *filename;
    Error *resize_hpt_err = NULL;

    msi_nonbroken = true;

    QLIST_INIT(&spapr->phbs);
    QTAILQ_INIT(&spapr->pending_dimm_unplugs);

    /* Determine capabilities to run with */
    spapr_caps_init(spapr);

    kvmppc_check_papr_resize_hpt(&resize_hpt_err);
    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
        /*
         * If the user explicitly requested a mode we should either
         * supply it, or fail completely (which we do below).  But if
         * it's not set explicitly, we reset our mode to something
         * that works
         */
        if (resize_hpt_err) {
            spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
            error_free(resize_hpt_err);
            resize_hpt_err = NULL;
        } else {
            spapr->resize_hpt = smc->resize_hpt_default;
        }
    }

    assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);

    if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
        /*
         * User requested HPT resize, but this host can't supply it.  Bail out
         */
        error_report_err(resize_hpt_err);
        exit(1);
    }

    spapr->rma_size = node0_size;

    /* With KVM, we don't actually know whether KVM supports an
     * unbounded RMA (PR KVM) or is limited by the hash table size
     * (HV KVM using VRMA), so we always assume the latter
     *
     * In that case, we also limit the initial allocations for RTAS
     * etc... to 256M since we have no way to know what the VRMA size
     * is going to be as it depends on the size of the hash table
     * which isn't determined yet.
     */
    if (kvm_enabled()) {
        spapr->vrma_adjust = 1;
        spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
    }

    /* Actually we don't support unbounded RMA anymore since we added
     * proper emulation of HV mode. The max we can get is 16G which
     * also happens to be what we configure for PAPR mode so make sure
     * we don't do anything bigger than that
     */
    spapr->rma_size = MIN(spapr->rma_size, 0x400000000ull);

    if (spapr->rma_size > node0_size) {
        error_report("Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")",
                     spapr->rma_size);
        exit(1);
    }

    /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
    load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;

    /*
     * VSMT must be set in order to be able to compute VCPU ids, ie to
     * call spapr_max_server_number() or spapr_vcpu_id().
     */
    spapr_set_vsmt_mode(spapr, &error_fatal);

    /* Set up Interrupt Controller before we create the VCPUs */
    spapr_irq_init(spapr, &error_fatal);

    /* Set up containers for ibm,client-architecture-support negotiated options
     */
    spapr->ov5 = spapr_ovec_new();
    spapr->ov5_cas = spapr_ovec_new();

    if (smc->dr_lmb_enabled) {
        spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
        spapr_validate_node_memory(machine, &error_fatal);
    }

    spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);

    /* advertise support for dedicated HP event source to guests */
    if (spapr->use_hotplug_event_source) {
        spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
    }

    /* advertise support for HPT resizing */
    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
        spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
    }

    /* advertise support for ibm,dynamic-memory-v2 */
    spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);

    /* advertise XIVE on POWER9 machines */
    if (spapr->irq->ov5 & (SPAPR_OV5_XIVE_EXPLOIT | SPAPR_OV5_XIVE_BOTH)) {
        spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
    }

    /* init CPUs */
    spapr_init_cpus(spapr);

    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                              spapr->max_compat_pvr)) {
        /* KVM and TCG always allow GTSE with radix... */
        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
    }
    /* ... but not with hash (currently). */

    if (kvm_enabled()) {
        /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
        kvmppc_enable_logical_ci_hcalls();
        kvmppc_enable_set_mode_hcall();

        /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
        kvmppc_enable_clear_ref_mod_hcalls();

        /* Enable H_PAGE_INIT */
        kvmppc_enable_h_page_init();
    }

    /* allocate RAM */
    memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
                                         machine->ram_size);
    memory_region_add_subregion(sysmem, 0, ram);

    /* always allocate the device memory information */
    machine->device_memory = g_malloc0(sizeof(*machine->device_memory));

    /* initialize hotplug memory address space */
    if (machine->ram_size < machine->maxram_size) {
        ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
        /*
         * Limit the number of hotpluggable memory slots to half the number
         * slots that KVM supports, leaving the other half for PCI and other
         * devices. However ensure that number of slots doesn't drop below 32.
         */
        int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
                           SPAPR_MAX_RAM_SLOTS;

        if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
            max_memslots = SPAPR_MAX_RAM_SLOTS;
        }
        if (machine->ram_slots > max_memslots) {
            error_report("Specified number of memory slots %"
                         PRIu64" exceeds max supported %d",
                         machine->ram_slots, max_memslots);
            exit(1);
        }

        /* The device memory region starts right above the aligned boot RAM */
        machine->device_memory->base = ROUND_UP(machine->ram_size,
                                                SPAPR_DEVICE_MEM_ALIGN);
        memory_region_init(&machine->device_memory->mr, OBJECT(spapr),
                           "device-memory", device_mem_size);
        memory_region_add_subregion(sysmem, machine->device_memory->base,
                                    &machine->device_memory->mr);
    }

    if (smc->dr_lmb_enabled) {
        spapr_create_lmb_dr_connectors(spapr);
    }

    /* Locate the RTAS image and keep a copy of it in spapr->rtas_blob */
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
    if (!filename) {
        error_report("Could not find LPAR rtas '%s'", "spapr-rtas.bin");
        exit(1);
    }
    spapr->rtas_size = get_image_size(filename);
    if (spapr->rtas_size < 0) {
        error_report("Could not get size of LPAR rtas '%s'", filename);
        exit(1);
    }
    spapr->rtas_blob = g_malloc(spapr->rtas_size);
    if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
        error_report("Could not load LPAR rtas '%s'", filename);
        exit(1);
    }
    if (spapr->rtas_size > RTAS_MAX_SIZE) {
        error_report("RTAS too big ! 0x%zx bytes (max is 0x%x)",
                     (size_t)spapr->rtas_size, RTAS_MAX_SIZE);
        exit(1);
    }
    g_free(filename);

    /* Set up RTAS event infrastructure */
    spapr_events_init(spapr);

    /* Set up the RTC RTAS interfaces */
    spapr_rtc_create(spapr);

    /* Set up VIO bus */
    spapr->vio_bus = spapr_vio_bus_init();

    /* One VTY per configured serial backend */
    for (i = 0; i < serial_max_hds(); i++) {
        if (serial_hd(i)) {
            spapr_vty_create(spapr->vio_bus, serial_hd(i));
        }
    }

    /* We always have at least the nvram device on VIO */
    spapr_create_nvram(spapr);

    /*
     * Setup hotplug / dynamic-reconfiguration connectors. top-level
     * connectors (described in root DT node's "ibm,drc-types" property)
     * are pre-initialized here. additional child connectors (such as
     * connectors for a PHBs PCI slots) are added as needed during their
     * parent's realization.
     */
    if (smc->dr_phb_enabled) {
        for (i = 0; i < SPAPR_MAX_PHBS; i++) {
            spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
        }
    }

    /* Set up PCI */
    spapr_pci_rtas_init();

    phb = spapr_create_default_phb();

    /* NICs default to "spapr-vlan"; other models go on the default PHB */
    for (i = 0; i < nb_nics; i++) {
        NICInfo *nd = &nd_table[i];

        if (!nd->model) {
            nd->model = g_strdup("spapr-vlan");
        }

        if (g_str_equal(nd->model, "spapr-vlan") ||
            g_str_equal(nd->model, "ibmveth")) {
            spapr_vlan_create(spapr->vio_bus, nd);
        } else {
            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
        }
    }

    /* One vSCSI adapter per SCSI bus index in use */
    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
        spapr_vscsi_create(spapr->vio_bus);
    }

    /* Graphics */
    if (spapr_vga_init(phb->bus, &error_fatal)) {
        spapr->has_graphics = true;
        machine->usb |= defaults_enabled() && !machine->usb_disabled;
    }

    if (machine->usb) {
        if (smc->use_ohci_by_default) {
            pci_create_simple(phb->bus, -1, "pci-ohci");
        } else {
            pci_create_simple(phb->bus, -1, "nec-usb-xhci");
        }

        if (spapr->has_graphics) {
            USBBus *usb_bus = usb_bus_find(-1);

            usb_create_simple(usb_bus, "usb-kbd");
            usb_create_simple(usb_bus, "usb-mouse");
        }
    }

    if (spapr->rma_size < (MIN_RMA_SLOF * MiB)) {
        error_report(
            "pSeries SLOF firmware requires >= %ldM guest RMA (Real Mode Area memory)",
            MIN_RMA_SLOF);
        exit(1);
    }

    if (kernel_filename) {
        uint64_t lowaddr = 0;

        /* First attempt; retried below with the other endianness flag */
        spapr->kernel_size = load_elf(kernel_filename, NULL,
                                      translate_kernel_address, NULL,
                                      NULL, &lowaddr, NULL, 1,
                                      PPC_ELF_MACHINE, 0, 0);
        if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
            spapr->kernel_size = load_elf(kernel_filename, NULL,
                                          translate_kernel_address, NULL, NULL,
                                          &lowaddr, NULL, 0, PPC_ELF_MACHINE,
                                          0, 0);
            spapr->kernel_le = spapr->kernel_size > 0;
        }
        if (spapr->kernel_size < 0) {
            error_report("error loading %s: %s", kernel_filename,
                         load_elf_strerror(spapr->kernel_size));
            exit(1);
        }

        /* load initrd */
        if (initrd_filename) {
            /* Try to locate the initrd in the gap between the kernel
             * and the firmware. Add a bit of space just in case
             */
            spapr->initrd_base = (KERNEL_LOAD_ADDR + spapr->kernel_size
                                  + 0x1ffff) & ~0xffff;
            spapr->initrd_size = load_image_targphys(initrd_filename,
                                                     spapr->initrd_base,
                                                     load_limit
                                                     - spapr->initrd_base);
            if (spapr->initrd_size < 0) {
                error_report("could not load initial ram disk '%s'",
                             initrd_filename);
                exit(1);
            }
        }
    }

    /* Load the firmware image at guest physical address 0 */
    if (bios_name == NULL) {
        bios_name = FW_FILE_NAME;
    }
    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
    if (!filename) {
        error_report("Could not find LPAR firmware '%s'", bios_name);
        exit(1);
    }
    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
    if (fw_size <= 0) {
        error_report("Could not load LPAR firmware '%s'", filename);
        exit(1);
    }
    g_free(filename);

    /* FIXME: Should register things through the MachineState's qdev
     * interface, this is a legacy from the sPAPREnvironment structure
     * which predated MachineState but had a similar function */
    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
    register_savevm_live(NULL, "spapr/htab", -1, 1,
                         &savevm_htab_handlers, spapr);

    qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine),
                             &error_fatal);

    qemu_register_boot_set(spapr_boot_set, spapr);

    if (kvm_enabled()) {
        /* to stop and start vmclock */
        qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
                                         &spapr->tb);

        kvmppc_spapr_enable_inkernel_multitce();
    }
}
3091 
3092 static int spapr_kvm_type(MachineState *machine, const char *vm_type)
3093 {
3094     if (!vm_type) {
3095         return 0;
3096     }
3097 
3098     if (!strcmp(vm_type, "HV")) {
3099         return 1;
3100     }
3101 
3102     if (!strcmp(vm_type, "PR")) {
3103         return 2;
3104     }
3105 
3106     error_report("Unknown kvm-type specified '%s'", vm_type);
3107     exit(1);
3108 }
3109 
3110 /*
3111  * Implementation of an interface to adjust firmware path
3112  * for the bootindex property handling.
3113  */
3114 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
3115                                    DeviceState *dev)
3116 {
3117 #define CAST(type, obj, name) \
3118     ((type *)object_dynamic_cast(OBJECT(obj), (name)))
3119     SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
3120     SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
3121     VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
3122 
3123     if (d) {
3124         void *spapr = CAST(void, bus->parent, "spapr-vscsi");
3125         VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
3126         USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
3127 
3128         if (spapr) {
3129             /*
3130              * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
3131              * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
3132              * 0x8000 | (target << 8) | (bus << 5) | lun
3133              * (see the "Logical unit addressing format" table in SAM5)
3134              */
3135             unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
3136             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3137                                    (uint64_t)id << 48);
3138         } else if (virtio) {
3139             /*
3140              * We use SRP luns of the form 01000000 | (target << 8) | lun
3141              * in the top 32 bits of the 64-bit LUN
3142              * Note: the quote above is from SLOF and it is wrong,
3143              * the actual binding is:
3144              * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
3145              */
3146             unsigned id = 0x1000000 | (d->id << 16) | d->lun;
3147             if (d->lun >= 256) {
3148                 /* Use the LUN "flat space addressing method" */
3149                 id |= 0x4000;
3150             }
3151             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3152                                    (uint64_t)id << 32);
3153         } else if (usb) {
3154             /*
3155              * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
3156              * in the top 32 bits of the 64-bit LUN
3157              */
3158             unsigned usb_port = atoi(usb->port->path);
3159             unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
3160             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3161                                    (uint64_t)id << 32);
3162         }
3163     }
3164 
3165     /*
3166      * SLOF probes the USB devices, and if it recognizes that the device is a
3167      * storage device, it changes its name to "storage" instead of "usb-host",
3168      * and additionally adds a child node for the SCSI LUN, so the correct
3169      * boot path in SLOF is something like .../storage@1/disk@xxx" instead.
3170      */
3171     if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
3172         USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
3173         if (usb_host_dev_is_scsi_storage(usbdev)) {
3174             return g_strdup_printf("storage@%s/disk", usbdev->port->path);
3175         }
3176     }
3177 
3178     if (phb) {
3179         /* Replace "pci" with "pci@800000020000000" */
3180         return g_strdup_printf("pci@%"PRIX64, phb->buid);
3181     }
3182 
3183     if (vsc) {
3184         /* Same logic as virtio above */
3185         unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
3186         return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
3187     }
3188 
3189     if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
3190         /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
3191         PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3192         return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
3193     }
3194 
3195     return NULL;
3196 }
3197 
3198 static char *spapr_get_kvm_type(Object *obj, Error **errp)
3199 {
3200     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3201 
3202     return g_strdup(spapr->kvm_type);
3203 }
3204 
3205 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
3206 {
3207     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3208 
3209     g_free(spapr->kvm_type);
3210     spapr->kvm_type = g_strdup(value);
3211 }
3212 
3213 static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
3214 {
3215     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3216 
3217     return spapr->use_hotplug_event_source;
3218 }
3219 
3220 static void spapr_set_modern_hotplug_events(Object *obj, bool value,
3221                                             Error **errp)
3222 {
3223     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3224 
3225     spapr->use_hotplug_event_source = value;
3226 }
3227 
3228 static bool spapr_get_msix_emulation(Object *obj, Error **errp)
3229 {
3230     return true;
3231 }
3232 
3233 static char *spapr_get_resize_hpt(Object *obj, Error **errp)
3234 {
3235     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3236 
3237     switch (spapr->resize_hpt) {
3238     case SPAPR_RESIZE_HPT_DEFAULT:
3239         return g_strdup("default");
3240     case SPAPR_RESIZE_HPT_DISABLED:
3241         return g_strdup("disabled");
3242     case SPAPR_RESIZE_HPT_ENABLED:
3243         return g_strdup("enabled");
3244     case SPAPR_RESIZE_HPT_REQUIRED:
3245         return g_strdup("required");
3246     }
3247     g_assert_not_reached();
3248 }
3249 
3250 static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
3251 {
3252     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3253 
3254     if (strcmp(value, "default") == 0) {
3255         spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
3256     } else if (strcmp(value, "disabled") == 0) {
3257         spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
3258     } else if (strcmp(value, "enabled") == 0) {
3259         spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
3260     } else if (strcmp(value, "required") == 0) {
3261         spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
3262     } else {
3263         error_setg(errp, "Bad value for \"resize-hpt\" property");
3264     }
3265 }
3266 
3267 static void spapr_get_vsmt(Object *obj, Visitor *v, const char *name,
3268                                    void *opaque, Error **errp)
3269 {
3270     visit_type_uint32(v, name, (uint32_t *)opaque, errp);
3271 }
3272 
3273 static void spapr_set_vsmt(Object *obj, Visitor *v, const char *name,
3274                                    void *opaque, Error **errp)
3275 {
3276     visit_type_uint32(v, name, (uint32_t *)opaque, errp);
3277 }
3278 
3279 static char *spapr_get_ic_mode(Object *obj, Error **errp)
3280 {
3281     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3282 
3283     if (spapr->irq == &spapr_irq_xics_legacy) {
3284         return g_strdup("legacy");
3285     } else if (spapr->irq == &spapr_irq_xics) {
3286         return g_strdup("xics");
3287     } else if (spapr->irq == &spapr_irq_xive) {
3288         return g_strdup("xive");
3289     } else if (spapr->irq == &spapr_irq_dual) {
3290         return g_strdup("dual");
3291     }
3292     g_assert_not_reached();
3293 }
3294 
3295 static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
3296 {
3297     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3298 
3299     if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
3300         error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
3301         return;
3302     }
3303 
3304     /* The legacy IRQ backend can not be set */
3305     if (strcmp(value, "xics") == 0) {
3306         spapr->irq = &spapr_irq_xics;
3307     } else if (strcmp(value, "xive") == 0) {
3308         spapr->irq = &spapr_irq_xive;
3309     } else if (strcmp(value, "dual") == 0) {
3310         spapr->irq = &spapr_irq_dual;
3311     } else {
3312         error_setg(errp, "Bad value for \"ic-mode\" property");
3313     }
3314 }
3315 
3316 static char *spapr_get_host_model(Object *obj, Error **errp)
3317 {
3318     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3319 
3320     return g_strdup(spapr->host_model);
3321 }
3322 
3323 static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
3324 {
3325     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3326 
3327     g_free(spapr->host_model);
3328     spapr->host_model = g_strdup(value);
3329 }
3330 
3331 static char *spapr_get_host_serial(Object *obj, Error **errp)
3332 {
3333     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3334 
3335     return g_strdup(spapr->host_serial);
3336 }
3337 
3338 static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
3339 {
3340     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3341 
3342     g_free(spapr->host_serial);
3343     spapr->host_serial = g_strdup(value);
3344 }
3345 
/*
 * Instance initializer of the sPAPR machine object.
 *
 * Sets the instance defaults (htab_fd = -1, dedicated hotplug event source
 * enabled, interrupt controller mode taken from the machine class) and
 * registers the machine properties: "kvm-type", "modern-hotplug-events",
 * "max-cpu-compat", "resize-hpt", "vsmt", "vfio-no-msix-emulation",
 * "ic-mode", "host-model" and "host-serial".
 */
static void spapr_instance_init(Object *obj)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);

    spapr->htab_fd = -1;
    spapr->use_hotplug_event_source = true;
    object_property_add_str(obj, "kvm-type",
                            spapr_get_kvm_type, spapr_set_kvm_type, NULL);
    object_property_set_description(obj, "kvm-type",
                                    "Specifies the KVM virtualization mode (HV, PR)",
                                    NULL);
    object_property_add_bool(obj, "modern-hotplug-events",
                            spapr_get_modern_hotplug_events,
                            spapr_set_modern_hotplug_events,
                            NULL);
    object_property_set_description(obj, "modern-hotplug-events",
                                    "Use dedicated hotplug event mechanism in"
                                    " place of standard EPOW events when possible"
                                    " (required for memory hot-unplug support)",
                                    NULL);
    ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
                            "Maximum permitted CPU compatibility mode",
                            &error_fatal);

    object_property_add_str(obj, "resize-hpt",
                            spapr_get_resize_hpt, spapr_set_resize_hpt, NULL);
    object_property_set_description(obj, "resize-hpt",
                                    "Resizing of the Hash Page Table (enabled, disabled, required)",
                                    NULL);
    /* "vsmt" accessors operate directly on spapr->vsmt via the opaque */
    object_property_add(obj, "vsmt", "uint32", spapr_get_vsmt,
                        spapr_set_vsmt, NULL, &spapr->vsmt, &error_abort);
    object_property_set_description(obj, "vsmt",
                                    "Virtual SMT: KVM behaves as if this were"
                                    " the host's SMT mode", &error_abort);
    /* Read-only property: no setter is registered */
    object_property_add_bool(obj, "vfio-no-msix-emulation",
                             spapr_get_msix_emulation, NULL, NULL);

    /* The machine class defines the default interrupt controller mode */
    spapr->irq = smc->irq;
    object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
                            spapr_set_ic_mode, NULL);
    object_property_set_description(obj, "ic-mode",
                 "Specifies the interrupt controller mode (xics, xive, dual)",
                 NULL);

    object_property_add_str(obj, "host-model",
        spapr_get_host_model, spapr_set_host_model,
        &error_abort);
    object_property_set_description(obj, "host-model",
        "Host model to advertise in guest device tree", &error_abort);
    object_property_add_str(obj, "host-serial",
        spapr_get_host_serial, spapr_set_host_serial,
        &error_abort);
    object_property_set_description(obj, "host-serial",
        "Host serial number to advertise in guest device tree", &error_abort);
}
3403 
3404 static void spapr_machine_finalizefn(Object *obj)
3405 {
3406     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3407 
3408     g_free(spapr->kvm_type);
3409 }
3410 
3411 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
3412 {
3413     cpu_synchronize_state(cs);
3414     ppc_cpu_do_system_reset(cs);
3415 }
3416 
3417 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
3418 {
3419     CPUState *cs;
3420 
3421     CPU_FOREACH(cs) {
3422         async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
3423     }
3424 }
3425 
3426 int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3427                           void *fdt, int *fdt_start_offset, Error **errp)
3428 {
3429     uint64_t addr;
3430     uint32_t node;
3431 
3432     addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
3433     node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
3434                                     &error_abort);
3435     *fdt_start_offset = spapr_populate_memory_node(fdt, node, addr,
3436                                                    SPAPR_MEMORY_BLOCK_SIZE);
3437     return 0;
3438 }
3439 
3440 static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
3441                            bool dedicated_hp_event_source, Error **errp)
3442 {
3443     SpaprDrc *drc;
3444     uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
3445     int i;
3446     uint64_t addr = addr_start;
3447     bool hotplugged = spapr_drc_hotplugged(dev);
3448     Error *local_err = NULL;
3449 
3450     for (i = 0; i < nr_lmbs; i++) {
3451         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3452                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3453         g_assert(drc);
3454 
3455         spapr_drc_attach(drc, dev, &local_err);
3456         if (local_err) {
3457             while (addr > addr_start) {
3458                 addr -= SPAPR_MEMORY_BLOCK_SIZE;
3459                 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3460                                       addr / SPAPR_MEMORY_BLOCK_SIZE);
3461                 spapr_drc_detach(drc);
3462             }
3463             error_propagate(errp, local_err);
3464             return;
3465         }
3466         if (!hotplugged) {
3467             spapr_drc_reset(drc);
3468         }
3469         addr += SPAPR_MEMORY_BLOCK_SIZE;
3470     }
3471     /* send hotplug notification to the
3472      * guest only in case of hotplugged memory
3473      */
3474     if (hotplugged) {
3475         if (dedicated_hp_event_source) {
3476             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3477                                   addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3478             spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3479                                                    nr_lmbs,
3480                                                    spapr_drc_index(drc));
3481         } else {
3482             spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
3483                                            nr_lmbs);
3484         }
3485     }
3486 }
3487 
3488 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3489                               Error **errp)
3490 {
3491     Error *local_err = NULL;
3492     SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
3493     PCDIMMDevice *dimm = PC_DIMM(dev);
3494     uint64_t size, addr;
3495 
3496     size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
3497 
3498     pc_dimm_plug(dimm, MACHINE(ms), &local_err);
3499     if (local_err) {
3500         goto out;
3501     }
3502 
3503     addr = object_property_get_uint(OBJECT(dimm),
3504                                     PC_DIMM_ADDR_PROP, &local_err);
3505     if (local_err) {
3506         goto out_unplug;
3507     }
3508 
3509     spapr_add_lmbs(dev, addr, size, spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
3510                    &local_err);
3511     if (local_err) {
3512         goto out_unplug;
3513     }
3514 
3515     return;
3516 
3517 out_unplug:
3518     pc_dimm_unplug(dimm, MACHINE(ms));
3519 out:
3520     error_propagate(errp, local_err);
3521 }
3522 
3523 static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3524                                   Error **errp)
3525 {
3526     const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
3527     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3528     PCDIMMDevice *dimm = PC_DIMM(dev);
3529     Error *local_err = NULL;
3530     uint64_t size;
3531     Object *memdev;
3532     hwaddr pagesize;
3533 
3534     if (!smc->dr_lmb_enabled) {
3535         error_setg(errp, "Memory hotplug not supported for this machine");
3536         return;
3537     }
3538 
3539     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
3540     if (local_err) {
3541         error_propagate(errp, local_err);
3542         return;
3543     }
3544 
3545     if (size % SPAPR_MEMORY_BLOCK_SIZE) {
3546         error_setg(errp, "Hotplugged memory size must be a multiple of "
3547                       "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
3548         return;
3549     }
3550 
3551     memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
3552                                       &error_abort);
3553     pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
3554     spapr_check_pagesize(spapr, pagesize, &local_err);
3555     if (local_err) {
3556         error_propagate(errp, local_err);
3557         return;
3558     }
3559 
3560     pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), NULL, errp);
3561 }
3562 
/*
 * Bookkeeping entry for a DIMM whose unplug is in progress; entries are
 * linked on the machine's pending_dimm_unplugs list.
 */
struct SpaprDimmState {
    /* DIMM being unplugged; used as the lookup key */
    PCDIMMDevice *dimm;
    /* LMBs still to be released; decremented by spapr_lmb_release() */
    uint32_t nr_lmbs;
    /* Linkage in the pending_dimm_unplugs list */
    QTAILQ_ENTRY(SpaprDimmState) next;
};
3568 
3569 static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
3570                                                        PCDIMMDevice *dimm)
3571 {
3572     SpaprDimmState *dimm_state = NULL;
3573 
3574     QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
3575         if (dimm_state->dimm == dimm) {
3576             break;
3577         }
3578     }
3579     return dimm_state;
3580 }
3581 
3582 static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
3583                                                       uint32_t nr_lmbs,
3584                                                       PCDIMMDevice *dimm)
3585 {
3586     SpaprDimmState *ds = NULL;
3587 
3588     /*
3589      * If this request is for a DIMM whose removal had failed earlier
3590      * (due to guest's refusal to remove the LMBs), we would have this
3591      * dimm already in the pending_dimm_unplugs list. In that
3592      * case don't add again.
3593      */
3594     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3595     if (!ds) {
3596         ds = g_malloc0(sizeof(SpaprDimmState));
3597         ds->nr_lmbs = nr_lmbs;
3598         ds->dimm = dimm;
3599         QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
3600     }
3601     return ds;
3602 }
3603 
3604 static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
3605                                               SpaprDimmState *dimm_state)
3606 {
3607     QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
3608     g_free(dimm_state);
3609 }
3610 
3611 static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
3612                                                         PCDIMMDevice *dimm)
3613 {
3614     SpaprDrc *drc;
3615     uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
3616                                                   &error_abort);
3617     uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3618     uint32_t avail_lmbs = 0;
3619     uint64_t addr_start, addr;
3620     int i;
3621 
3622     addr_start = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3623                                          &error_abort);
3624 
3625     addr = addr_start;
3626     for (i = 0; i < nr_lmbs; i++) {
3627         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3628                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3629         g_assert(drc);
3630         if (drc->dev) {
3631             avail_lmbs++;
3632         }
3633         addr += SPAPR_MEMORY_BLOCK_SIZE;
3634     }
3635 
3636     return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
3637 }
3638 
3639 /* Callback to be called during DRC release. */
3640 void spapr_lmb_release(DeviceState *dev)
3641 {
3642     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3643     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
3644     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3645 
3646     /* This information will get lost if a migration occurs
3647      * during the unplug process. In this case recover it. */
3648     if (ds == NULL) {
3649         ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
3650         g_assert(ds);
3651         /* The DRC being examined by the caller at least must be counted */
3652         g_assert(ds->nr_lmbs);
3653     }
3654 
3655     if (--ds->nr_lmbs) {
3656         return;
3657     }
3658 
3659     /*
3660      * Now that all the LMBs have been removed by the guest, call the
3661      * unplug handler chain. This can never fail.
3662      */
3663     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3664     object_unparent(OBJECT(dev));
3665 }
3666 
3667 static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3668 {
3669     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3670     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3671 
3672     pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
3673     object_property_set_bool(OBJECT(dev), false, "realized", NULL);
3674     spapr_pending_dimm_unplugs_remove(spapr, ds);
3675 }
3676 
3677 static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
3678                                         DeviceState *dev, Error **errp)
3679 {
3680     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3681     Error *local_err = NULL;
3682     PCDIMMDevice *dimm = PC_DIMM(dev);
3683     uint32_t nr_lmbs;
3684     uint64_t size, addr_start, addr;
3685     int i;
3686     SpaprDrc *drc;
3687 
3688     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3689     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3690 
3691     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3692                                          &local_err);
3693     if (local_err) {
3694         goto out;
3695     }
3696 
3697     /*
3698      * An existing pending dimm state for this DIMM means that there is an
3699      * unplug operation in progress, waiting for the spapr_lmb_release
3700      * callback to complete the job (BQL can't cover that far). In this case,
3701      * bail out to avoid detaching DRCs that were already released.
3702      */
3703     if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3704         error_setg(&local_err,
3705                    "Memory unplug already in progress for device %s",
3706                    dev->id);
3707         goto out;
3708     }
3709 
3710     spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3711 
3712     addr = addr_start;
3713     for (i = 0; i < nr_lmbs; i++) {
3714         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3715                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3716         g_assert(drc);
3717 
3718         spapr_drc_detach(drc);
3719         addr += SPAPR_MEMORY_BLOCK_SIZE;
3720     }
3721 
3722     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3723                           addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3724     spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3725                                               nr_lmbs, spapr_drc_index(drc));
3726 out:
3727     error_propagate(errp, local_err);
3728 }
3729 
3730 /* Callback to be called during DRC release. */
3731 void spapr_core_release(DeviceState *dev)
3732 {
3733     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3734 
3735     /* Call the unplug handler chain. This can never fail. */
3736     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3737     object_unparent(OBJECT(dev));
3738 }
3739 
3740 static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3741 {
3742     MachineState *ms = MACHINE(hotplug_dev);
3743     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
3744     CPUCore *cc = CPU_CORE(dev);
3745     CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3746 
3747     if (smc->pre_2_10_has_unused_icps) {
3748         SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
3749         int i;
3750 
3751         for (i = 0; i < cc->nr_threads; i++) {
3752             CPUState *cs = CPU(sc->threads[i]);
3753 
3754             pre_2_10_vmstate_register_dummy_icp(cs->cpu_index);
3755         }
3756     }
3757 
3758     assert(core_slot);
3759     core_slot->cpu = NULL;
3760     object_property_set_bool(OBJECT(dev), false, "realized", NULL);
3761 }
3762 
3763 static
3764 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3765                                Error **errp)
3766 {
3767     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3768     int index;
3769     SpaprDrc *drc;
3770     CPUCore *cc = CPU_CORE(dev);
3771 
3772     if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3773         error_setg(errp, "Unable to find CPU core with core-id: %d",
3774                    cc->core_id);
3775         return;
3776     }
3777     if (index == 0) {
3778         error_setg(errp, "Boot CPU core may not be unplugged");
3779         return;
3780     }
3781 
3782     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3783                           spapr_vcpu_id(spapr, cc->core_id));
3784     g_assert(drc);
3785 
3786     spapr_drc_detach(drc);
3787 
3788     spapr_hotplug_req_remove_by_index(drc);
3789 }
3790 
3791 int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3792                            void *fdt, int *fdt_start_offset, Error **errp)
3793 {
3794     SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3795     CPUState *cs = CPU(core->threads[0]);
3796     PowerPCCPU *cpu = POWERPC_CPU(cs);
3797     DeviceClass *dc = DEVICE_GET_CLASS(cs);
3798     int id = spapr_get_vcpu_id(cpu);
3799     char *nodename;
3800     int offset;
3801 
3802     nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3803     offset = fdt_add_subnode(fdt, 0, nodename);
3804     g_free(nodename);
3805 
3806     spapr_populate_cpu_dt(cs, fdt, offset, spapr);
3807 
3808     *fdt_start_offset = offset;
3809     return 0;
3810 }
3811 
3812 static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3813                             Error **errp)
3814 {
3815     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3816     MachineClass *mc = MACHINE_GET_CLASS(spapr);
3817     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
3818     SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
3819     CPUCore *cc = CPU_CORE(dev);
3820     CPUState *cs;
3821     SpaprDrc *drc;
3822     Error *local_err = NULL;
3823     CPUArchId *core_slot;
3824     int index;
3825     bool hotplugged = spapr_drc_hotplugged(dev);
3826 
3827     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3828     if (!core_slot) {
3829         error_setg(errp, "Unable to find CPU core with core-id: %d",
3830                    cc->core_id);
3831         return;
3832     }
3833     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3834                           spapr_vcpu_id(spapr, cc->core_id));
3835 
3836     g_assert(drc || !mc->has_hotpluggable_cpus);
3837 
3838     if (drc) {
3839         spapr_drc_attach(drc, dev, &local_err);
3840         if (local_err) {
3841             error_propagate(errp, local_err);
3842             return;
3843         }
3844 
3845         if (hotplugged) {
3846             /*
3847              * Send hotplug notification interrupt to the guest only
3848              * in case of hotplugged CPUs.
3849              */
3850             spapr_hotplug_req_add_by_index(drc);
3851         } else {
3852             spapr_drc_reset(drc);
3853         }
3854     }
3855 
3856     core_slot->cpu = OBJECT(dev);
3857 
3858     if (smc->pre_2_10_has_unused_icps) {
3859         int i;
3860 
3861         for (i = 0; i < cc->nr_threads; i++) {
3862             cs = CPU(core->threads[i]);
3863             pre_2_10_vmstate_unregister_dummy_icp(cs->cpu_index);
3864         }
3865     }
3866 }
3867 
3868 static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3869                                 Error **errp)
3870 {
3871     MachineState *machine = MACHINE(OBJECT(hotplug_dev));
3872     MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
3873     Error *local_err = NULL;
3874     CPUCore *cc = CPU_CORE(dev);
3875     const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
3876     const char *type = object_get_typename(OBJECT(dev));
3877     CPUArchId *core_slot;
3878     int index;
3879     unsigned int smp_threads = machine->smp.threads;
3880 
3881     if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
3882         error_setg(&local_err, "CPU hotplug not supported for this machine");
3883         goto out;
3884     }
3885 
3886     if (strcmp(base_core_type, type)) {
3887         error_setg(&local_err, "CPU core type should be %s", base_core_type);
3888         goto out;
3889     }
3890 
3891     if (cc->core_id % smp_threads) {
3892         error_setg(&local_err, "invalid core id %d", cc->core_id);
3893         goto out;
3894     }
3895 
3896     /*
3897      * In general we should have homogeneous threads-per-core, but old
3898      * (pre hotplug support) machine types allow the last core to have
3899      * reduced threads as a compatibility hack for when we allowed
3900      * total vcpus not a multiple of threads-per-core.
3901      */
3902     if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
3903         error_setg(&local_err, "invalid nr-threads %d, must be %d",
3904                    cc->nr_threads, smp_threads);
3905         goto out;
3906     }
3907 
3908     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3909     if (!core_slot) {
3910         error_setg(&local_err, "core id %d out of range", cc->core_id);
3911         goto out;
3912     }
3913 
3914     if (core_slot->cpu) {
3915         error_setg(&local_err, "core %d already populated", cc->core_id);
3916         goto out;
3917     }
3918 
3919     numa_cpu_pre_plug(core_slot, dev, &local_err);
3920 
3921 out:
3922     error_propagate(errp, local_err);
3923 }
3924 
3925 int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3926                           void *fdt, int *fdt_start_offset, Error **errp)
3927 {
3928     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
3929     int intc_phandle;
3930 
3931     intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
3932     if (intc_phandle <= 0) {
3933         return -1;
3934     }
3935 
3936     if (spapr_dt_phb(sphb, intc_phandle, fdt, spapr->irq->nr_msis,
3937                      fdt_start_offset)) {
3938         error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
3939         return -1;
3940     }
3941 
3942     /* generally SLOF creates these, for hotplug it's up to QEMU */
3943     _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
3944 
3945     return 0;
3946 }
3947 
3948 static void spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3949                                Error **errp)
3950 {
3951     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3952     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
3953     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3954     const unsigned windows_supported = spapr_phb_windows_supported(sphb);
3955 
3956     if (dev->hotplugged && !smc->dr_phb_enabled) {
3957         error_setg(errp, "PHB hotplug not supported for this machine");
3958         return;
3959     }
3960 
3961     if (sphb->index == (uint32_t)-1) {
3962         error_setg(errp, "\"index\" for PAPR PHB is mandatory");
3963         return;
3964     }
3965 
3966     /*
3967      * This will check that sphb->index doesn't exceed the maximum number of
3968      * PHBs for the current machine type.
3969      */
3970     smc->phb_placement(spapr, sphb->index,
3971                        &sphb->buid, &sphb->io_win_addr,
3972                        &sphb->mem_win_addr, &sphb->mem64_win_addr,
3973                        windows_supported, sphb->dma_liobn,
3974                        &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
3975                        errp);
3976 }
3977 
3978 static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3979                            Error **errp)
3980 {
3981     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3982     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3983     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
3984     SpaprDrc *drc;
3985     bool hotplugged = spapr_drc_hotplugged(dev);
3986     Error *local_err = NULL;
3987 
3988     if (!smc->dr_phb_enabled) {
3989         return;
3990     }
3991 
3992     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
3993     /* hotplug hooks should check it's enabled before getting this far */
3994     assert(drc);
3995 
3996     spapr_drc_attach(drc, DEVICE(dev), &local_err);
3997     if (local_err) {
3998         error_propagate(errp, local_err);
3999         return;
4000     }
4001 
4002     if (hotplugged) {
4003         spapr_hotplug_req_add_by_index(drc);
4004     } else {
4005         spapr_drc_reset(drc);
4006     }
4007 }
4008 
4009 void spapr_phb_release(DeviceState *dev)
4010 {
4011     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
4012 
4013     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
4014     object_unparent(OBJECT(dev));
4015 }
4016 
4017 static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4018 {
4019     object_property_set_bool(OBJECT(dev), false, "realized", NULL);
4020 }
4021 
4022 static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
4023                                      DeviceState *dev, Error **errp)
4024 {
4025     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4026     SpaprDrc *drc;
4027 
4028     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4029     assert(drc);
4030 
4031     if (!spapr_drc_unplug_requested(drc)) {
4032         spapr_drc_detach(drc);
4033         spapr_hotplug_req_remove_by_index(drc);
4034     }
4035 }
4036 
4037 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
4038                                       DeviceState *dev, Error **errp)
4039 {
4040     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4041         spapr_memory_plug(hotplug_dev, dev, errp);
4042     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4043         spapr_core_plug(hotplug_dev, dev, errp);
4044     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4045         spapr_phb_plug(hotplug_dev, dev, errp);
4046     }
4047 }
4048 
4049 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
4050                                         DeviceState *dev, Error **errp)
4051 {
4052     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4053         spapr_memory_unplug(hotplug_dev, dev);
4054     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4055         spapr_core_unplug(hotplug_dev, dev);
4056     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4057         spapr_phb_unplug(hotplug_dev, dev);
4058     }
4059 }
4060 
4061 static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
4062                                                 DeviceState *dev, Error **errp)
4063 {
4064     SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
4065     MachineClass *mc = MACHINE_GET_CLASS(sms);
4066     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4067 
4068     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4069         if (spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) {
4070             spapr_memory_unplug_request(hotplug_dev, dev, errp);
4071         } else {
4072             /* NOTE: this means there is a window after guest reset, prior to
4073              * CAS negotiation, where unplug requests will fail due to the
4074              * capability not being detected yet. This is a bit different than
4075              * the case with PCI unplug, where the events will be queued and
4076              * eventually handled by the guest after boot
4077              */
4078             error_setg(errp, "Memory hot unplug not supported for this guest");
4079         }
4080     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4081         if (!mc->has_hotpluggable_cpus) {
4082             error_setg(errp, "CPU hot unplug not supported on this machine");
4083             return;
4084         }
4085         spapr_core_unplug_request(hotplug_dev, dev, errp);
4086     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4087         if (!smc->dr_phb_enabled) {
4088             error_setg(errp, "PHB hot unplug not supported on this machine");
4089             return;
4090         }
4091         spapr_phb_unplug_request(hotplug_dev, dev, errp);
4092     }
4093 }
4094 
4095 static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
4096                                           DeviceState *dev, Error **errp)
4097 {
4098     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4099         spapr_memory_pre_plug(hotplug_dev, dev, errp);
4100     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4101         spapr_core_pre_plug(hotplug_dev, dev, errp);
4102     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4103         spapr_phb_pre_plug(hotplug_dev, dev, errp);
4104     }
4105 }
4106 
4107 static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
4108                                                  DeviceState *dev)
4109 {
4110     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
4111         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
4112         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4113         return HOTPLUG_HANDLER(machine);
4114     }
4115     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
4116         PCIDevice *pcidev = PCI_DEVICE(dev);
4117         PCIBus *root = pci_device_root_bus(pcidev);
4118         SpaprPhbState *phb =
4119             (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent),
4120                                                  TYPE_SPAPR_PCI_HOST_BRIDGE);
4121 
4122         if (phb) {
4123             return HOTPLUG_HANDLER(phb);
4124         }
4125     }
4126     return NULL;
4127 }
4128 
4129 static CpuInstanceProperties
4130 spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
4131 {
4132     CPUArchId *core_slot;
4133     MachineClass *mc = MACHINE_GET_CLASS(machine);
4134 
4135     /* make sure possible_cpu are intialized */
4136     mc->possible_cpu_arch_ids(machine);
4137     /* get CPU core slot containing thread that matches cpu_index */
4138     core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
4139     assert(core_slot);
4140     return core_slot->props;
4141 }
4142 
4143 static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
4144 {
4145     return idx / ms->smp.cores % nb_numa_nodes;
4146 }
4147 
4148 static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
4149 {
4150     int i;
4151     unsigned int smp_threads = machine->smp.threads;
4152     unsigned int smp_cpus = machine->smp.cpus;
4153     const char *core_type;
4154     int spapr_max_cores = machine->smp.max_cpus / smp_threads;
4155     MachineClass *mc = MACHINE_GET_CLASS(machine);
4156 
4157     if (!mc->has_hotpluggable_cpus) {
4158         spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
4159     }
4160     if (machine->possible_cpus) {
4161         assert(machine->possible_cpus->len == spapr_max_cores);
4162         return machine->possible_cpus;
4163     }
4164 
4165     core_type = spapr_get_cpu_core_type(machine->cpu_type);
4166     if (!core_type) {
4167         error_report("Unable to find sPAPR CPU Core definition");
4168         exit(1);
4169     }
4170 
4171     machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
4172                              sizeof(CPUArchId) * spapr_max_cores);
4173     machine->possible_cpus->len = spapr_max_cores;
4174     for (i = 0; i < machine->possible_cpus->len; i++) {
4175         int core_id = i * smp_threads;
4176 
4177         machine->possible_cpus->cpus[i].type = core_type;
4178         machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
4179         machine->possible_cpus->cpus[i].arch_id = core_id;
4180         machine->possible_cpus->cpus[i].props.has_core_id = true;
4181         machine->possible_cpus->cpus[i].props.core_id = core_id;
4182     }
4183     return machine->possible_cpus;
4184 }
4185 
4186 static void spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
4187                                 uint64_t *buid, hwaddr *pio,
4188                                 hwaddr *mmio32, hwaddr *mmio64,
4189                                 unsigned n_dma, uint32_t *liobns,
4190                                 hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4191 {
4192     /*
4193      * New-style PHB window placement.
4194      *
4195      * Goals: Gives large (1TiB), naturally aligned 64-bit MMIO window
4196      * for each PHB, in addition to 2GiB 32-bit MMIO and 64kiB PIO
4197      * windows.
4198      *
4199      * Some guest kernels can't work with MMIO windows above 1<<46
4200      * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
4201      *
4202      * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each
4203      * PHB stacked together.  (32TiB+2GiB)..(32TiB+64GiB) contains the
4204      * 2GiB 32-bit MMIO windows for each PHB.  Then 33..64TiB has the
4205      * 1TiB 64-bit MMIO windows for each PHB.
4206      */
4207     const uint64_t base_buid = 0x800000020000000ULL;
4208     int i;
4209 
4210     /* Sanity check natural alignments */
4211     QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4212     QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4213     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
4214     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
4215     /* Sanity check bounds */
4216     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
4217                       SPAPR_PCI_MEM32_WIN_SIZE);
4218     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
4219                       SPAPR_PCI_MEM64_WIN_SIZE);
4220 
4221     if (index >= SPAPR_MAX_PHBS) {
4222         error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
4223                    SPAPR_MAX_PHBS - 1);
4224         return;
4225     }
4226 
4227     *buid = base_buid + index;
4228     for (i = 0; i < n_dma; ++i) {
4229         liobns[i] = SPAPR_PCI_LIOBN(index, i);
4230     }
4231 
4232     *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
4233     *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
4234     *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
4235 
4236     *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
4237     *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
4238 }
4239 
4240 static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
4241 {
4242     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4243 
4244     return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
4245 }
4246 
4247 static void spapr_ics_resend(XICSFabric *dev)
4248 {
4249     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4250 
4251     ics_resend(spapr->ics);
4252 }
4253 
4254 static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
4255 {
4256     PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
4257 
4258     return cpu ? spapr_cpu_state(cpu)->icp : NULL;
4259 }
4260 
4261 static void spapr_pic_print_info(InterruptStatsProvider *obj,
4262                                  Monitor *mon)
4263 {
4264     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
4265 
4266     spapr->irq->print_info(spapr, mon);
4267 }
4268 
4269 int spapr_get_vcpu_id(PowerPCCPU *cpu)
4270 {
4271     return cpu->vcpu_id;
4272 }
4273 
4274 void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
4275 {
4276     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
4277     MachineState *ms = MACHINE(spapr);
4278     int vcpu_id;
4279 
4280     vcpu_id = spapr_vcpu_id(spapr, cpu_index);
4281 
4282     if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
4283         error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
4284         error_append_hint(errp, "Adjust the number of cpus to %d "
4285                           "or try to raise the number of threads per core\n",
4286                           vcpu_id * ms->smp.threads / spapr->vsmt);
4287         return;
4288     }
4289 
4290     cpu->vcpu_id = vcpu_id;
4291 }
4292 
4293 PowerPCCPU *spapr_find_cpu(int vcpu_id)
4294 {
4295     CPUState *cs;
4296 
4297     CPU_FOREACH(cs) {
4298         PowerPCCPU *cpu = POWERPC_CPU(cs);
4299 
4300         if (spapr_get_vcpu_id(cpu) == vcpu_id) {
4301             return cpu;
4302         }
4303     }
4304 
4305     return NULL;
4306 }
4307 
4308 static void spapr_machine_class_init(ObjectClass *oc, void *data)
4309 {
4310     MachineClass *mc = MACHINE_CLASS(oc);
4311     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
4312     FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
4313     NMIClass *nc = NMI_CLASS(oc);
4314     HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
4315     PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
4316     XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
4317     InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
4318 
4319     mc->desc = "pSeries Logical Partition (PAPR compliant)";
4320     mc->ignore_boot_device_suffixes = true;
4321 
4322     /*
4323      * We set up the default / latest behaviour here.  The class_init
4324      * functions for the specific versioned machine types can override
4325      * these details for backwards compatibility
4326      */
4327     mc->init = spapr_machine_init;
4328     mc->reset = spapr_machine_reset;
4329     mc->block_default_type = IF_SCSI;
4330     mc->max_cpus = 1024;
4331     mc->no_parallel = 1;
4332     mc->default_boot_order = "";
4333     mc->default_ram_size = 512 * MiB;
4334     mc->default_display = "std";
4335     mc->kvm_type = spapr_kvm_type;
4336     machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
4337     mc->pci_allow_0_address = true;
4338     assert(!mc->get_hotplug_handler);
4339     mc->get_hotplug_handler = spapr_get_hotplug_handler;
4340     hc->pre_plug = spapr_machine_device_pre_plug;
4341     hc->plug = spapr_machine_device_plug;
4342     mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
4343     mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
4344     mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
4345     hc->unplug_request = spapr_machine_device_unplug_request;
4346     hc->unplug = spapr_machine_device_unplug;
4347 
4348     smc->dr_lmb_enabled = true;
4349     smc->update_dt_enabled = true;
4350     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.0");
4351     mc->has_hotpluggable_cpus = true;
4352     smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
4353     fwc->get_dev_path = spapr_get_fw_dev_path;
4354     nc->nmi_monitor_handler = spapr_nmi;
4355     smc->phb_placement = spapr_phb_placement;
4356     vhc->hypercall = emulate_spapr_hypercall;
4357     vhc->hpt_mask = spapr_hpt_mask;
4358     vhc->map_hptes = spapr_map_hptes;
4359     vhc->unmap_hptes = spapr_unmap_hptes;
4360     vhc->hpte_set_c = spapr_hpte_set_c;
4361     vhc->hpte_set_r = spapr_hpte_set_r;
4362     vhc->get_pate = spapr_get_pate;
4363     vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
4364     xic->ics_get = spapr_ics_get;
4365     xic->ics_resend = spapr_ics_resend;
4366     xic->icp_get = spapr_icp_get;
4367     ispc->print_info = spapr_pic_print_info;
4368     /* Force NUMA node memory size to be a multiple of
4369      * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
4370      * in which LMBs are represented and hot-added
4371      */
4372     mc->numa_mem_align_shift = 28;
4373     mc->numa_mem_supported = true;
4374 
4375     smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
4376     smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
4377     smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
4378     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
4379     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
4380     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
4381     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
4382     smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
4383     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
4384     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
4385     spapr_caps_add_properties(smc, &error_abort);
4386     smc->irq = &spapr_irq_dual;
4387     smc->dr_phb_enabled = true;
4388 }
4389 
4390 static const TypeInfo spapr_machine_info = {
4391     .name          = TYPE_SPAPR_MACHINE,
4392     .parent        = TYPE_MACHINE,
4393     .abstract      = true,
4394     .instance_size = sizeof(SpaprMachineState),
4395     .instance_init = spapr_instance_init,
4396     .instance_finalize = spapr_machine_finalizefn,
4397     .class_size    = sizeof(SpaprMachineClass),
4398     .class_init    = spapr_machine_class_init,
4399     .interfaces = (InterfaceInfo[]) {
4400         { TYPE_FW_PATH_PROVIDER },
4401         { TYPE_NMI },
4402         { TYPE_HOTPLUG_HANDLER },
4403         { TYPE_PPC_VIRTUAL_HYPERVISOR },
4404         { TYPE_XICS_FABRIC },
4405         { TYPE_INTERRUPT_STATS_PROVIDER },
4406         { }
4407     },
4408 };
4409 
/*
 * DEFINE_SPAPR_MACHINE(suffix, verstr, latest)
 *
 * Generate the boilerplate for one versioned machine type:
 *  - a class_init function that calls
 *    spapr_machine_<suffix>_class_options(), which must already be
 *    defined, and, when @latest is true, also sets the "pseries" alias
 *    and marks the machine as the default;
 *  - a TypeInfo named "pseries-<verstr>" (wrapped by MACHINE_TYPE_NAME)
 *    whose parent is TYPE_SPAPR_MACHINE;
 *  - a registration function hooked up through type_init().
 *
 * The expansion ends without a semicolon, so the invocation supplies it.
 */
#define DEFINE_SPAPR_MACHINE(suffix, verstr, latest)                 \
    static void spapr_machine_##suffix##_class_init(ObjectClass *oc, \
                                                    void *data)      \
    {                                                                \
        MachineClass *mc = MACHINE_CLASS(oc);                        \
        spapr_machine_##suffix##_class_options(mc);                  \
        if (latest) {                                                \
            mc->alias = "pseries";                                   \
            mc->is_default = 1;                                      \
        }                                                            \
    }                                                                \
    static const TypeInfo spapr_machine_##suffix##_info = {          \
        .name = MACHINE_TYPE_NAME("pseries-" verstr),                \
        .parent = TYPE_SPAPR_MACHINE,                                \
        .class_init = spapr_machine_##suffix##_class_init,           \
    };                                                               \
    static void spapr_machine_register_##suffix(void)                \
    {                                                                \
        type_register(&spapr_machine_##suffix##_info);               \
    }                                                                \
    type_init(spapr_machine_register_##suffix)
4431 
4432 /*
4433  * pseries-4.1
4434  */
4435 static void spapr_machine_4_1_class_options(MachineClass *mc)
4436 {
4437     /* Defaults for the latest behaviour inherited from the base class */
4438 }
4439 
4440 DEFINE_SPAPR_MACHINE(4_1, "4.1", true);
4441 
4442 /*
4443  * pseries-4.0
4444  */
4445 static void phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
4446                               uint64_t *buid, hwaddr *pio,
4447                               hwaddr *mmio32, hwaddr *mmio64,
4448                               unsigned n_dma, uint32_t *liobns,
4449                               hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4450 {
4451     spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma, liobns,
4452                         nv2gpa, nv2atsd, errp);
4453     *nv2gpa = 0;
4454     *nv2atsd = 0;
4455 }
4456 
4457 static void spapr_machine_4_0_class_options(MachineClass *mc)
4458 {
4459     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4460 
4461     spapr_machine_4_1_class_options(mc);
4462     compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
4463     smc->phb_placement = phb_placement_4_0;
4464     smc->irq = &spapr_irq_xics;
4465     smc->pre_4_1_migration = true;
4466 }
4467 
4468 DEFINE_SPAPR_MACHINE(4_0, "4.0", false);
4469 
4470 /*
4471  * pseries-3.1
4472  */
4473 static void spapr_machine_3_1_class_options(MachineClass *mc)
4474 {
4475     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4476 
4477     spapr_machine_4_0_class_options(mc);
4478     compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);
4479 
4480     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
4481     smc->update_dt_enabled = false;
4482     smc->dr_phb_enabled = false;
4483     smc->broken_host_serial_model = true;
4484     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
4485     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
4486     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
4487     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
4488 }
4489 
4490 DEFINE_SPAPR_MACHINE(3_1, "3.1", false);
4491 
4492 /*
4493  * pseries-3.0
4494  */
4495 
4496 static void spapr_machine_3_0_class_options(MachineClass *mc)
4497 {
4498     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4499 
4500     spapr_machine_3_1_class_options(mc);
4501     compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);
4502 
4503     smc->legacy_irq_allocation = true;
4504     smc->irq = &spapr_irq_xics_legacy;
4505 }
4506 
4507 DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
4508 
4509 /*
4510  * pseries-2.12
4511  */
4512 static void spapr_machine_2_12_class_options(MachineClass *mc)
4513 {
4514     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4515     static GlobalProperty compat[] = {
4516         { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" },
4517         { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" },
4518     };
4519 
4520     spapr_machine_3_0_class_options(mc);
4521     compat_props_add(mc->compat_props, hw_compat_2_12, hw_compat_2_12_len);
4522     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4523 
4524     /* We depend on kvm_enabled() to choose a default value for the
4525      * hpt-max-page-size capability. Of course we can't do it here
4526      * because this is too early and the HW accelerator isn't initialzed
4527      * yet. Postpone this to machine init (see default_caps_with_cpu()).
4528      */
4529     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0;
4530 }
4531 
4532 DEFINE_SPAPR_MACHINE(2_12, "2.12", false);
4533 
4534 static void spapr_machine_2_12_sxxm_class_options(MachineClass *mc)
4535 {
4536     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4537 
4538     spapr_machine_2_12_class_options(mc);
4539     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
4540     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
4541     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD;
4542 }
4543 
4544 DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);
4545 
4546 /*
4547  * pseries-2.11
4548  */
4549 
4550 static void spapr_machine_2_11_class_options(MachineClass *mc)
4551 {
4552     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4553 
4554     spapr_machine_2_12_class_options(mc);
4555     smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON;
4556     compat_props_add(mc->compat_props, hw_compat_2_11, hw_compat_2_11_len);
4557 }
4558 
4559 DEFINE_SPAPR_MACHINE(2_11, "2.11", false);
4560 
4561 /*
4562  * pseries-2.10
4563  */
4564 
4565 static void spapr_machine_2_10_class_options(MachineClass *mc)
4566 {
4567     spapr_machine_2_11_class_options(mc);
4568     compat_props_add(mc->compat_props, hw_compat_2_10, hw_compat_2_10_len);
4569 }
4570 
4571 DEFINE_SPAPR_MACHINE(2_10, "2.10", false);
4572 
4573 /*
4574  * pseries-2.9
4575  */
4576 
4577 static void spapr_machine_2_9_class_options(MachineClass *mc)
4578 {
4579     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4580     static GlobalProperty compat[] = {
4581         { TYPE_POWERPC_CPU, "pre-2.10-migration", "on" },
4582     };
4583 
4584     spapr_machine_2_10_class_options(mc);
4585     compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len);
4586     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4587     mc->numa_auto_assign_ram = numa_legacy_auto_assign_ram;
4588     smc->pre_2_10_has_unused_icps = true;
4589     smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
4590 }
4591 
4592 DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
4593 
4594 /*
4595  * pseries-2.8
4596  */
4597 
4598 static void spapr_machine_2_8_class_options(MachineClass *mc)
4599 {
4600     static GlobalProperty compat[] = {
4601         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" },
4602     };
4603 
4604     spapr_machine_2_9_class_options(mc);
4605     compat_props_add(mc->compat_props, hw_compat_2_8, hw_compat_2_8_len);
4606     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4607     mc->numa_mem_align_shift = 23;
4608 }
4609 
4610 DEFINE_SPAPR_MACHINE(2_8, "2.8", false);
4611 
4612 /*
4613  * pseries-2.7
4614  */
4615 
4616 static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
4617                               uint64_t *buid, hwaddr *pio,
4618                               hwaddr *mmio32, hwaddr *mmio64,
4619                               unsigned n_dma, uint32_t *liobns,
4620                               hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4621 {
4622     /* Legacy PHB placement for pseries-2.7 and earlier machine types */
4623     const uint64_t base_buid = 0x800000020000000ULL;
4624     const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */
4625     const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */
4626     const hwaddr pio_offset = 0x80000000; /* 2 GiB */
4627     const uint32_t max_index = 255;
4628     const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */
4629 
4630     uint64_t ram_top = MACHINE(spapr)->ram_size;
4631     hwaddr phb0_base, phb_base;
4632     int i;
4633 
4634     /* Do we have device memory? */
4635     if (MACHINE(spapr)->maxram_size > ram_top) {
4636         /* Can't just use maxram_size, because there may be an
4637          * alignment gap between normal and device memory regions
4638          */
4639         ram_top = MACHINE(spapr)->device_memory->base +
4640             memory_region_size(&MACHINE(spapr)->device_memory->mr);
4641     }
4642 
4643     phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);
4644 
4645     if (index > max_index) {
4646         error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
4647                    max_index);
4648         return;
4649     }
4650 
4651     *buid = base_buid + index;
4652     for (i = 0; i < n_dma; ++i) {
4653         liobns[i] = SPAPR_PCI_LIOBN(index, i);
4654     }
4655 
4656     phb_base = phb0_base + index * phb_spacing;
4657     *pio = phb_base + pio_offset;
4658     *mmio32 = phb_base + mmio_offset;
4659     /*
4660      * We don't set the 64-bit MMIO window, relying on the PHB's
4661      * fallback behaviour of automatically splitting a large "32-bit"
4662      * window into contiguous 32-bit and 64-bit windows
4663      */
4664 
4665     *nv2gpa = 0;
4666     *nv2atsd = 0;
4667 }
4668 
4669 static void spapr_machine_2_7_class_options(MachineClass *mc)
4670 {
4671     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4672     static GlobalProperty compat[] = {
4673         { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000", },
4674         { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0", },
4675         { TYPE_POWERPC_CPU, "pre-2.8-migration", "on", },
4676         { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on", },
4677     };
4678 
4679     spapr_machine_2_8_class_options(mc);
4680     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3");
4681     mc->default_machine_opts = "modern-hotplug-events=off";
4682     compat_props_add(mc->compat_props, hw_compat_2_7, hw_compat_2_7_len);
4683     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4684     smc->phb_placement = phb_placement_2_7;
4685 }
4686 
4687 DEFINE_SPAPR_MACHINE(2_7, "2.7", false);
4688 
4689 /*
4690  * pseries-2.6
4691  */
4692 
4693 static void spapr_machine_2_6_class_options(MachineClass *mc)
4694 {
4695     static GlobalProperty compat[] = {
4696         { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" },
4697     };
4698 
4699     spapr_machine_2_7_class_options(mc);
4700     mc->has_hotpluggable_cpus = false;
4701     compat_props_add(mc->compat_props, hw_compat_2_6, hw_compat_2_6_len);
4702     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4703 }
4704 
4705 DEFINE_SPAPR_MACHINE(2_6, "2.6", false);
4706 
4707 /*
4708  * pseries-2.5
4709  */
4710 
4711 static void spapr_machine_2_5_class_options(MachineClass *mc)
4712 {
4713     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4714     static GlobalProperty compat[] = {
4715         { "spapr-vlan", "use-rx-buffer-pools", "off" },
4716     };
4717 
4718     spapr_machine_2_6_class_options(mc);
4719     smc->use_ohci_by_default = true;
4720     compat_props_add(mc->compat_props, hw_compat_2_5, hw_compat_2_5_len);
4721     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4722 }
4723 
4724 DEFINE_SPAPR_MACHINE(2_5, "2.5", false);
4725 
4726 /*
4727  * pseries-2.4
4728  */
4729 
4730 static void spapr_machine_2_4_class_options(MachineClass *mc)
4731 {
4732     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4733 
4734     spapr_machine_2_5_class_options(mc);
4735     smc->dr_lmb_enabled = false;
4736     compat_props_add(mc->compat_props, hw_compat_2_4, hw_compat_2_4_len);
4737 }
4738 
4739 DEFINE_SPAPR_MACHINE(2_4, "2.4", false);
4740 
4741 /*
4742  * pseries-2.3
4743  */
4744 
4745 static void spapr_machine_2_3_class_options(MachineClass *mc)
4746 {
4747     static GlobalProperty compat[] = {
4748         { "spapr-pci-host-bridge", "dynamic-reconfiguration", "off" },
4749     };
4750     spapr_machine_2_4_class_options(mc);
4751     compat_props_add(mc->compat_props, hw_compat_2_3, hw_compat_2_3_len);
4752     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4753 }
4754 DEFINE_SPAPR_MACHINE(2_3, "2.3", false);
4755 
4756 /*
4757  * pseries-2.2
4758  */
4759 
4760 static void spapr_machine_2_2_class_options(MachineClass *mc)
4761 {
4762     static GlobalProperty compat[] = {
4763         { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0x20000000" },
4764     };
4765 
4766     spapr_machine_2_3_class_options(mc);
4767     compat_props_add(mc->compat_props, hw_compat_2_2, hw_compat_2_2_len);
4768     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4769     mc->default_machine_opts = "modern-hotplug-events=off,suppress-vmdesc=on";
4770 }
4771 DEFINE_SPAPR_MACHINE(2_2, "2.2", false);
4772 
4773 /*
4774  * pseries-2.1
4775  */
4776 
4777 static void spapr_machine_2_1_class_options(MachineClass *mc)
4778 {
4779     spapr_machine_2_2_class_options(mc);
4780     compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len);
4781 }
4782 DEFINE_SPAPR_MACHINE(2_1, "2.1", false);
4783 
4784 static void spapr_machine_register_types(void)
4785 {
4786     type_register_static(&spapr_machine_info);
4787 }
4788 
4789 type_init(spapr_machine_register_types)
4790