xref: /openbmc/qemu/hw/ppc/spapr.c (revision 79e42085)
1 /*
2  * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
3  *
4  * Copyright (c) 2004-2007 Fabrice Bellard
5  * Copyright (c) 2007 Jocelyn Mayer
6  * Copyright (c) 2010 David Gibson, IBM Corporation.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  *
26  */
27 #include "qemu/osdep.h"
28 #include "qapi/error.h"
29 #include "qapi/visitor.h"
30 #include "sysemu/sysemu.h"
31 #include "sysemu/numa.h"
32 #include "sysemu/qtest.h"
33 #include "hw/hw.h"
34 #include "qemu/log.h"
35 #include "hw/fw-path-provider.h"
36 #include "elf.h"
37 #include "net/net.h"
38 #include "sysemu/device_tree.h"
39 #include "sysemu/cpus.h"
40 #include "sysemu/hw_accel.h"
41 #include "kvm_ppc.h"
42 #include "migration/misc.h"
43 #include "migration/global_state.h"
44 #include "migration/register.h"
45 #include "mmu-hash64.h"
46 #include "mmu-book3s-v3.h"
47 #include "cpu-models.h"
48 #include "qom/cpu.h"
49 
50 #include "hw/boards.h"
51 #include "hw/ppc/ppc.h"
52 #include "hw/loader.h"
53 
54 #include "hw/ppc/fdt.h"
55 #include "hw/ppc/spapr.h"
56 #include "hw/ppc/spapr_vio.h"
57 #include "hw/pci-host/spapr.h"
58 #include "hw/pci/msi.h"
59 
60 #include "hw/pci/pci.h"
61 #include "hw/scsi/scsi.h"
62 #include "hw/virtio/virtio-scsi.h"
63 #include "hw/virtio/vhost-scsi-common.h"
64 
65 #include "exec/address-spaces.h"
66 #include "exec/ram_addr.h"
67 #include "hw/usb.h"
68 #include "qemu/config-file.h"
69 #include "qemu/error-report.h"
70 #include "trace.h"
71 #include "hw/nmi.h"
72 #include "hw/intc/intc.h"
73 
74 #include "qemu/cutils.h"
75 #include "hw/ppc/spapr_cpu_core.h"
76 #include "hw/mem/memory-device.h"
77 
78 #include <libfdt.h>
79 
80 /* SLOF memory layout:
81  *
82  * The SLOF raw image is loaded at 0; it copies its romfs right below
83  * the flat device tree, then positions SLOF itself 31MB below that.
84  *
85  * So we set FW_OVERHEAD to 40MB, which should account for all of that
86  * and more.
87  *
88  * We load our kernel at 4MB, leaving space for the SLOF initial image.
89  */
90 #define FDT_MAX_SIZE            0x100000
91 #define RTAS_MAX_SIZE           0x10000
92 #define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
93 #define FW_MAX_SIZE             0x400000
94 #define FW_FILE_NAME            "slof.bin"
95 #define FW_OVERHEAD             0x2800000
96 #define KERNEL_LOAD_ADDR        FW_MAX_SIZE
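
/*
 * For illustration (values derived from the macros above, not additional
 * constraints): the kernel is loaded at KERNEL_LOAD_ADDR = FW_MAX_SIZE =
 * 0x400000 (4MB), and FW_OVERHEAD = 0x2800000 (40MB) accounts for the
 * space SLOF, its romfs and the flat device tree occupy below that area.
 */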
97 
98 #define MIN_RMA_SLOF            128UL
99 
100 #define PHANDLE_INTC            0x00001111
101 
102 /* These two functions implement the VCPU id numbering: one to compute them
103  * all and one to identify thread 0 of a VCORE. Any change to the first one
104  * is likely to have an impact on the second one, so let's keep them close.
105  */
106 static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
107 {
108     assert(spapr->vsmt);
109     return
110         (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
111 }
112 static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
113                                       PowerPCCPU *cpu)
114 {
115     assert(spapr->vsmt);
116     return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
117 }
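
/*
 * A worked example of the mapping above (numbers illustrative, not
 * mandated): with smp_threads = 4 and spapr->vsmt = 8, cpu_index 5 maps
 * to VCPU id (5 / 4) * 8 + 5 % 4 = 9, and exactly the VCPU ids that are
 * multiples of vsmt (0, 8, 16, ...) satisfy spapr_is_thread0_in_vcore().
 */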
118 
119 static bool pre_2_10_vmstate_dummy_icp_needed(void *opaque)
120 {
121     /* Dummy entries correspond to unused ICPState objects in older QEMUs,
122      * and newer QEMUs don't even have them. In both cases, we don't want
123      * to send anything on the wire.
124      */
125     return false;
126 }
127 
128 static const VMStateDescription pre_2_10_vmstate_dummy_icp = {
129     .name = "icp/server",
130     .version_id = 1,
131     .minimum_version_id = 1,
132     .needed = pre_2_10_vmstate_dummy_icp_needed,
133     .fields = (VMStateField[]) {
134         VMSTATE_UNUSED(4), /* uint32_t xirr */
135         VMSTATE_UNUSED(1), /* uint8_t pending_priority */
136         VMSTATE_UNUSED(1), /* uint8_t mfrr */
137         VMSTATE_END_OF_LIST()
138     },
139 };
140 
141 static void pre_2_10_vmstate_register_dummy_icp(int i)
142 {
143     vmstate_register(NULL, i, &pre_2_10_vmstate_dummy_icp,
144                      (void *)(uintptr_t) i);
145 }
146 
147 static void pre_2_10_vmstate_unregister_dummy_icp(int i)
148 {
149     vmstate_unregister(NULL, &pre_2_10_vmstate_dummy_icp,
150                        (void *)(uintptr_t) i);
151 }
152 
153 int spapr_max_server_number(SpaprMachineState *spapr)
154 {
155     assert(spapr->vsmt);
156     return DIV_ROUND_UP(max_cpus * spapr->vsmt, smp_threads);
157 }
158 
159 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
160                                   int smt_threads)
161 {
162     int i, ret = 0;
163     uint32_t servers_prop[smt_threads];
164     uint32_t gservers_prop[smt_threads * 2];
165     int index = spapr_get_vcpu_id(cpu);
166 
167     if (cpu->compat_pvr) {
168         ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
169         if (ret < 0) {
170             return ret;
171         }
172     }
173 
174     /* Build interrupt servers and gservers properties */
175     for (i = 0; i < smt_threads; i++) {
176         servers_prop[i] = cpu_to_be32(index + i);
177         /* Hack, direct the group queues back to cpu 0 */
178         gservers_prop[i*2] = cpu_to_be32(index + i);
179         gservers_prop[i*2 + 1] = 0;
180     }
181     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
182                       servers_prop, sizeof(servers_prop));
183     if (ret < 0) {
184         return ret;
185     }
186     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
187                       gservers_prop, sizeof(gservers_prop));
188 
189     return ret;
190 }
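
/*
 * For illustration: for a VCPU with id 8 and smt_threads = 2, the loop
 * above produces
 *   ibm,ppc-interrupt-server#s  = <8 9>
 *   ibm,ppc-interrupt-gserver#s = <8 0 9 0>
 * (cells big-endian), the zero gserver cells being the hack that directs
 * the group queues back to cpu 0.
 */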
191 
192 static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu)
193 {
194     int index = spapr_get_vcpu_id(cpu);
195     uint32_t associativity[] = {cpu_to_be32(0x5),
196                                 cpu_to_be32(0x0),
197                                 cpu_to_be32(0x0),
198                                 cpu_to_be32(0x0),
199                                 cpu_to_be32(cpu->node_id),
200                                 cpu_to_be32(index)};
201 
202     /* Advertise NUMA via ibm,associativity */
203     return fdt_setprop(fdt, offset, "ibm,associativity", associativity,
204                           sizeof(associativity));
205 }
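
/*
 * For illustration: a VCPU with id 4 on NUMA node 1 would get
 *   ibm,associativity = <5 0 0 0 1 4>
 * where the leading 5 is the number of associativity cells that follow.
 */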
206 
207 /* Populate the "ibm,pa-features" property */
208 static void spapr_populate_pa_features(SpaprMachineState *spapr,
209                                        PowerPCCPU *cpu,
210                                        void *fdt, int offset,
211                                        bool legacy_guest)
212 {
213     uint8_t pa_features_206[] = { 6, 0,
214         0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
215     uint8_t pa_features_207[] = { 24, 0,
216         0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
217         0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
218         0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
219         0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
220     uint8_t pa_features_300[] = { 66, 0,
221         /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
222         /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, SSO, 5: LE|CFAR|EB|LSQ */
223         0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /* 0 - 5 */
224         /* 6: DS207 */
225         0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
226         /* 16: Vector */
227         0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
228         /* 18: Vec. Scalar, 20: Vec. XOR, 22: HTM */
229         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
230         /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
231         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
232         /* 30: MMR, 32: LE atomic, 34: EBB + ext EBB */
233         0x80, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
234         /* 36: SPR SO, 38: Copy/Paste, 40: Radix MMU */
235         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 36 - 41 */
236         /* 42: PM, 44: PC RA, 46: SC vec'd */
237         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
238         /* 48: SIMD, 50: QP BFP, 52: String */
239         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
240         /* 54: DecFP, 56: DecI, 58: SHA */
241         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
242         /* 60: NM atomic, 62: RNG */
243         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
244     };
245     uint8_t *pa_features = NULL;
246     size_t pa_size;
247 
248     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
249         pa_features = pa_features_206;
250         pa_size = sizeof(pa_features_206);
251     }
252     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
253         pa_features = pa_features_207;
254         pa_size = sizeof(pa_features_207);
255     }
256     if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
257         pa_features = pa_features_300;
258         pa_size = sizeof(pa_features_300);
259     }
260     if (!pa_features) {
261         return;
262     }
263 
264     if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
265         /*
266          * Note: we keep CI large pages off by default because a 64K capable
267          * guest provisioned with large pages might otherwise try to map a qemu
268          * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
269          * even if that qemu runs on a 4k host.
270          * We add this bit back here if we are confident this is not an issue.
271          */
272         pa_features[3] |= 0x20;
273     }
274     if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
275         pa_features[24] |= 0x80;    /* Transactional memory support */
276     }
277     if (legacy_guest && pa_size > 40) {
278         /* Workaround for broken kernels that attempt (guest) radix
279          * mode when they can't handle it if they see the radix bit set
280          * in pa-features, so hide it from them. */
281         pa_features[40 + 2] &= ~0x80; /* Radix MMU */
282     }
283 
284     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
285 }
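
/*
 * A note on the encoding used above: the first two template bytes are
 * { number-of-attribute-bytes, encoding }, so attribute byte N lives at
 * array index N + 2. That is why the HTM bit (attribute byte 22) is set
 * via pa_features[24] and the radix bit (attribute byte 40) is cleared
 * via pa_features[40 + 2].
 */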
286 
287 static int spapr_fixup_cpu_dt(void *fdt, SpaprMachineState *spapr)
288 {
289     int ret = 0, offset, cpus_offset;
290     CPUState *cs;
291     char cpu_model[32];
292     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
293 
294     CPU_FOREACH(cs) {
295         PowerPCCPU *cpu = POWERPC_CPU(cs);
296         DeviceClass *dc = DEVICE_GET_CLASS(cs);
297         int index = spapr_get_vcpu_id(cpu);
298         int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
299 
300         if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
301             continue;
302         }
303 
304         snprintf(cpu_model, sizeof(cpu_model), "%s@%x", dc->fw_name, index);
305 
306         cpus_offset = fdt_path_offset(fdt, "/cpus");
307         if (cpus_offset < 0) {
308             cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
309             if (cpus_offset < 0) {
310                 return cpus_offset;
311             }
312         }
313         offset = fdt_subnode_offset(fdt, cpus_offset, cpu_model);
314         if (offset < 0) {
315             offset = fdt_add_subnode(fdt, cpus_offset, cpu_model);
316             if (offset < 0) {
317                 return offset;
318             }
319         }
320 
321         ret = fdt_setprop(fdt, offset, "ibm,pft-size",
322                           pft_size_prop, sizeof(pft_size_prop));
323         if (ret < 0) {
324             return ret;
325         }
326 
327         if (nb_numa_nodes > 1) {
328             ret = spapr_fixup_cpu_numa_dt(fdt, offset, cpu);
329             if (ret < 0) {
330                 return ret;
331             }
332         }
333 
334         ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt);
335         if (ret < 0) {
336             return ret;
337         }
338 
339         spapr_populate_pa_features(spapr, cpu, fdt, offset,
340                                    spapr->cas_legacy_guest_workaround);
341     }
342     return ret;
343 }
344 
345 static hwaddr spapr_node0_size(MachineState *machine)
346 {
347     if (nb_numa_nodes) {
348         int i;
349         for (i = 0; i < nb_numa_nodes; ++i) {
350             if (numa_info[i].node_mem) {
351                 return MIN(pow2floor(numa_info[i].node_mem),
352                            machine->ram_size);
353             }
354         }
355     }
356     return machine->ram_size;
357 }
358 
359 static void add_str(GString *s, const gchar *s1)
360 {
361     g_string_append_len(s, s1, strlen(s1) + 1);
362 }
363 
364 static int spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
365                                        hwaddr size)
366 {
367     uint32_t associativity[] = {
368         cpu_to_be32(0x4), /* length */
369         cpu_to_be32(0x0), cpu_to_be32(0x0),
370         cpu_to_be32(0x0), cpu_to_be32(nodeid)
371     };
372     char mem_name[32];
373     uint64_t mem_reg_property[2];
374     int off;
375 
376     mem_reg_property[0] = cpu_to_be64(start);
377     mem_reg_property[1] = cpu_to_be64(size);
378 
379     snprintf(mem_name, sizeof(mem_name), "memory@" TARGET_FMT_lx, start);
380     off = fdt_add_subnode(fdt, 0, mem_name);
381     _FDT(off);
382     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
383     _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
384                       sizeof(mem_reg_property))));
385     _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
386                       sizeof(associativity))));
387     return off;
388 }
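
/*
 * For illustration, spapr_populate_memory_node(fdt, 0, 0, 0x80000000)
 * yields a node along these lines (all cells big-endian on the wire):
 *
 *   memory@0 {
 *       device_type = "memory";
 *       reg = <0x0 0x0 0x0 0x80000000>;
 *       ibm,associativity = <0x4 0x0 0x0 0x0 0x0>;
 *   };
 */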
389 
390 static int spapr_populate_memory(SpaprMachineState *spapr, void *fdt)
391 {
392     MachineState *machine = MACHINE(spapr);
393     hwaddr mem_start, node_size;
394     int i, nb_nodes = nb_numa_nodes;
395     NodeInfo *nodes = numa_info;
396     NodeInfo ramnode;
397 
398     /* No NUMA nodes; assume there is just one node covering the whole RAM */
399     if (!nb_numa_nodes) {
400         nb_nodes = 1;
401         ramnode.node_mem = machine->ram_size;
402         nodes = &ramnode;
403     }
404 
405     for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
406         if (!nodes[i].node_mem) {
407             continue;
408         }
409         if (mem_start >= machine->ram_size) {
410             node_size = 0;
411         } else {
412             node_size = nodes[i].node_mem;
413             if (node_size > machine->ram_size - mem_start) {
414                 node_size = machine->ram_size - mem_start;
415             }
416         }
417         if (!mem_start) {
418             /* spapr_machine_init() checks for rma_size <= node0_size
419              * already */
420             spapr_populate_memory_node(fdt, i, 0, spapr->rma_size);
421             mem_start += spapr->rma_size;
422             node_size -= spapr->rma_size;
423         }
424         while (node_size) {
425             hwaddr sizetmp = pow2floor(node_size);
426 
427             /* mem_start != 0 here */
428             if (ctzl(mem_start) < ctzl(sizetmp)) {
429                 sizetmp = 1ULL << ctzl(mem_start);
430             }
431 
432             spapr_populate_memory_node(fdt, i, mem_start, sizetmp);
433             node_size -= sizetmp;
434             mem_start += sizetmp;
435         }
436     }
437 
438     return 0;
439 }
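
/*
 * A worked example of the splitting loop above (illustrative sizes): a
 * node with node_mem = 0x30000000 starting at mem_start = 0x10000000 is
 * emitted as memory@10000000 of size 0x10000000 (capped by the alignment
 * of mem_start, since ctzl(0x10000000) = 28 < ctzl(0x20000000) = 29) and
 * then memory@20000000 of size 0x20000000: each chunk is a power of two
 * no larger than the alignment of its start address.
 */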
440 
441 static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
442                                   SpaprMachineState *spapr)
443 {
444     PowerPCCPU *cpu = POWERPC_CPU(cs);
445     CPUPPCState *env = &cpu->env;
446     PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
447     int index = spapr_get_vcpu_id(cpu);
448     uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
449                        0xffffffff, 0xffffffff};
450     uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
451         : SPAPR_TIMEBASE_FREQ;
452     uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
453     uint32_t page_sizes_prop[64];
454     size_t page_sizes_prop_size;
455     uint32_t vcpus_per_socket = smp_threads * smp_cores;
456     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
457     int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
458     SpaprDrc *drc;
459     int drc_index;
460     uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
461     int i;
462 
463     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index);
464     if (drc) {
465         drc_index = spapr_drc_index(drc);
466         _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
467     }
468 
469     _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
470     _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
471 
472     _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
473     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
474                            env->dcache_line_size)));
475     _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
476                            env->dcache_line_size)));
477     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
478                            env->icache_line_size)));
479     _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
480                            env->icache_line_size)));
481 
482     if (pcc->l1_dcache_size) {
483         _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
484                                pcc->l1_dcache_size)));
485     } else {
486         warn_report("Unknown L1 dcache size for cpu");
487     }
488     if (pcc->l1_icache_size) {
489         _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
490                                pcc->l1_icache_size)));
491     } else {
492         warn_report("Unknown L1 icache size for cpu");
493     }
494 
495     _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
496     _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
497     _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
498     _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
499     _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
500     _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
501 
502     if (env->spr_cb[SPR_PURR].oea_read) {
503         _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
504     }
505     if (env->spr_cb[SPR_SPURR].oea_read) {
506         _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
507     }
508 
509     if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
510         _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
511                           segs, sizeof(segs))));
512     }
513 
514     /* Advertise VSX (vector extensions) if available
515      *   1               == VMX / Altivec available
516      *   2               == VSX available
517      *
518      * Only CPUs for which we create core types in spapr_cpu_core.c
519      * are possible, and all of those have VMX */
520     if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
521         _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
522     } else {
523         _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
524     }
525 
526     /* Advertise DFP (Decimal Floating Point) if available
527      *   0 / no property == no DFP
528      *   1               == DFP available */
529     if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
530         _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
531     }
532 
533     page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
534                                                       sizeof(page_sizes_prop));
535     if (page_sizes_prop_size) {
536         _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
537                           page_sizes_prop, page_sizes_prop_size)));
538     }
539 
540     spapr_populate_pa_features(spapr, cpu, fdt, offset, false);
541 
542     _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
543                            cs->cpu_index / vcpus_per_socket)));
544 
545     _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
546                       pft_size_prop, sizeof(pft_size_prop))));
547 
548     if (nb_numa_nodes > 1) {
549         _FDT(spapr_fixup_cpu_numa_dt(fdt, offset, cpu));
550     }
551 
552     _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));
553 
554     if (pcc->radix_page_info) {
555         for (i = 0; i < pcc->radix_page_info->count; i++) {
556             radix_AP_encodings[i] =
557                 cpu_to_be32(pcc->radix_page_info->entries[i]);
558         }
559         _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
560                           radix_AP_encodings,
561                           pcc->radix_page_info->count *
562                           sizeof(radix_AP_encodings[0]))));
563     }
564 
565     /*
566      * We set this property to let the guest know that it can use the
567      * large decrementer, and what its width is in bits.
568      */
569     if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF) {
570         _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
571                               pcc->lrg_decr_bits)));
572     }
572 }
573 
574 static void spapr_populate_cpus_dt_node(void *fdt, SpaprMachineState *spapr)
575 {
576     CPUState **rev;
577     CPUState *cs;
578     int n_cpus;
579     int cpus_offset;
580     char *nodename;
581     int i;
582 
583     cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
584     _FDT(cpus_offset);
585     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
586     _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));
587 
588     /*
589      * We walk the CPUs in reverse order to ensure that CPU DT nodes
590      * created by fdt_add_subnode() end up in the right order in the FDT
591      * for the guest kernel to enumerate the CPUs correctly.
592      *
593      * The CPU list cannot be traversed in reverse order, so we need
594      * to do extra work.
595      */
596     n_cpus = 0;
597     rev = NULL;
598     CPU_FOREACH(cs) {
599         rev = g_renew(CPUState *, rev, n_cpus + 1);
600         rev[n_cpus++] = cs;
601     }
602 
603     for (i = n_cpus - 1; i >= 0; i--) {
604         CPUState *cs = rev[i];
605         PowerPCCPU *cpu = POWERPC_CPU(cs);
606         int index = spapr_get_vcpu_id(cpu);
607         DeviceClass *dc = DEVICE_GET_CLASS(cs);
608         int offset;
609 
610         if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
611             continue;
612         }
613 
614         nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
615         offset = fdt_add_subnode(fdt, cpus_offset, nodename);
616         g_free(nodename);
617         _FDT(offset);
618         spapr_populate_cpu_dt(cs, fdt, offset, spapr);
619     }
620 
621     g_free(rev);
622 }
623 
624 static int spapr_rng_populate_dt(void *fdt)
625 {
626     int node;
627     int ret;
628 
629     node = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
630     if (node <= 0) {
631         return -1;
632     }
633     ret = fdt_setprop_string(fdt, node, "device_type",
634                              "ibm,platform-facilities");
635     ret |= fdt_setprop_cell(fdt, node, "#address-cells", 0x1);
636     ret |= fdt_setprop_cell(fdt, node, "#size-cells", 0x0);
637 
638     node = fdt_add_subnode(fdt, node, "ibm,random-v1");
639     if (node <= 0) {
640         return -1;
641     }
642     ret |= fdt_setprop_string(fdt, node, "compatible", "ibm,random");
643 
644     return ret ? -1 : 0;
645 }
646 
647 static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
648 {
649     MemoryDeviceInfoList *info;
650 
651     for (info = list; info; info = info->next) {
652         MemoryDeviceInfo *value = info->value;
653 
654         if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
655             PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;
656 
657             if (addr >= pcdimm_info->addr &&
658                 addr < (pcdimm_info->addr + pcdimm_info->size)) {
659                 return pcdimm_info->node;
660             }
661         }
662     }
663 
664     return -1;
665 }
666 
667 struct sPAPRDrconfCellV2 {
668      uint32_t seq_lmbs;
669      uint64_t base_addr;
670      uint32_t drc_index;
671      uint32_t aa_index;
672      uint32_t flags;
673 } QEMU_PACKED;
674 
675 typedef struct DrconfCellQueue {
676     struct sPAPRDrconfCellV2 cell;
677     QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
678 } DrconfCellQueue;
679 
680 static DrconfCellQueue *
681 spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
682                       uint32_t drc_index, uint32_t aa_index,
683                       uint32_t flags)
684 {
685     DrconfCellQueue *elem;
686 
687     elem = g_malloc0(sizeof(*elem));
688     elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
689     elem->cell.base_addr = cpu_to_be64(base_addr);
690     elem->cell.drc_index = cpu_to_be32(drc_index);
691     elem->cell.aa_index = cpu_to_be32(aa_index);
692     elem->cell.flags = cpu_to_be32(flags);
693 
694     return elem;
695 }
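
/*
 * For illustration (sizes are examples, not requirements): with 256MB
 * LMBs and device_memory->base at 1GB, the first cell queued by
 * spapr_populate_drmem_v2() below would be
 *   { seq_lmbs = 4, base_addr = 0, drc_index = 0, aa_index = -1,
 *     flags = SPAPR_LMB_FLAGS_RESERVED | SPAPR_LMB_FLAGS_DRC_INVALID }
 * covering all four boot-time LMBs with a single entry.
 */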
696 
697 /* ibm,dynamic-memory-v2 */
698 static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt,
699                                    int offset, MemoryDeviceInfoList *dimms)
700 {
701     MachineState *machine = MACHINE(spapr);
702     uint8_t *int_buf, *cur_index;
703     int ret;
704     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
705     uint64_t addr, cur_addr, size;
706     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
707     uint64_t mem_end = machine->device_memory->base +
708                        memory_region_size(&machine->device_memory->mr);
709     uint32_t node, buf_len, nr_entries = 0;
710     SpaprDrc *drc;
711     DrconfCellQueue *elem, *next;
712     MemoryDeviceInfoList *info;
713     QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
714         = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);
715 
716     /* Entry to cover RAM and the gap area */
717     elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
718                                  SPAPR_LMB_FLAGS_RESERVED |
719                                  SPAPR_LMB_FLAGS_DRC_INVALID);
720     QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
721     nr_entries++;
722 
723     cur_addr = machine->device_memory->base;
724     for (info = dimms; info; info = info->next) {
725         PCDIMMDeviceInfo *di = info->value->u.dimm.data;
726 
727         addr = di->addr;
728         size = di->size;
729         node = di->node;
730 
731         /* Entry for hot-pluggable area */
732         if (cur_addr < addr) {
733             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
734             g_assert(drc);
735             elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
736                                          cur_addr, spapr_drc_index(drc), -1, 0);
737             QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
738             nr_entries++;
739         }
740 
741         /* Entry for DIMM */
742         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
743         g_assert(drc);
744         elem = spapr_get_drconf_cell(size / lmb_size, addr,
745                                      spapr_drc_index(drc), node,
746                                      SPAPR_LMB_FLAGS_ASSIGNED);
747         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
748         nr_entries++;
749         cur_addr = addr + size;
750     }
751 
752     /* Entry for the remaining hot-pluggable area */
753     if (cur_addr < mem_end) {
754         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
755         g_assert(drc);
756         elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
757                                      cur_addr, spapr_drc_index(drc), -1, 0);
758         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
759         nr_entries++;
760     }
761 
762     buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
763     int_buf = cur_index = g_malloc0(buf_len);
764     *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
765     cur_index += sizeof(nr_entries);
766 
767     QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
768         memcpy(cur_index, &elem->cell, sizeof(elem->cell));
769         cur_index += sizeof(elem->cell);
770         QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
771         g_free(elem);
772     }
773 
774     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
775     g_free(int_buf);
776     if (ret < 0) {
777         return -1;
778     }
779     return 0;
780 }
781 
782 /* ibm,dynamic-memory */
783 static int spapr_populate_drmem_v1(SpaprMachineState *spapr, void *fdt,
784                                    int offset, MemoryDeviceInfoList *dimms)
785 {
786     MachineState *machine = MACHINE(spapr);
787     int i, ret;
788     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
789     uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
790     uint32_t nr_lmbs = (machine->device_memory->base +
791                        memory_region_size(&machine->device_memory->mr)) /
792                        lmb_size;
793     uint32_t *int_buf, *cur_index, buf_len;
794 
795     /*
796      * Allocate a buffer large enough to hold the ibm,dynamic-memory property
797      */
798     buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
799     cur_index = int_buf = g_malloc0(buf_len);
800     int_buf[0] = cpu_to_be32(nr_lmbs);
801     cur_index++;
802     for (i = 0; i < nr_lmbs; i++) {
803         uint64_t addr = i * lmb_size;
804         uint32_t *dynamic_memory = cur_index;
805 
806         if (i >= device_lmb_start) {
807             SpaprDrc *drc;
808 
809             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
810             g_assert(drc);
811 
812             dynamic_memory[0] = cpu_to_be32(addr >> 32);
813             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
814             dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
815             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
816             dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
817             if (memory_region_present(get_system_memory(), addr)) {
818                 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
819             } else {
820                 dynamic_memory[5] = cpu_to_be32(0);
821             }
822         } else {
823             /*
824              * LMB information for the RMA, boot-time RAM and the gap
825              * between RAM and the device memory region -- all these are
826              * marked as reserved and as having no valid DRC.
827              */
828             dynamic_memory[0] = cpu_to_be32(addr >> 32);
829             dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
830             dynamic_memory[2] = cpu_to_be32(0);
831             dynamic_memory[3] = cpu_to_be32(0); /* reserved */
832             dynamic_memory[4] = cpu_to_be32(-1);
833             dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
834                                             SPAPR_LMB_FLAGS_DRC_INVALID);
835         }
836 
837         cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
838     }
839     ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
840     g_free(int_buf);
841     if (ret < 0) {
842         return -1;
843     }
844     return 0;
845 }
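
/*
 * For reference, each ibm,dynamic-memory entry built above is six cells:
 *   <addr-hi addr-lo drc-index reserved aa-index flags>
 * Hotpluggable LMBs carry a real DRC index and, when plugged,
 * SPAPR_LMB_FLAGS_ASSIGNED; boot-time LMBs instead carry a zero DRC
 * index, an aa-index of -1 and SPAPR_LMB_FLAGS_RESERVED |
 * SPAPR_LMB_FLAGS_DRC_INVALID.
 */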
846 
847 /*
848  * Adds ibm,dynamic-reconfiguration-memory node.
849  * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
850  * of this device tree node.
851  */
852 static int spapr_populate_drconf_memory(SpaprMachineState *spapr, void *fdt)
853 {
854     MachineState *machine = MACHINE(spapr);
855     int ret, i, offset;
856     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
857     uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
858     uint32_t *int_buf, *cur_index, buf_len;
859     int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1;
860     MemoryDeviceInfoList *dimms = NULL;
861 
862     /*
863      * Don't create the node if there is no device memory
864      */
865     if (machine->ram_size == machine->maxram_size) {
866         return 0;
867     }
868 
869     offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");
870 
871     ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
872                     sizeof(prop_lmb_size));
873     if (ret < 0) {
874         return ret;
875     }
876 
877     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
878     if (ret < 0) {
879         return ret;
880     }
881 
882     ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
883     if (ret < 0) {
884         return ret;
885     }
886 
887     /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
888     dimms = qmp_memory_device_list();
889     if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
890         ret = spapr_populate_drmem_v2(spapr, fdt, offset, dimms);
891     } else {
892         ret = spapr_populate_drmem_v1(spapr, fdt, offset, dimms);
893     }
894     qapi_free_MemoryDeviceInfoList(dimms);
895 
896     if (ret < 0) {
897         return ret;
898     }
899 
900     /* ibm,associativity-lookup-arrays */
901     buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t);
902     cur_index = int_buf = g_malloc0(buf_len);
903     int_buf[0] = cpu_to_be32(nr_nodes);
904     int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */
905     cur_index += 2;
906     for (i = 0; i < nr_nodes; i++) {
907         uint32_t associativity[] = {
908             cpu_to_be32(0x0),
909             cpu_to_be32(0x0),
910             cpu_to_be32(0x0),
911             cpu_to_be32(i)
912         };
913         memcpy(cur_index, associativity, sizeof(associativity));
914         cur_index += 4;
915     }
916     ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf,
917             (cur_index - int_buf) * sizeof(uint32_t));
918     g_free(int_buf);
919 
920     return ret;
921 }
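
/*
 * For illustration: with two NUMA nodes the lookup array built above is
 *   ibm,associativity-lookup-arrays = <2 4  0 0 0 0  0 0 0 1>
 * i.e. the number of arrays, the entries per array, then one 4-cell
 * associativity array per node.
 */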
922 
923 static int spapr_dt_cas_updates(SpaprMachineState *spapr, void *fdt,
924                                 SpaprOptionVector *ov5_updates)
925 {
926     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
927     int ret = 0, offset;
928 
929     /* Generate ibm,dynamic-reconfiguration-memory node if required */
930     if (spapr_ovec_test(ov5_updates, OV5_DRCONF_MEMORY)) {
931         g_assert(smc->dr_lmb_enabled);
932         ret = spapr_populate_drconf_memory(spapr, fdt);
933         if (ret) {
934             goto out;
935         }
936     }
937 
938     offset = fdt_path_offset(fdt, "/chosen");
939     if (offset < 0) {
940         offset = fdt_add_subnode(fdt, 0, "chosen");
941         if (offset < 0) {
942             return offset;
943         }
944     }
945     ret = spapr_ovec_populate_dt(fdt, offset, spapr->ov5_cas,
946                                  "ibm,architecture-vec-5");
947 
948 out:
949     return ret;
950 }
951 
952 static bool spapr_hotplugged_dev_before_cas(void)
953 {
954     Object *drc_container, *obj;
955     ObjectProperty *prop;
956     ObjectPropertyIterator iter;
957 
958     drc_container = container_get(object_get_root(), "/dr-connector");
959     object_property_iter_init(&iter, drc_container);
960     while ((prop = object_property_iter_next(&iter))) {
961         if (!strstart(prop->type, "link<", NULL)) {
962             continue;
963         }
964         obj = object_property_get_link(drc_container, prop->name, NULL);
965         if (spapr_drc_needed(obj)) {
966             return true;
967         }
968     }
969     return false;
970 }
971 
972 int spapr_h_cas_compose_response(SpaprMachineState *spapr,
973                                  target_ulong addr, target_ulong size,
974                                  SpaprOptionVector *ov5_updates)
975 {
976     void *fdt, *fdt_skel;
977     SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 };
978 
979     if (spapr_hotplugged_dev_before_cas()) {
980         return 1;
981     }
982 
983     if (size < sizeof(hdr) || size > FW_MAX_SIZE) {
984         error_report("SLOF provided an unexpected CAS buffer size "
985                      TARGET_FMT_lu " (min: %zu, max: %u)",
986                      size, sizeof(hdr), FW_MAX_SIZE);
987         exit(EXIT_FAILURE);
988     }
989 
990     size -= sizeof(hdr);
991 
992     /* Create skeleton */
993     fdt_skel = g_malloc0(size);
994     _FDT((fdt_create(fdt_skel, size)));
995     _FDT((fdt_finish_reservemap(fdt_skel)));
996     _FDT((fdt_begin_node(fdt_skel, "")));
997     _FDT((fdt_end_node(fdt_skel)));
998     _FDT((fdt_finish(fdt_skel)));
999     fdt = g_malloc0(size);
1000     _FDT((fdt_open_into(fdt_skel, fdt, size)));
1001     g_free(fdt_skel);
1002 
1003     /* Fixup cpu nodes */
1004     _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
1005 
1006     if (spapr_dt_cas_updates(spapr, fdt, ov5_updates)) {
1007         g_free(fdt);
1008         return -1;
1009     }
1009 
1010     /* Pack resulting tree */
1011     _FDT((fdt_pack(fdt)));
1012 
1013     if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
1014         trace_spapr_cas_failed(size);
1015         g_free(fdt);
1016         return -1;
1017     }
1017 
1018     cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
1019     cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
1020     trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
1021     g_free(fdt);
1022 
1023     return 0;
1024 }
1025 
1026 static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
1027 {
1028     int rtas;
1029     GString *hypertas = g_string_sized_new(256);
1030     GString *qemu_hypertas = g_string_sized_new(256);
1031     uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
1032     uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
1033         memory_region_size(&MACHINE(spapr)->device_memory->mr);
1034     uint32_t lrdr_capacity[] = {
1035         cpu_to_be32(max_device_addr >> 32),
1036         cpu_to_be32(max_device_addr & 0xffffffff),
1037         0, cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE),
1038         cpu_to_be32(max_cpus / smp_threads),
1039     };
1040     uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0);
1041     uint32_t maxdomains[] = {
1042         cpu_to_be32(4),
1043         maxdomain,
1044         maxdomain,
1045         maxdomain,
1046         cpu_to_be32(spapr->gpu_numa_id),
1047     };
1048 
1049     _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));
1050 
1051     /* hypertas */
1052     add_str(hypertas, "hcall-pft");
1053     add_str(hypertas, "hcall-term");
1054     add_str(hypertas, "hcall-dabr");
1055     add_str(hypertas, "hcall-interrupt");
1056     add_str(hypertas, "hcall-tce");
1057     add_str(hypertas, "hcall-vio");
1058     add_str(hypertas, "hcall-splpar");
1059     add_str(hypertas, "hcall-bulk");
1060     add_str(hypertas, "hcall-set-mode");
1061     add_str(hypertas, "hcall-sprg0");
1062     add_str(hypertas, "hcall-copy");
1063     add_str(hypertas, "hcall-debug");
1064     add_str(hypertas, "hcall-vphn");
1065     add_str(qemu_hypertas, "hcall-memop1");
1066 
1067     if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
1068         add_str(hypertas, "hcall-multi-tce");
1069     }
1070 
1071     if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
1072         add_str(hypertas, "hcall-hpt-resize");
1073     }
1074 
1075     _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
1076                      hypertas->str, hypertas->len));
1077     g_string_free(hypertas, TRUE);
1078     _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
1079                      qemu_hypertas->str, qemu_hypertas->len));
1080     g_string_free(qemu_hypertas, TRUE);
1081 
1082     _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
1083                      refpoints, sizeof(refpoints)));
1084 
1085     _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
1086                      maxdomains, sizeof(maxdomains)));
1087 
1088     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
1089                           RTAS_ERROR_LOG_MAX));
1090     _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
1091                           RTAS_EVENT_SCAN_RATE));
1092 
1093     g_assert(msi_nonbroken);
1094     _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));
1095 
1096     /*
1097      * According to PAPR, the RTAS ibm,os-term call does not guarantee
1098      * a return back to the guest CPU.
1099      *
1100      * The additional ibm,extended-os-term property indicates that the
1101      * RTAS call will always return. Set this property.
1102      */
1103     _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));
1104 
1105     _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
1106                      lrdr_capacity, sizeof(lrdr_capacity)));
1107 
1108     spapr_dt_rtas_tokens(fdt, rtas);
1109 }
1110 
1111 /*
1112  * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
1113  * and the XIVE features that the guest may request and thus the valid
1114  * values for bytes 23..26 of option vector 5:
1115  */
1116 static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
1117                                           int chosen)
1118 {
1119     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
1120 
1121     char val[2 * 4] = {
1122         23, spapr->irq->ov5, /* Xive mode. */
1123         24, 0x00, /* Hash/Radix, filled in below. */
1124         25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
1125         26, 0x40, /* Radix options: GTSE == yes. */
1126     };
1127 
1128     if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
1129                           first_ppc_cpu->compat_pvr)) {
1130         /*
1131          * If we're in a pre-POWER9 compat mode then the guest should
1132          * use hash and the legacy interrupt mode
1133          */
1134         val[1] = 0x00; /* XICS */
1135         val[3] = 0x00; /* Hash */
1136     } else if (kvm_enabled()) {
1137         if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
1138             val[3] = 0x80; /* OV5_MMU_BOTH */
1139         } else if (kvmppc_has_cap_mmu_radix()) {
1140             val[3] = 0x40; /* OV5_MMU_RADIX_300 */
1141         } else {
1142             val[3] = 0x00; /* Hash */
1143         }
1144     } else {
1145         /* V3 MMU supports both hash and radix in TCG (with dynamic switching) */
1146         val[3] = 0xC0;
1147     }
1148     _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
1149                      val, sizeof(val)));
1150 }
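
/*
 * For illustration: on a KVM host whose MMU supports both hash and radix
 * for a POWER9-mode guest, the property bytes would be
 *   <23 irq-ov5> <24 0x80> <25 0x00> <26 0x40>
 * i.e. byte 24 advertising both MMU modes and byte 26 advertising radix
 * GTSE.
 */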
1151 
1152 static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt)
1153 {
1154     MachineState *machine = MACHINE(spapr);
1155     int chosen;
1156     const char *boot_device = machine->boot_order;
1157     char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
1158     size_t cb = 0;
1159     char *bootlist = get_boot_devices_list(&cb);
1160 
1161     _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
1162 
1163     _FDT(fdt_setprop_string(fdt, chosen, "bootargs", machine->kernel_cmdline));
1164     _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
1165                           spapr->initrd_base));
1166     _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
1167                           spapr->initrd_base + spapr->initrd_size));
1168 
1169     if (spapr->kernel_size) {
1170         uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
1171                               cpu_to_be64(spapr->kernel_size) };
1172 
1173         _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
1174                          &kprop, sizeof(kprop)));
1175         if (spapr->kernel_le) {
1176             _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
1177         }
1178     }
1179     if (boot_menu) {
1180         _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", boot_menu)));
1181     }
1182     _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
1183     _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
1184     _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));
1185 
1186     if (cb && bootlist) {
1187         int i;
1188 
1189         for (i = 0; i < cb; i++) {
1190             if (bootlist[i] == '\n') {
1191                 bootlist[i] = ' ';
1192             }
1193         }
1194         _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
1195     }
1196 
1197     if (boot_device && strlen(boot_device)) {
1198         _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
1199     }
1200 
1201     if (!spapr->has_graphics && stdout_path) {
1202         /*
1203          * The "linux,stdout-path" and "stdout" properties are deprecated by
1204          * the Linux kernel. New platforms should only use the "stdout-path"
1205          * property. Set the new property and continue setting the older one to
1206          * remain compatible with existing firmware.
1207          */
1208         _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
1209         _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
1210     }
1211 
1212     spapr_dt_ov5_platform_support(spapr, fdt, chosen);
1213 
1214     g_free(stdout_path);
1215     g_free(bootlist);
1216 }
1217 
1218 static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
1219 {
1220     /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
1221      * KVM to work under pHyp with some guest co-operation */
1222     int hypervisor;
1223     uint8_t hypercall[16];
1224 
1225     _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
1226     /* indicate KVM hypercall interface */
1227     _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
1228     if (kvmppc_has_cap_fixup_hcalls()) {
1229         /*
1230          * Older KVM versions with older guest kernels were broken
1231          * with the magic page, so don't allow the guest to map it.
1232          */
1233         if (!kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
1234                                   sizeof(hypercall))) {
1235             _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
1236                              hypercall, sizeof(hypercall)));
1237         }
1238     }
1239 }
1240 
1241 static void *spapr_build_fdt(SpaprMachineState *spapr)
1242 {
1243     MachineState *machine = MACHINE(spapr);
1244     MachineClass *mc = MACHINE_GET_CLASS(machine);
1245     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
1246     int ret;
1247     void *fdt;
1248     SpaprPhbState *phb;
1249     char *buf;
1250 
1251     fdt = g_malloc0(FDT_MAX_SIZE);
1252     _FDT((fdt_create_empty_tree(fdt, FDT_MAX_SIZE)));
1253 
1254     /* Root node */
1255     _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
1256     _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
1257     _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));
1258 
1259     /* Guest UUID & Name */
1260     buf = qemu_uuid_unparse_strdup(&qemu_uuid);
1261     _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
1262     if (qemu_uuid_set) {
1263         _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
1264     }
1265     g_free(buf);
1266 
1267     if (qemu_get_vm_name()) {
1268         _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
1269                                 qemu_get_vm_name()));
1270     }
1271 
1272     /* Host Model & Serial Number */
1273     if (spapr->host_model) {
1274         _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
1275     } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
1276         _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
1277         g_free(buf);
1278     }
1279 
1280     if (spapr->host_serial) {
1281         _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
1282     } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
1283         _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
1284         g_free(buf);
1285     }
1286 
1287     _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
1288     _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));
1289 
1290     /* /interrupt controller */
1291     spapr->irq->dt_populate(spapr, spapr_max_server_number(spapr), fdt,
1292                           PHANDLE_INTC);
1293 
1294     ret = spapr_populate_memory(spapr, fdt);
1295     if (ret < 0) {
1296         error_report("couldn't set up memory nodes in fdt");
1297         exit(1);
1298     }
1299 
1300     /* /vdevice */
1301     spapr_dt_vdevice(spapr->vio_bus, fdt);
1302 
1303     if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
1304         ret = spapr_rng_populate_dt(fdt);
1305         if (ret < 0) {
1306             error_report("could not set up rng device in the fdt");
1307             exit(1);
1308         }
1309     }
1310 
1311     QLIST_FOREACH(phb, &spapr->phbs, list) {
1312         ret = spapr_populate_pci_dt(phb, PHANDLE_INTC, fdt,
1313                                     spapr->irq->nr_msis, NULL);
1314         if (ret < 0) {
1315             error_report("couldn't set up PCI devices in fdt");
1316             exit(1);
1317         }
1318     }
1319 
1320     /* cpus */
1321     spapr_populate_cpus_dt_node(fdt, spapr);
1322 
1323     if (smc->dr_lmb_enabled) {
1324         _FDT(spapr_drc_populate_dt(fdt, 0, NULL, SPAPR_DR_CONNECTOR_TYPE_LMB));
1325     }
1326 
1327     if (mc->has_hotpluggable_cpus) {
1328         int offset = fdt_path_offset(fdt, "/cpus");
1329         ret = spapr_drc_populate_dt(fdt, offset, NULL,
1330                                     SPAPR_DR_CONNECTOR_TYPE_CPU);
1331         if (ret < 0) {
1332             error_report("Couldn't set up CPU DR device tree properties");
1333             exit(1);
1334         }
1335     }
1336 
1337     /* /event-sources */
1338     spapr_dt_events(spapr, fdt);
1339 
1340     /* /rtas */
1341     spapr_dt_rtas(spapr, fdt);
1342 
1343     /* /chosen */
1344     spapr_dt_chosen(spapr, fdt);
1345 
1346     /* /hypervisor */
1347     if (kvm_enabled()) {
1348         spapr_dt_hypervisor(spapr, fdt);
1349     }
1350 
1351     /* Build memory reserve map */
1352     if (spapr->kernel_size) {
1353         _FDT((fdt_add_mem_rsv(fdt, KERNEL_LOAD_ADDR, spapr->kernel_size)));
1354     }
1355     if (spapr->initrd_size) {
1356         _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base, spapr->initrd_size)));
1357     }
1358 
1359     /* ibm,client-architecture-support updates */
1360     ret = spapr_dt_cas_updates(spapr, fdt, spapr->ov5_cas);
1361     if (ret < 0) {
1362         error_report("couldn't set up CAS properties in fdt");
1363         exit(1);
1364     }
1365 
1366     if (smc->dr_phb_enabled) {
1367         ret = spapr_drc_populate_dt(fdt, 0, NULL, SPAPR_DR_CONNECTOR_TYPE_PHB);
1368         if (ret < 0) {
1369             error_report("Couldn't set up PHB DR device tree properties");
1370             exit(1);
1371         }
1372     }
1373 
1374     return fdt;
1375 }
1376 
1377 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
1378 {
1379     return (addr & 0x0fffffff) + KERNEL_LOAD_ADDR;
1380 }
1381 
1382 static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
1383                                     PowerPCCPU *cpu)
1384 {
1385     CPUPPCState *env = &cpu->env;
1386 
1387     /* The TCG path should also be holding the BQL at this point */
1388     g_assert(qemu_mutex_iothread_locked());
1389 
1390     if (msr_pr) {
1391         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
1392         env->gpr[3] = H_PRIVILEGE;
1393     } else {
1394         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
1395     }
1396 }
1397 
1398 struct LPCRSyncState {
1399     target_ulong value;
1400     target_ulong mask;
1401 };
1402 
1403 static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
1404 {
1405     struct LPCRSyncState *s = arg.host_ptr;
1406     PowerPCCPU *cpu = POWERPC_CPU(cs);
1407     CPUPPCState *env = &cpu->env;
1408     target_ulong lpcr;
1409 
1410     cpu_synchronize_state(cs);
1411     lpcr = env->spr[SPR_LPCR];
1412     lpcr &= ~s->mask;
1413     lpcr |= s->value;
1414     ppc_store_lpcr(cpu, lpcr);
1415 }
1416 
1417 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
1418 {
1419     CPUState *cs;
1420     struct LPCRSyncState s = {
1421         .value = value,
1422         .mask = mask
1423     };
1424     CPU_FOREACH(cs) {
1425         run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
1426     }
1427 }
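
/*
 * A typical use (sketch): flip a guest-wide LPCR bit on every vCPU, e.g.
 * spapr_set_all_lpcrs(LPCR_GTSE, LPCR_GTSE) to set LPCR[GTSE] or
 * spapr_set_all_lpcrs(0, LPCR_GTSE) to clear it; run_on_cpu() ensures
 * each update is applied in the target vCPU's own context.
 */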
1428 
1429 static void spapr_get_pate(PPCVirtualHypervisor *vhyp, ppc_v3_pate_t *entry)
1430 {
1431     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1432 
1433     /* Copy PATE1:GR into PATE0:HR */
1434     entry->dw0 = spapr->patb_entry & PATE0_HR;
1435     entry->dw1 = spapr->patb_entry;
1436 }
1437 
1438 #define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
1439 #define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
1440 #define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
1441 #define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
1442 #define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))
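
/*
 * For illustration: each HPTE is two 64-bit words (HASH_PTE_SIZE_64 = 16
 * bytes), so HPTE(table, i) addresses the i-th entry -- HPTE(spapr->htab,
 * 3) points 48 bytes into the table -- and HPTE_VALID()/HPTE_DIRTY() test
 * bits of the first word after fixing its endianness with tswap64().
 */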
1443 
1444 /*
1445  * Get the fd to access the kernel htab, re-opening it if necessary
1446  */
1447 static int get_htab_fd(SpaprMachineState *spapr)
1448 {
1449     Error *local_err = NULL;
1450 
1451     if (spapr->htab_fd >= 0) {
1452         return spapr->htab_fd;
1453     }
1454 
1455     spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
1456     if (spapr->htab_fd < 0) {
1457         error_report_err(local_err);
1458     }
1459 
1460     return spapr->htab_fd;
1461 }
1462 
1463 void close_htab_fd(SpaprMachineState *spapr)
1464 {
1465     if (spapr->htab_fd >= 0) {
1466         close(spapr->htab_fd);
1467     }
1468     spapr->htab_fd = -1;
1469 }
1470 
1471 static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
1472 {
1473     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1474 
1475     return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
1476 }
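
/*
 * A worked example: with the minimum htab_shift of 18, HTAB_SIZE is
 * 256KiB; at HASH_PTEG_SIZE_64 (128) bytes per PTEG that is 2048 PTEGs,
 * so the returned hash mask is 0x7ff.
 */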
1477 
1478 static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
1479 {
1480     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1481 
1482     assert(kvm_enabled());
1483 
1484     if (!spapr->htab) {
1485         return 0;
1486     }
1487 
1488     return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
1489 }
1490 
1491 static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
1492                                                 hwaddr ptex, int n)
1493 {
1494     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1495     hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
1496 
1497     if (!spapr->htab) {
1498         /*
1499          * HTAB is controlled by KVM. Fetch into a temporary buffer.
1500          */
1501         ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
1502         kvmppc_read_hptes(hptes, ptex, n);
1503         return hptes;
1504     }
1505 
1506     /*
1507      * HTAB is controlled by QEMU. Just point to the internally
1508      * accessible PTEG.
1509      */
1510     return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
1511 }
1512 
1513 static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
1514                               const ppc_hash_pte64_t *hptes,
1515                               hwaddr ptex, int n)
1516 {
1517     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1518 
1519     if (!spapr->htab) {
1520         g_free((void *)hptes);
1521     }
1522 
1523     /* Nothing to do for a QEMU-managed HPT */
1524 }
1525 
1526 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
1527                       uint64_t pte0, uint64_t pte1)
1528 {
1529     SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
1530     hwaddr offset = ptex * HASH_PTE_SIZE_64;
1531 
1532     if (!spapr->htab) {
1533         kvmppc_write_hpte(ptex, pte0, pte1);
1534     } else {
1535         if (pte0 & HPTE64_V_VALID) {
1536             stq_p(spapr->htab + offset + HASH_PTE_SIZE_64 / 2, pte1);
1537             /*
1538              * When setting valid, we write PTE1 first. This ensures
1539              * proper synchronization with the reading code in
1540              * ppc_hash64_pteg_search()
1541              */
1542             smp_wmb();
1543             stq_p(spapr->htab + offset, pte0);
1544         } else {
1545             stq_p(spapr->htab + offset, pte0);
1546             /*
1547              * When clearing it we set PTE0 first. This ensures proper
1548              * synchronization with the reading code in
1549              * ppc_hash64_pteg_search()
1550              */
1551             smp_wmb();
1552             stq_p(spapr->htab + offset + HASH_PTE_SIZE_64 / 2, pte1);
1553         }
1554     }
1555 }
1556 
1557 static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1558                              uint64_t pte1)
1559 {
1560     hwaddr offset = ptex * HASH_PTE_SIZE_64 + 15;
1561     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1562 
1563     if (!spapr->htab) {
1564         /* There should always be a hash table when this is called */
1565         error_report("spapr_hpte_set_c called with no hash table !");
1566         return;
1567     }
1568 
1569     /* The HW performs a non-atomic byte update */
1570     stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
1571 }
1572 
1573 static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1574                              uint64_t pte1)
1575 {
1576     hwaddr offset = ptex * HASH_PTE_SIZE_64 + 14;
1577     SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1578 
1579     if (!spapr->htab) {
1580         /* There should always be a hash table when this is called */
1581         error_report("spapr_hpte_set_r called with no hash table!");
1582         return;
1583     }
1584 
1585     /* The HW performs a non-atomic byte update */
1586     stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
1587 }
1588 
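/*
 * Worked example: with 4 GiB of RAM, pow2ceil() gives 2^32 and shift
 * becomes 32 - 7 = 25, i.e. a 32 MiB hash table (1/128 of RAM), safely
 * inside the order-18..order-46 bounds clamped below.
 */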
1589 int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
1590 {
1591     int shift;
1592 
1593     /* We aim for a hash table of size 1/128 the size of RAM (rounded
1594      * up).  The PAPR recommendation is actually 1/64 of RAM size, but
1595      * that's much more than is needed for Linux guests */
1596     shift = ctz64(pow2ceil(ramsize)) - 7;
1597     shift = MAX(shift, 18); /* Minimum architected size */
1598     shift = MIN(shift, 46); /* Maximum architected size */
1599     return shift;
1600 }
1601 
1602 void spapr_free_hpt(SpaprMachineState *spapr)
1603 {
1604     g_free(spapr->htab);
1605     spapr->htab = NULL;
1606     spapr->htab_shift = 0;
1607     close_htab_fd(spapr);
1608 }
1609 
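/*
 * The branches below rely on the kvmppc_reset_htab() return convention:
 * rc < 0 means a kernel-side HPT was required but could not be
 * allocated; rc > 0 means the kernel allocated an HPT of order rc
 * (possibly different from the requested order); rc == 0 means no
 * kernel-side HPT is needed and QEMU must allocate one itself.
 */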
1610 void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift,
1611                           Error **errp)
1612 {
1613     long rc;
1614 
1615     /* Clean up any HPT info from a previous boot */
1616     spapr_free_hpt(spapr);
1617 
1618     rc = kvmppc_reset_htab(shift);
1619     if (rc < 0) {
1620         /* kernel-side HPT needed, but couldn't allocate one */
1621         error_setg_errno(errp, errno,
1622                          "Failed to allocate KVM HPT of order %d (try smaller maxmem?)",
1623                          shift);
1624         /* This is almost certainly fatal, but if the caller really
1625          * wants to carry on with shift == 0, it's welcome to try */
1626     } else if (rc > 0) {
1627         /* kernel-side HPT allocated */
1628         if (rc != shift) {
1629             error_setg(errp,
1630                        "Requested order %d HPT, but kernel allocated order %ld (try smaller maxmem?)",
1631                        shift, rc);
1632         }
1633 
1634         spapr->htab_shift = shift;
1635         spapr->htab = NULL;
1636     } else {
1637         /* kernel-side HPT not needed, allocate in userspace instead */
1638         size_t size = 1ULL << shift;
1639         int i;
1640 
1641         spapr->htab = qemu_memalign(size, size);
1642         if (!spapr->htab) {
1643             error_setg_errno(errp, errno,
1644                              "Could not allocate HPT of order %d", shift);
1645             return;
1646         }
1647 
1648         memset(spapr->htab, 0, size);
1649         spapr->htab_shift = shift;
1650 
1651         for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
1652             DIRTY_HPTE(HPTE(spapr->htab, i));
1653         }
1654     }
1655     /* We're setting up a hash table, so that means we're not radix */
1656     spapr->patb_entry = 0;
1657     spapr_set_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
1658 }
1659 
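/*
 * HPT sizing policy: when the guest cannot resize the HPT (the feature
 * is disabled, or this is a CAS reboot and resizing was not negotiated)
 * it is sized up front for maxram_size, so that hotplugged memory can
 * still be mapped.  Otherwise it is sized for the memory currently
 * present and the guest is expected to grow it through the HPT-resize
 * hypercalls as memory is added.
 */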
1660 void spapr_setup_hpt_and_vrma(SpaprMachineState *spapr)
1661 {
1662     int hpt_shift;
1663 
1664     if ((spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED)
1665         || (spapr->cas_reboot
1666             && !spapr_ovec_test(spapr->ov5_cas, OV5_HPT_RESIZE))) {
1667         hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
1668     } else {
1669         uint64_t current_ram_size;
1670 
1671         current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
1672         hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
1673     }
1674     spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
1675 
1676     if (spapr->vrma_adjust) {
1677         spapr->rma_size = kvmppc_rma_size(spapr_node0_size(MACHINE(spapr)),
1678                                           spapr->htab_shift);
1679     }
1680 }
1681 
1682 static int spapr_reset_drcs(Object *child, void *opaque)
1683 {
1684     SpaprDrc *drc =
1685         (SpaprDrc *) object_dynamic_cast(child,
1686                                          TYPE_SPAPR_DR_CONNECTOR);
1687 
1688     if (drc) {
1689         spapr_drc_reset(drc);
1690     }
1691 
1692     return 0;
1693 }
1694 
1695 static void spapr_machine_reset(void)
1696 {
1697     MachineState *machine = MACHINE(qdev_get_machine());
1698     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
1699     PowerPCCPU *first_ppc_cpu;
1700     uint32_t rtas_limit;
1701     hwaddr rtas_addr, fdt_addr;
1702     void *fdt;
1703     int rc;
1704 
1705     spapr_caps_apply(spapr);
1706 
1707     first_ppc_cpu = POWERPC_CPU(first_cpu);
1708     if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
1709         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
1710                               spapr->max_compat_pvr)) {
1711         /*
1712          * If using KVM with radix mode available, VCPUs can be started
1713          * without a HPT because KVM will start them in radix mode.
1714          * Set the GR bit in PATE so that we know there is no HPT.
1715          */
1716         spapr->patb_entry = PATE1_GR;
1717         spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
1718     } else {
1719         spapr_setup_hpt_and_vrma(spapr);
1720     }
1721 
1722     /*
1723      * If this reset wasn't generated by CAS, we should reset our
1724      * negotiated options and start from scratch
1725      */
1726     if (!spapr->cas_reboot) {
1727         spapr_ovec_cleanup(spapr->ov5_cas);
1728         spapr->ov5_cas = spapr_ovec_new();
1729 
1730         ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
1731     }
1732 
1733     if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
1734         spapr_irq_msi_reset(spapr);
1735     }
1736 
1737     /*
1738      * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
1739      * We assign a new NUMA ID per GPU in spapr_pci_collect_nvgpu() which is
1740      * called from the vPHB reset handler, so we initialize the counter here.
1741      * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
1742      * must be equally distant from any other node.
1743      * The final value of spapr->gpu_numa_id is going to be written to
1744      * max-associativity-domains in spapr_build_fdt().
1745      */
1746     spapr->gpu_numa_id = MAX(1, nb_numa_nodes);
1747     qemu_devices_reset();
1748 
1749     /*
1750      * This fixes up some of the default configuration of the XIVE
1751      * devices. To be called after the reset of the machine devices.
1752      */
1753     spapr_irq_reset(spapr, &error_fatal);
1754 
1755     /*
1756      * There is no CAS under qtest. Simulate one to please the code that
1757      * depends on spapr->ov5_cas. This is especially needed to test device
1758      * unplug, so we do that before resetting the DRCs.
1759      */
1760     if (qtest_enabled()) {
1761         spapr_ovec_cleanup(spapr->ov5_cas);
1762         spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
1763     }
1764 
1765     /* DRC reset may cause a device to be unplugged. This will cause trouble
1766      * if this device is used by another device (e.g., a running vhost backend
1767      * will crash QEMU if the DIMM holding the vring goes away). To avoid such
1768      * situations, we reset DRCs after all devices have been reset.
1769      */
1770     object_child_foreach_recursive(object_get_root(), spapr_reset_drcs, NULL);
1771 
1772     spapr_clear_pending_events(spapr);
1773 
1774     /*
1775      * We place the device tree and RTAS just below either the top of the RMA,
1776      * or just below 2GB, whichever is lower, so that they can be
1777      * processed with 32-bit real mode code if necessary
1778      */
1779     rtas_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR);
1780     rtas_addr = rtas_limit - RTAS_MAX_SIZE;
1781     fdt_addr = rtas_addr - FDT_MAX_SIZE;
1782 
1783     fdt = spapr_build_fdt(spapr);
1784 
1785     spapr_load_rtas(spapr, fdt, rtas_addr);
1786 
1787     rc = fdt_pack(fdt);
1788 
1789     /* Should only fail if we've built a corrupted tree */
1790     assert(rc == 0);
1791 
1792     if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
1793         error_report("FDT too big ! 0x%x bytes (max is 0x%x)",
1794                      fdt_totalsize(fdt), FDT_MAX_SIZE);
1795         exit(1);
1796     }
1797 
1798     /* Load the fdt */
1799     qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
1800     cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
1801     g_free(spapr->fdt_blob);
1802     spapr->fdt_size = fdt_totalsize(fdt);
1803     spapr->fdt_initial_size = spapr->fdt_size;
1804     spapr->fdt_blob = fdt;
1805 
1806     /* Set up the entry state */
1807     spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
1808     first_ppc_cpu->env.gpr[5] = 0;
1809 
1810     spapr->cas_reboot = false;
1811 }
1812 
1813 static void spapr_create_nvram(SpaprMachineState *spapr)
1814 {
1815     DeviceState *dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
1816     DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1817 
1818     if (dinfo) {
1819         qdev_prop_set_drive(dev, "drive", blk_by_legacy_dinfo(dinfo),
1820                             &error_fatal);
1821     }
1822 
1823     qdev_init_nofail(dev);
1824 
1825     spapr->nvram = (struct SpaprNvram *)dev;
1826 }
1827 
1828 static void spapr_rtc_create(SpaprMachineState *spapr)
1829 {
1830     object_initialize_child(OBJECT(spapr), "rtc",
1831                             &spapr->rtc, sizeof(spapr->rtc), TYPE_SPAPR_RTC,
1832                             &error_fatal, NULL);
1833     object_property_set_bool(OBJECT(&spapr->rtc), true, "realized",
1834                               &error_fatal);
1835     object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
1836                               "date", &error_fatal);
1837 }
1838 
1839 /* Returns whether we want to use VGA or not */
1840 static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
1841 {
1842     switch (vga_interface_type) {
1843     case VGA_NONE:
1844         return false;
1845     case VGA_DEVICE:
1846         return true;
1847     case VGA_STD:
1848     case VGA_VIRTIO:
1849     case VGA_CIRRUS:
1850         return pci_vga_init(pci_bus) != NULL;
1851     default:
1852         error_setg(errp,
1853                    "Unsupported VGA mode, only -vga std or -vga virtio is supported");
1854         return false;
1855     }
1856 }
1857 
1858 static int spapr_pre_load(void *opaque)
1859 {
1860     int rc;
1861 
1862     rc = spapr_caps_pre_load(opaque);
1863     if (rc) {
1864         return rc;
1865     }
1866 
1867     return 0;
1868 }
1869 
1870 static int spapr_post_load(void *opaque, int version_id)
1871 {
1872     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1873     int err = 0;
1874 
1875     err = spapr_caps_post_migration(spapr);
1876     if (err) {
1877         return err;
1878     }
1879 
1880     /*
1881      * In earlier versions, there was no separate qdev for the PAPR
1882      * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1883      * So when migrating from those versions, poke the incoming offset
1884      * value into the RTC device
1885      */
1886     if (version_id < 3) {
1887         err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
1888         if (err) {
1889             return err;
1890         }
1891     }
1892 
1893     if (kvm_enabled() && spapr->patb_entry) {
1894         PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
1895         bool radix = !!(spapr->patb_entry & PATE1_GR);
1896         bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
1897 
1898         /*
1899          * Update LPCR:HR and UPRT as they may not be set properly in
1900          * the stream
1901          */
1902         spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
1903                             LPCR_HR | LPCR_UPRT);
1904 
1905         err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
1906         if (err) {
1907             error_report("Process table config unsupported by the host");
1908             return -EINVAL;
1909         }
1910     }
1911 
1912     err = spapr_irq_post_load(spapr, version_id);
1913     if (err) {
1914         return err;
1915     }
1916 
1917     return err;
1918 }
1919 
1920 static int spapr_pre_save(void *opaque)
1921 {
1922     int rc;
1923 
1924     rc = spapr_caps_pre_save(opaque);
1925     if (rc) {
1926         return rc;
1927     }
1928 
1929     return 0;
1930 }
1931 
1932 static bool version_before_3(void *opaque, int version_id)
1933 {
1934     return version_id < 3;
1935 }
1936 
1937 static bool spapr_pending_events_needed(void *opaque)
1938 {
1939     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1940     return !QTAILQ_EMPTY(&spapr->pending_events);
1941 }
1942 
1943 static const VMStateDescription vmstate_spapr_event_entry = {
1944     .name = "spapr_event_log_entry",
1945     .version_id = 1,
1946     .minimum_version_id = 1,
1947     .fields = (VMStateField[]) {
1948         VMSTATE_UINT32(summary, SpaprEventLogEntry),
1949         VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
1950         VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
1951                                      NULL, extended_length),
1952         VMSTATE_END_OF_LIST()
1953     },
1954 };
1955 
1956 static const VMStateDescription vmstate_spapr_pending_events = {
1957     .name = "spapr_pending_events",
1958     .version_id = 1,
1959     .minimum_version_id = 1,
1960     .needed = spapr_pending_events_needed,
1961     .fields = (VMStateField[]) {
1962         VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
1963                          vmstate_spapr_event_entry, SpaprEventLogEntry, next),
1964         VMSTATE_END_OF_LIST()
1965     },
1966 };
1967 
1968 static bool spapr_ov5_cas_needed(void *opaque)
1969 {
1970     SpaprMachineState *spapr = opaque;
1971     SpaprOptionVector *ov5_mask = spapr_ovec_new();
1972     SpaprOptionVector *ov5_legacy = spapr_ovec_new();
1973     SpaprOptionVector *ov5_removed = spapr_ovec_new();
1974     bool cas_needed;
1975 
1976     /* Prior to the introduction of SpaprOptionVector, we had two option
1977      * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
1978      * Both of these options encode machine topology into the device-tree
1979      * in such a way that the now-booted OS should still be able to interact
1980      * appropriately with QEMU regardless of what options were actually
1981      * negotiated on the source side.
1982      *
1983      * As such, we can avoid migrating the CAS-negotiated options if these
1984      * are the only options available on the current machine/platform.
1985      * Since these are the only options available for pseries-2.7 and
1986      * earlier, this allows us to maintain old->new/new->old migration
1987      * compatibility.
1988      *
1989      * For QEMU 2.8+, there are additional CAS-negotiable options available
1990      * via default pseries-2.8 machines and explicit command-line parameters.
1991      * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
1992      * of the actual CAS-negotiated values to continue working properly. For
1993      * example, availability of memory unplug depends on knowing whether
1994      * OV5_HP_EVT was negotiated via CAS.
1995      *
1996      * Thus, for any cases where the set of available CAS-negotiable
1997      * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
1998      * include the CAS-negotiated options in the migration stream, unless
1999      * they affect boot time behaviour only.
2000      */
2001     spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
2002     spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
2003     spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
2004 
2005     /* spapr_ovec_diff returns true if bits were removed. We avoid using
2006      * the mask itself since in the future it's possible "legacy" bits may be
2007      * removed via machine options, which could generate a false positive
2008      * that breaks migration.
2009      */
2010     spapr_ovec_intersect(ov5_legacy, spapr->ov5, ov5_mask);
2011     cas_needed = spapr_ovec_diff(ov5_removed, spapr->ov5, ov5_legacy);
2012 
2013     spapr_ovec_cleanup(ov5_mask);
2014     spapr_ovec_cleanup(ov5_legacy);
2015     spapr_ovec_cleanup(ov5_removed);
2016 
2017     return cas_needed;
2018 }
2019 
2020 static const VMStateDescription vmstate_spapr_ov5_cas = {
2021     .name = "spapr_option_vector_ov5_cas",
2022     .version_id = 1,
2023     .minimum_version_id = 1,
2024     .needed = spapr_ov5_cas_needed,
2025     .fields = (VMStateField[]) {
2026         VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
2027                                  vmstate_spapr_ovec, SpaprOptionVector),
2028         VMSTATE_END_OF_LIST()
2029     },
2030 };
2031 
2032 static bool spapr_patb_entry_needed(void *opaque)
2033 {
2034     SpaprMachineState *spapr = opaque;
2035 
2036     return !!spapr->patb_entry;
2037 }
2038 
2039 static const VMStateDescription vmstate_spapr_patb_entry = {
2040     .name = "spapr_patb_entry",
2041     .version_id = 1,
2042     .minimum_version_id = 1,
2043     .needed = spapr_patb_entry_needed,
2044     .fields = (VMStateField[]) {
2045         VMSTATE_UINT64(patb_entry, SpaprMachineState),
2046         VMSTATE_END_OF_LIST()
2047     },
2048 };
2049 
2050 static bool spapr_irq_map_needed(void *opaque)
2051 {
2052     SpaprMachineState *spapr = opaque;
2053 
2054     return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
2055 }
2056 
2057 static const VMStateDescription vmstate_spapr_irq_map = {
2058     .name = "spapr_irq_map",
2059     .version_id = 1,
2060     .minimum_version_id = 1,
2061     .needed = spapr_irq_map_needed,
2062     .fields = (VMStateField[]) {
2063         VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
2064         VMSTATE_END_OF_LIST()
2065     },
2066 };
2067 
2068 static bool spapr_dtb_needed(void *opaque)
2069 {
2070     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
2071 
2072     return smc->update_dt_enabled;
2073 }
2074 
2075 static int spapr_dtb_pre_load(void *opaque)
2076 {
2077     SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2078 
2079     g_free(spapr->fdt_blob);
2080     spapr->fdt_blob = NULL;
2081     spapr->fdt_size = 0;
2082 
2083     return 0;
2084 }
2085 
2086 static const VMStateDescription vmstate_spapr_dtb = {
2087     .name = "spapr_dtb",
2088     .version_id = 1,
2089     .minimum_version_id = 1,
2090     .needed = spapr_dtb_needed,
2091     .pre_load = spapr_dtb_pre_load,
2092     .fields = (VMStateField[]) {
2093         VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
2094         VMSTATE_UINT32(fdt_size, SpaprMachineState),
2095         VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
2096                                      fdt_size),
2097         VMSTATE_END_OF_LIST()
2098     },
2099 };
2100 
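/*
 * Each subsection listed below is only put on the wire when its
 * .needed callback returns true, which is what keeps this optional
 * state migration-compatible with older machine versions.
 */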
2101 static const VMStateDescription vmstate_spapr = {
2102     .name = "spapr",
2103     .version_id = 3,
2104     .minimum_version_id = 1,
2105     .pre_load = spapr_pre_load,
2106     .post_load = spapr_post_load,
2107     .pre_save = spapr_pre_save,
2108     .fields = (VMStateField[]) {
2109         /* used to be @next_irq */
2110         VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),
2111 
2112         /* RTC offset */
2113         VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),
2114 
2115         VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
2116         VMSTATE_END_OF_LIST()
2117     },
2118     .subsections = (const VMStateDescription*[]) {
2119         &vmstate_spapr_ov5_cas,
2120         &vmstate_spapr_patb_entry,
2121         &vmstate_spapr_pending_events,
2122         &vmstate_spapr_cap_htm,
2123         &vmstate_spapr_cap_vsx,
2124         &vmstate_spapr_cap_dfp,
2125         &vmstate_spapr_cap_cfpc,
2126         &vmstate_spapr_cap_sbbc,
2127         &vmstate_spapr_cap_ibs,
2128         &vmstate_spapr_cap_hpt_maxpagesize,
2129         &vmstate_spapr_irq_map,
2130         &vmstate_spapr_cap_nested_kvm_hv,
2131         &vmstate_spapr_dtb,
2132         &vmstate_spapr_cap_large_decr,
2133         &vmstate_spapr_cap_ccf_assist,
2134         NULL
2135     }
2136 };
2137 
2138 static int htab_save_setup(QEMUFile *f, void *opaque)
2139 {
2140     SpaprMachineState *spapr = opaque;
2141 
2142     /* "Iteration" header */
2143     if (!spapr->htab_shift) {
2144         qemu_put_be32(f, -1);
2145     } else {
2146         qemu_put_be32(f, spapr->htab_shift);
2147     }
2148 
2149     if (spapr->htab) {
2150         spapr->htab_save_index = 0;
2151         spapr->htab_first_pass = true;
2152     } else {
2153         if (spapr->htab_shift) {
2154             assert(kvm_enabled());
2155         }
2156     }
2157 
2159     return 0;
2160 }
2161 
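/*
 * On-the-wire layout of the HPT stream produced below: a be32 header
 * per iteration (the htab order on setup, 0 afterwards, or -1 when
 * there is no HPT), followed by chunks of the form
 *
 *     be32 index | be16 n_valid | be16 n_invalid | n_valid raw HPTEs
 *
 * and terminated by an all-zero chunk header (htab_save_end_marker()).
 * htab_load() parses exactly this format.
 */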
2162 static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
2163                             int chunkstart, int n_valid, int n_invalid)
2164 {
2165     qemu_put_be32(f, chunkstart);
2166     qemu_put_be16(f, n_valid);
2167     qemu_put_be16(f, n_invalid);
2168     qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
2169                     HASH_PTE_SIZE_64 * n_valid);
2170 }
2171 
2172 static void htab_save_end_marker(QEMUFile *f)
2173 {
2174     qemu_put_be32(f, 0);
2175     qemu_put_be16(f, 0);
2176     qemu_put_be16(f, 0);
2177 }
2178 
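/*
 * Saving works in two phases: the first pass walks the entire table
 * once, sending every valid HPTE and clearing dirty bits as it goes;
 * later passes resend only entries dirtied since they were last sent,
 * including newly-invalidated ones so the destination can clear them.
 */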
2179 static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
2180                                  int64_t max_ns)
2181 {
2182     bool has_timeout = max_ns != -1;
2183     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2184     int index = spapr->htab_save_index;
2185     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2186 
2187     assert(spapr->htab_first_pass);
2188 
2189     do {
2190         int chunkstart;
2191 
2192         /* Consume invalid HPTEs */
2193         while ((index < htabslots)
2194                && !HPTE_VALID(HPTE(spapr->htab, index))) {
2195             CLEAN_HPTE(HPTE(spapr->htab, index));
2196             index++;
2197         }
2198 
2199         /* Consume valid HPTEs */
2200         chunkstart = index;
2201         while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2202                && HPTE_VALID(HPTE(spapr->htab, index))) {
2203             CLEAN_HPTE(HPTE(spapr->htab, index));
2204             index++;
2205         }
2206 
2207         if (index > chunkstart) {
2208             int n_valid = index - chunkstart;
2209 
2210             htab_save_chunk(f, spapr, chunkstart, n_valid, 0);
2211 
2212             if (has_timeout &&
2213                 (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2214                 break;
2215             }
2216         }
2217     } while ((index < htabslots) && !qemu_file_rate_limit(f));
2218 
2219     if (index >= htabslots) {
2220         assert(index == htabslots);
2221         index = 0;
2222         spapr->htab_first_pass = false;
2223     }
2224     spapr->htab_save_index = index;
2225 }
2226 
2227 static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
2228                                 int64_t max_ns)
2229 {
2230     bool final = max_ns < 0;
2231     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2232     int examined = 0, sent = 0;
2233     int index = spapr->htab_save_index;
2234     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2235 
2236     assert(!spapr->htab_first_pass);
2237 
2238     do {
2239         int chunkstart, invalidstart;
2240 
2241         /* Consume non-dirty HPTEs */
2242         while ((index < htabslots)
2243                && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
2244             index++;
2245             examined++;
2246         }
2247 
2248         chunkstart = index;
2249         /* Consume valid dirty HPTEs */
2250         while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2251                && HPTE_DIRTY(HPTE(spapr->htab, index))
2252                && HPTE_VALID(HPTE(spapr->htab, index))) {
2253             CLEAN_HPTE(HPTE(spapr->htab, index));
2254             index++;
2255             examined++;
2256         }
2257 
2258         invalidstart = index;
2259         /* Consume invalid dirty HPTEs */
2260         while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
2261                && HPTE_DIRTY(HPTE(spapr->htab, index))
2262                && !HPTE_VALID(HPTE(spapr->htab, index))) {
2263             CLEAN_HPTE(HPTE(spapr->htab, index));
2264             index++;
2265             examined++;
2266         }
2267 
2268         if (index > chunkstart) {
2269             int n_valid = invalidstart - chunkstart;
2270             int n_invalid = index - invalidstart;
2271 
2272             htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
2273             sent += index - chunkstart;
2274 
2275             if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2276                 break;
2277             }
2278         }
2279 
2280         if (examined >= htabslots) {
2281             break;
2282         }
2283 
2284         if (index >= htabslots) {
2285             assert(index == htabslots);
2286             index = 0;
2287         }
2288     } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
2289 
2290     if (index >= htabslots) {
2291         assert(index == htabslots);
2292         index = 0;
2293     }
2294 
2295     spapr->htab_save_index = index;
2296 
2297     return (examined >= htabslots) && (sent == 0) ? 1 : 0;
2298 }
2299 
2300 #define MAX_ITERATION_NS    5000000 /* 5 ms */
2301 #define MAX_KVM_BUF_SIZE    2048
2302 
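/*
 * .save_live_iterate callback: returns 1 once everything has been
 * transmitted (or there is no HPT at all), 0 while more passes are
 * still needed, and a negative errno on failure.
 */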
2303 static int htab_save_iterate(QEMUFile *f, void *opaque)
2304 {
2305     SpaprMachineState *spapr = opaque;
2306     int fd;
2307     int rc = 0;
2308 
2309     /* Iteration header */
2310     if (!spapr->htab_shift) {
2311         qemu_put_be32(f, -1);
2312         return 1;
2313     } else {
2314         qemu_put_be32(f, 0);
2315     }
2316 
2317     if (!spapr->htab) {
2318         assert(kvm_enabled());
2319 
2320         fd = get_htab_fd(spapr);
2321         if (fd < 0) {
2322             return fd;
2323         }
2324 
2325         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
2326         if (rc < 0) {
2327             return rc;
2328         }
2329     } else if (spapr->htab_first_pass) {
2330         htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
2331     } else {
2332         rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
2333     }
2334 
2335     htab_save_end_marker(f);
2336 
2337     return rc;
2338 }
2339 
2340 static int htab_save_complete(QEMUFile *f, void *opaque)
2341 {
2342     SpaprMachineState *spapr = opaque;
2343     int fd;
2344 
2345     /* Iteration header */
2346     if (!spapr->htab_shift) {
2347         qemu_put_be32(f, -1);
2348         return 0;
2349     } else {
2350         qemu_put_be32(f, 0);
2351     }
2352 
2353     if (!spapr->htab) {
2354         int rc;
2355 
2356         assert(kvm_enabled());
2357 
2358         fd = get_htab_fd(spapr);
2359         if (fd < 0) {
2360             return fd;
2361         }
2362 
2363         rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
2364         if (rc < 0) {
2365             return rc;
2366         }
2367     } else {
2368         if (spapr->htab_first_pass) {
2369             htab_save_first_pass(f, spapr, -1);
2370         }
2371         htab_save_later_pass(f, spapr, -1);
2372     }
2373 
2374     /* End marker */
2375     htab_save_end_marker(f);
2376 
2377     return 0;
2378 }
2379 
2380 static int htab_load(QEMUFile *f, void *opaque, int version_id)
2381 {
2382     SpaprMachineState *spapr = opaque;
2383     uint32_t section_hdr;
2384     int fd = -1;
2385     Error *local_err = NULL;
2386 
2387     if (version_id < 1 || version_id > 1) {
2388         error_report("htab_load() bad version");
2389         return -EINVAL;
2390     }
2391 
2392     section_hdr = qemu_get_be32(f);
2393 
2394     if (section_hdr == -1) {
2395         spapr_free_hpt(spapr);
2396         return 0;
2397     }
2398 
2399     if (section_hdr) {
2400         /* First section gives the htab size */
2401         spapr_reallocate_hpt(spapr, section_hdr, &local_err);
2402         if (local_err) {
2403             error_report_err(local_err);
2404             return -EINVAL;
2405         }
2406         return 0;
2407     }
2408 
2409     if (!spapr->htab) {
2410         assert(kvm_enabled());
2411 
2412         fd = kvmppc_get_htab_fd(true, 0, &local_err);
2413         if (fd < 0) {
2414             error_report_err(local_err);
2415             return fd;
2416         }
2417     }
2418 
2419     while (true) {
2420         uint32_t index;
2421         uint16_t n_valid, n_invalid;
2422 
2423         index = qemu_get_be32(f);
2424         n_valid = qemu_get_be16(f);
2425         n_invalid = qemu_get_be16(f);
2426 
2427         if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
2428             /* End of Stream */
2429             break;
2430         }
2431 
2432         if ((index + n_valid + n_invalid) >
2433             (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
2434             /* Bad index in stream */
2435             error_report(
2436                 "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)",
2437                 index, n_valid, n_invalid, spapr->htab_shift);
2438             return -EINVAL;
2439         }
2440 
2441         if (spapr->htab) {
2442             if (n_valid) {
2443                 qemu_get_buffer(f, HPTE(spapr->htab, index),
2444                                 HASH_PTE_SIZE_64 * n_valid);
2445             }
2446             if (n_invalid) {
2447                 memset(HPTE(spapr->htab, index + n_valid), 0,
2448                        HASH_PTE_SIZE_64 * n_invalid);
2449             }
2450         } else {
2451             int rc;
2452 
2453             assert(fd >= 0);
2454 
2455             rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
2456             if (rc < 0) {
2457                 return rc;
2458             }
2459         }
2460     }
2461 
2462     if (!spapr->htab) {
2463         assert(fd >= 0);
2464         close(fd);
2465     }
2466 
2467     return 0;
2468 }
2469 
2470 static void htab_save_cleanup(void *opaque)
2471 {
2472     SpaprMachineState *spapr = opaque;
2473 
2474     close_htab_fd(spapr);
2475 }
2476 
2477 static SaveVMHandlers savevm_htab_handlers = {
2478     .save_setup = htab_save_setup,
2479     .save_live_iterate = htab_save_iterate,
2480     .save_live_complete_precopy = htab_save_complete,
2481     .save_cleanup = htab_save_cleanup,
2482     .load_state = htab_load,
2483 };
2484 
2485 static void spapr_boot_set(void *opaque, const char *boot_device,
2486                            Error **errp)
2487 {
2488     MachineState *machine = MACHINE(opaque);
2489     machine->boot_order = g_strdup(boot_device);
2490 }
2491 
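/*
 * One DRC per 256 MiB LMB of potentially hotpluggable memory.  For
 * example, -m 4G,maxmem=8G yields (8G - 4G) / 256M = 16 LMB connectors;
 * each connector's index is its address divided by the LMB size, with
 * addresses starting at device_memory->base.
 */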
2492 static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
2493 {
2494     MachineState *machine = MACHINE(spapr);
2495     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
2496     uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size) / lmb_size;
2497     int i;
2498 
2499     for (i = 0; i < nr_lmbs; i++) {
2500         uint64_t addr;
2501 
2502         addr = i * lmb_size + machine->device_memory->base;
2503         spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
2504                                addr / lmb_size);
2505     }
2506 }
2507 
2508 /*
2509  * If RAM size, maxmem size and individual node mem sizes aren't aligned
2510  * to SPAPR_MEMORY_BLOCK_SIZE(256MB), then refuse to start the guest
2511  * since we can't support such unaligned sizes with DRCONF_MEMORY.
2512  */
2513 static void spapr_validate_node_memory(MachineState *machine, Error **errp)
2514 {
2515     int i;
2516 
2517     if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2518         error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
2519                    " is not aligned to %" PRIu64 " MiB",
2520                    machine->ram_size,
2521                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2522         return;
2523     }
2524 
2525     if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2526         error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
2527                    " is not aligned to %" PRIu64 " MiB",
2528                    machine->maxram_size,
2529                    SPAPR_MEMORY_BLOCK_SIZE / MiB);
2530         return;
2531     }
2532 
2533     for (i = 0; i < nb_numa_nodes; i++) {
2534         if (numa_info[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
2535             error_setg(errp,
2536                        "Node %d memory size 0x%" PRIx64
2537                        " is not aligned to %" PRIu64 " MiB",
2538                        i, numa_info[i].node_mem,
2539                        SPAPR_MEMORY_BLOCK_SIZE / MiB);
2540             return;
2541         }
2542     }
2543 }
2544 
2545 /* find cpu slot in machine->possible_cpus by core_id */
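/* e.g. with smp_threads == 8, core_id 16 maps to possible_cpus->cpus[2] */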
2546 static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
2547 {
2548     int index = id / smp_threads;
2549 
2550     if (index >= ms->possible_cpus->len) {
2551         return NULL;
2552     }
2553     if (idx) {
2554         *idx = index;
2555     }
2556     return &ms->possible_cpus->cpus[index];
2557 }
2558 
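/*
 * VSMT ("virtual SMT") is, in effect, the stride between the VCPU ids
 * of consecutive cores, which is why it must be at least smp_threads.
 * Where possible KVM is switched to the same SMT mode; failing that, a
 * host mode that covers smp_threads and divides the chosen VSMT is
 * tolerated, at the cost of some unused VCPU ids (see below).
 */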
2559 static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
2560 {
2561     Error *local_err = NULL;
2562     bool vsmt_user = !!spapr->vsmt;
2563     int kvm_smt = kvmppc_smt_threads();
2564     int ret;
2565 
2566     if (!kvm_enabled() && (smp_threads > 1)) {
2567         error_setg(&local_err, "TCG cannot support more than 1 thread/core "
2568                      "on a pseries machine");
2569         goto out;
2570     }
2571     if (!is_power_of_2(smp_threads)) {
2572         error_setg(&local_err, "Cannot support %d threads/core on a pseries "
2573                      "machine because it must be a power of 2", smp_threads);
2574         goto out;
2575     }
2576 
2577     /* Determine the VSMT mode to use: */
2578     if (vsmt_user) {
2579         if (spapr->vsmt < smp_threads) {
2580             error_setg(&local_err, "Cannot support VSMT mode %d"
2581                          " because it must be >= threads/core (%d)",
2582                          spapr->vsmt, smp_threads);
2583             goto out;
2584         }
2585         /* In this case, spapr->vsmt has been set by the command line */
2586     } else {
2587         /*
2588          * Default VSMT value is tricky, because we need it to be as
2589          * consistent as possible (for migration), but this requires
2590          * changing it for at least some existing cases.  We pick 8 as
2591          * the value that we'd get with KVM on POWER8, the
2592          * overwhelmingly common case in production systems.
2593          */
2594         spapr->vsmt = MAX(8, smp_threads);
2595     }
2596 
2597     /* KVM: If necessary, set the SMT mode: */
2598     if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
2599         ret = kvmppc_set_smt_threads(spapr->vsmt);
2600         if (ret) {
2601             /* Looks like KVM isn't able to change VSMT mode */
2602             error_setg(&local_err,
2603                        "Failed to set KVM's VSMT mode to %d (errno %d)",
2604                        spapr->vsmt, ret);
2605             /* We can live with that if the default one is big enough
2606              * for the number of threads, and a submultiple of the one
2607              * we want.  In this case we'll waste some vcpu ids, but
2608              * behaviour will be correct */
2609             if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
2610                 warn_report_err(local_err);
2611                 local_err = NULL;
2612                 goto out;
2613             } else {
2614                 if (!vsmt_user) {
2615                     error_append_hint(&local_err,
2616                                       "On PPC, a VM with %d threads/core"
2617                                       " on a host with %d threads/core"
2618                                       " requires the use of VSMT mode %d.\n",
2619                                       smp_threads, kvm_smt, spapr->vsmt);
2620                 }
2621                 kvmppc_hint_smt_possible(&local_err);
2622                 goto out;
2623             }
2624         }
2625     }
2626     /* else TCG: nothing to do currently */
2627 out:
2628     error_propagate(errp, local_err);
2629 }
2630 
2631 static void spapr_init_cpus(SpaprMachineState *spapr)
2632 {
2633     MachineState *machine = MACHINE(spapr);
2634     MachineClass *mc = MACHINE_GET_CLASS(machine);
2635     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2636     const char *type = spapr_get_cpu_core_type(machine->cpu_type);
2637     const CPUArchIdList *possible_cpus;
2638     int boot_cores_nr = smp_cpus / smp_threads;
2639     int i;
2640 
2641     possible_cpus = mc->possible_cpu_arch_ids(machine);
2642     if (mc->has_hotpluggable_cpus) {
2643         if (smp_cpus % smp_threads) {
2644             error_report("smp_cpus (%u) must be multiple of threads (%u)",
2645                          smp_cpus, smp_threads);
2646             exit(1);
2647         }
2648         if (max_cpus % smp_threads) {
2649             error_report("max_cpus (%u) must be multiple of threads (%u)",
2650                          max_cpus, smp_threads);
2651             exit(1);
2652         }
2653     } else {
2654         if (max_cpus != smp_cpus) {
2655             error_report("This machine version does not support CPU hotplug");
2656             exit(1);
2657         }
2658         boot_cores_nr = possible_cpus->len;
2659     }
2660 
2661     if (smc->pre_2_10_has_unused_icps) {
2662         int i;
2663 
2664         for (i = 0; i < spapr_max_server_number(spapr); i++) {
2665             /* Dummy entries get deregistered when real ICPState objects
2666              * are registered during CPU core hotplug.
2667              */
2668             pre_2_10_vmstate_register_dummy_icp(i);
2669         }
2670     }
2671 
2672     for (i = 0; i < possible_cpus->len; i++) {
2673         int core_id = i * smp_threads;
2674 
2675         if (mc->has_hotpluggable_cpus) {
2676             spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
2677                                    spapr_vcpu_id(spapr, core_id));
2678         }
2679 
2680         if (i < boot_cores_nr) {
2681             Object *core = object_new(type);
2682             int nr_threads = smp_threads;
2683 
2684             /* Handle the partially filled core for older machine types */
2685             if ((i + 1) * smp_threads >= smp_cpus) {
2686                 nr_threads = smp_cpus - i * smp_threads;
2687             }
2688 
2689             object_property_set_int(core, nr_threads, "nr-threads",
2690                                     &error_fatal);
2691             object_property_set_int(core, core_id, CPU_CORE_PROP_CORE_ID,
2692                                     &error_fatal);
2693             object_property_set_bool(core, true, "realized", &error_fatal);
2694 
2695             object_unref(core);
2696         }
2697     }
2698 }
2699 
2700 static PCIHostState *spapr_create_default_phb(void)
2701 {
2702     DeviceState *dev;
2703 
2704     dev = qdev_create(NULL, TYPE_SPAPR_PCI_HOST_BRIDGE);
2705     qdev_prop_set_uint32(dev, "index", 0);
2706     qdev_init_nofail(dev);
2707 
2708     return PCI_HOST_BRIDGE(dev);
2709 }
2710 
2711 /* pSeries LPAR / sPAPR hardware init */
2712 static void spapr_machine_init(MachineState *machine)
2713 {
2714     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
2715     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2716     const char *kernel_filename = machine->kernel_filename;
2717     const char *initrd_filename = machine->initrd_filename;
2718     PCIHostState *phb;
2719     int i;
2720     MemoryRegion *sysmem = get_system_memory();
2721     MemoryRegion *ram = g_new(MemoryRegion, 1);
2722     hwaddr node0_size = spapr_node0_size(machine);
2723     long load_limit, fw_size;
2724     char *filename;
2725     Error *resize_hpt_err = NULL;
2726 
2727     msi_nonbroken = true;
2728 
2729     QLIST_INIT(&spapr->phbs);
2730     QTAILQ_INIT(&spapr->pending_dimm_unplugs);
2731 
2732     /* Determine capabilities to run with */
2733     spapr_caps_init(spapr);
2734 
2735     kvmppc_check_papr_resize_hpt(&resize_hpt_err);
2736     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
2737         /*
2738          * If the user explicitly requested a mode we should either
2739          * supply it, or fail completely (which we do below).  But if
2740          * it's not set explicitly, we reset our mode to something
2741          * that works
2742          */
2743         if (resize_hpt_err) {
2744             spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
2745             error_free(resize_hpt_err);
2746             resize_hpt_err = NULL;
2747         } else {
2748             spapr->resize_hpt = smc->resize_hpt_default;
2749         }
2750     }
2751 
2752     assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);
2753 
2754     if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
2755         /*
2756          * User requested HPT resize, but this host can't supply it.  Bail out
2757          */
2758         error_report_err(resize_hpt_err);
2759         exit(1);
2760     }
2761 
2762     spapr->rma_size = node0_size;
2763 
2764     /* With KVM, we don't actually know whether KVM supports an
2765      * unbounded RMA (PR KVM) or is limited by the hash table size
2766      * (HV KVM using VRMA), so we always assume the latter
2767      *
2768      * In that case, we also limit the initial allocations for RTAS
2769      * etc... to 256M since we have no way to know what the VRMA size
2770      * is going to be as it depends on the size of the hash table
2771      * which isn't determined yet.
2772      */
2773     if (kvm_enabled()) {
2774         spapr->vrma_adjust = 1;
2775         spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
2776     }
2777 
2778     /* Actually we don't support unbounded RMA anymore since we added
2779      * proper emulation of HV mode. The max we can get is 16G which
2780      * also happens to be what we configure for PAPR mode so make sure
2781      * we don't do anything bigger than that
2782      */
2783     spapr->rma_size = MIN(spapr->rma_size, 0x400000000ull);
2784 
2785     if (spapr->rma_size > node0_size) {
2786         error_report("Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")",
2787                      spapr->rma_size);
2788         exit(1);
2789     }
2790 
2791     /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
2792     load_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FW_OVERHEAD;
2793 
2794     /*
2795      * VSMT must be set in order to be able to compute VCPU ids, ie to
2796      * call spapr_max_server_number() or spapr_vcpu_id().
2797      */
2798     spapr_set_vsmt_mode(spapr, &error_fatal);
2799 
2800     /* Set up Interrupt Controller before we create the VCPUs */
2801     spapr_irq_init(spapr, &error_fatal);
2802 
2803     /* Set up containers for ibm,client-architecture-support negotiated
2804      * options */
2805     spapr->ov5 = spapr_ovec_new();
2806     spapr->ov5_cas = spapr_ovec_new();
2807 
2808     if (smc->dr_lmb_enabled) {
2809         spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
2810         spapr_validate_node_memory(machine, &error_fatal);
2811     }
2812 
2813     spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
2814 
2815     /* advertise support for dedicated HP event source to guests */
2816     if (spapr->use_hotplug_event_source) {
2817         spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
2818     }
2819 
2820     /* advertise support for HPT resizing */
2821     if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
2822         spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
2823     }
2824 
2825     /* advertise support for ibm,dynamic-memory-v2 */
2826     spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
2827 
2828     /* advertise XIVE on POWER9 machines */
2829     if (spapr->irq->ov5 & (SPAPR_OV5_XIVE_EXPLOIT | SPAPR_OV5_XIVE_BOTH)) {
2830         spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
2831     }
2832 
2833     /* init CPUs */
2834     spapr_init_cpus(spapr);
2835 
2836     if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
2837         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
2838                               spapr->max_compat_pvr)) {
2839         /* KVM and TCG always allow GTSE with radix... */
2840         spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
2841     }
2842     /* ... but not with hash (currently). */
2843 
2844     if (kvm_enabled()) {
2845         /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
2846         kvmppc_enable_logical_ci_hcalls();
2847         kvmppc_enable_set_mode_hcall();
2848 
2849         /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
2850         kvmppc_enable_clear_ref_mod_hcalls();
2851 
2852         /* Enable H_PAGE_INIT */
2853         kvmppc_enable_h_page_init();
2854     }
2855 
2856     /* allocate RAM */
2857     memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
2858                                          machine->ram_size);
2859     memory_region_add_subregion(sysmem, 0, ram);
2860 
2861     /* always allocate the device memory information */
2862     machine->device_memory = g_malloc0(sizeof(*machine->device_memory));
2863 
2864     /* initialize hotplug memory address space */
2865     if (machine->ram_size < machine->maxram_size) {
2866         ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
2867         /*
2868          * Limit the number of hotpluggable memory slots to half the number
2869          * slots that KVM supports, leaving the other half for PCI and other
2870          * devices. However ensure that number of slots doesn't drop below 32.
2871          */
2872         int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
2873                            SPAPR_MAX_RAM_SLOTS;
2874 
2875         if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
2876             max_memslots = SPAPR_MAX_RAM_SLOTS;
2877         }
2878         if (machine->ram_slots > max_memslots) {
2879             error_report("Specified number of memory slots %"
2880                          PRIu64" exceeds max supported %d",
2881                          machine->ram_slots, max_memslots);
2882             exit(1);
2883         }
2884 
2885         machine->device_memory->base = ROUND_UP(machine->ram_size,
2886                                                 SPAPR_DEVICE_MEM_ALIGN);
2887         memory_region_init(&machine->device_memory->mr, OBJECT(spapr),
2888                            "device-memory", device_mem_size);
2889         memory_region_add_subregion(sysmem, machine->device_memory->base,
2890                                     &machine->device_memory->mr);
2891     }
2892 
2893     if (smc->dr_lmb_enabled) {
2894         spapr_create_lmb_dr_connectors(spapr);
2895     }
2896 
2897     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
2898     if (!filename) {
2899         error_report("Could not find LPAR rtas '%s'", "spapr-rtas.bin");
2900         exit(1);
2901     }
2902     spapr->rtas_size = get_image_size(filename);
2903     if (spapr->rtas_size < 0) {
2904         error_report("Could not get size of LPAR rtas '%s'", filename);
2905         exit(1);
2906     }
2907     spapr->rtas_blob = g_malloc(spapr->rtas_size);
2908     if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
2909         error_report("Could not load LPAR rtas '%s'", filename);
2910         exit(1);
2911     }
2912     if (spapr->rtas_size > RTAS_MAX_SIZE) {
2913         error_report("RTAS too big ! 0x%zx bytes (max is 0x%x)",
2914                      (size_t)spapr->rtas_size, RTAS_MAX_SIZE);
2915         exit(1);
2916     }
2917     g_free(filename);
2918 
2919     /* Set up RTAS event infrastructure */
2920     spapr_events_init(spapr);
2921 
2922     /* Set up the RTC RTAS interfaces */
2923     spapr_rtc_create(spapr);
2924 
2925     /* Set up VIO bus */
2926     spapr->vio_bus = spapr_vio_bus_init();
2927 
2928     for (i = 0; i < serial_max_hds(); i++) {
2929         if (serial_hd(i)) {
2930             spapr_vty_create(spapr->vio_bus, serial_hd(i));
2931         }
2932     }
2933 
2934     /* We always have at least the nvram device on VIO */
2935     spapr_create_nvram(spapr);
2936 
2937     /*
2938      * Set up hotplug / dynamic-reconfiguration connectors.  Top-level
2939      * connectors (described in the root DT node's "ibm,drc-types" property)
2940      * are pre-initialized here.  Additional child connectors (such as the
2941      * connectors for a PHB's PCI slots) are added as needed during their
2942      * parent's realization.
2943      */
2944     if (smc->dr_phb_enabled) {
2945         for (i = 0; i < SPAPR_MAX_PHBS; i++) {
2946             spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
2947         }
2948     }
2949 
2950     /* Set up PCI */
2951     spapr_pci_rtas_init();
2952 
2953     phb = spapr_create_default_phb();
2954 
2955     for (i = 0; i < nb_nics; i++) {
2956         NICInfo *nd = &nd_table[i];
2957 
2958         if (!nd->model) {
2959             nd->model = g_strdup("spapr-vlan");
2960         }
2961 
2962         if (g_str_equal(nd->model, "spapr-vlan") ||
2963             g_str_equal(nd->model, "ibmveth")) {
2964             spapr_vlan_create(spapr->vio_bus, nd);
2965         } else {
2966             pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
2967         }
2968     }
2969 
2970     for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
2971         spapr_vscsi_create(spapr->vio_bus);
2972     }
2973 
2974     /* Graphics */
2975     if (spapr_vga_init(phb->bus, &error_fatal)) {
2976         spapr->has_graphics = true;
2977         machine->usb |= defaults_enabled() && !machine->usb_disabled;
2978     }
2979 
2980     if (machine->usb) {
2981         if (smc->use_ohci_by_default) {
2982             pci_create_simple(phb->bus, -1, "pci-ohci");
2983         } else {
2984             pci_create_simple(phb->bus, -1, "nec-usb-xhci");
2985         }
2986 
2987         if (spapr->has_graphics) {
2988             USBBus *usb_bus = usb_bus_find(-1);
2989 
2990             usb_create_simple(usb_bus, "usb-kbd");
2991             usb_create_simple(usb_bus, "usb-mouse");
2992         }
2993     }
2994 
2995     if (spapr->rma_size < (MIN_RMA_SLOF * MiB)) {
2996         error_report(
2997             "pSeries SLOF firmware requires >= %ldM guest RMA (Real Mode Area memory)",
2998             MIN_RMA_SLOF);
2999         exit(1);
3000     }
3001 
3002     if (kernel_filename) {
3003         uint64_t lowaddr = 0;
3004 
3005         spapr->kernel_size = load_elf(kernel_filename, NULL,
3006                                       translate_kernel_address, NULL,
3007                                       NULL, &lowaddr, NULL, 1,
3008                                       PPC_ELF_MACHINE, 0, 0);
3009         if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
3010             spapr->kernel_size = load_elf(kernel_filename, NULL,
3011                                           translate_kernel_address, NULL, NULL,
3012                                           &lowaddr, NULL, 0, PPC_ELF_MACHINE,
3013                                           0, 0);
3014             spapr->kernel_le = spapr->kernel_size > 0;
3015         }
3016         if (spapr->kernel_size < 0) {
3017             error_report("error loading %s: %s", kernel_filename,
3018                          load_elf_strerror(spapr->kernel_size));
3019             exit(1);
3020         }
3021 
3022         /* load initrd */
3023         if (initrd_filename) {
3024             /* Try to locate the initrd in the gap between the kernel
3025              * and the firmware. Add a bit of space just in case
3026              */
3027             spapr->initrd_base = (KERNEL_LOAD_ADDR + spapr->kernel_size
3028                                   + 0x1ffff) & ~0xffff;
3029             spapr->initrd_size = load_image_targphys(initrd_filename,
3030                                                      spapr->initrd_base,
3031                                                      load_limit
3032                                                      - spapr->initrd_base);
3033             if (spapr->initrd_size < 0) {
3034                 error_report("could not load initial ram disk '%s'",
3035                              initrd_filename);
3036                 exit(1);
3037             }
3038         }
3039     }
3040 
3041     if (bios_name == NULL) {
3042         bios_name = FW_FILE_NAME;
3043     }
3044     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
3045     if (!filename) {
3046         error_report("Could not find LPAR firmware '%s'", bios_name);
3047         exit(1);
3048     }
3049     fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
3050     if (fw_size <= 0) {
3051         error_report("Could not load LPAR firmware '%s'", filename);
3052         exit(1);
3053     }
3054     g_free(filename);
3055 
3056     /* FIXME: Should register things through the MachineState's qdev
3057      * interface, this is a legacy from the sPAPREnvironment structure
3058      * which predated MachineState but had a similar function */
3059     vmstate_register(NULL, 0, &vmstate_spapr, spapr);
3060     register_savevm_live(NULL, "spapr/htab", -1, 1,
3061                          &savevm_htab_handlers, spapr);
3062 
3063     qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine),
3064                              &error_fatal);
3065 
3066     qemu_register_boot_set(spapr_boot_set, spapr);
3067 
3068     if (kvm_enabled()) {
3069         /* to stop and start vmclock */
3070         qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
3071                                          &spapr->tb);
3072 
3073         kvmppc_spapr_enable_inkernel_multitce();
3074     }
3075 }
3076 
3077 static int spapr_kvm_type(MachineState *machine, const char *vm_type)
3078 {
3079     if (!vm_type) {
3080         return 0;
3081     }
3082 
3083     if (!strcmp(vm_type, "HV")) {
3084         return 1;
3085     }
3086 
3087     if (!strcmp(vm_type, "PR")) {
3088         return 2;
3089     }
3090 
3091     error_report("Unknown kvm-type specified '%s'", vm_type);
3092     exit(1);
3093 }
3094 
3095 /*
3096  * Implementation of the FWPathProvider interface, used to adjust the
3097  * firmware device path reported for the bootindex property.
3098  */
3099 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
3100                                    DeviceState *dev)
3101 {
3102 #define CAST(type, obj, name) \
3103     ((type *)object_dynamic_cast(OBJECT(obj), (name)))
3104     SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
3105     SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
3106     VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
3107 
3108     if (d) {
3109         void *spapr = CAST(void, bus->parent, "spapr-vscsi");
3110         VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
3111         USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
3112 
3113         if (spapr) {
3114             /*
3115              * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
3116              * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
3117              * 0x8000 | (target << 8) | (bus << 5) | lun
3118              * (see the "Logical unit addressing format" table in SAM5)
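                  * e.g. (illustrative values) target 1, bus 0, lun 0 gives
                  * id = 0x8000 | (1 << 8) = 0x8100, so after the << 48 shift
                  * below the path becomes "disk@8100000000000000"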
3119              */
3120             unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
3121             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3122                                    (uint64_t)id << 48);
3123         } else if (virtio) {
3124             /*
3125              * We use SRP luns of the form 01000000 | (target << 8) | lun
3126              * in the top 32 bits of the 64-bit LUN
3127              * Note: the description above is quoted from SLOF and is wrong;
3128              * the actual SLOF binding (in Forth) is:
3129              * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
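                  * e.g. (illustrative values) target 1, lun 0 gives
                  * id = 0x1000000 | (1 << 16) = 0x1010000, hence
                  * "disk@101000000000000" after the << 32 shift below;
                  * LUNs >= 256 additionally get the 0x4000 flat-space bit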
3130              */
3131             unsigned id = 0x1000000 | (d->id << 16) | d->lun;
3132             if (d->lun >= 256) {
3133                 /* Use the LUN "flat space addressing method" */
3134                 id |= 0x4000;
3135             }
3136             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3137                                    (uint64_t)id << 32);
3138         } else if (usb) {
3139             /*
3140              * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
3141              * in the top 32 bits of the 64-bit LUN
3142              */
3143             unsigned usb_port = atoi(usb->port->path);
3144             unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
3145             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3146                                    (uint64_t)id << 32);
3147         }
3148     }
3149 
3150     /*
3151      * SLOF probes the USB devices, and if it recognizes that the device is a
3152      * storage device, it changes its name to "storage" instead of "usb-host",
3153      * and additionally adds a child node for the SCSI LUN, so the correct
3154      * boot path in SLOF is something like ".../storage@1/disk@xxx" instead.
3155      */
3156     if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
3157         USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
3158         if (usb_host_dev_is_scsi_storage(usbdev)) {
3159             return g_strdup_printf("storage@%s/disk", usbdev->port->path);
3160         }
3161     }
3162 
3163     if (phb) {
3164         /* Replace "pci" with "pci@800000020000000" */
3165         return g_strdup_printf("pci@%"PRIX64, phb->buid);
3166     }
3167 
3168     if (vsc) {
3169         /* Same logic as virtio above */
3170         unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
3171         return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
3172     }
3173 
3174     if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
3175         /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
3176         PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3177         return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
3178     }
3179 
3180     return NULL;
3181 }
3182 
3183 static char *spapr_get_kvm_type(Object *obj, Error **errp)
3184 {
3185     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3186 
3187     return g_strdup(spapr->kvm_type);
3188 }
3189 
3190 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
3191 {
3192     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3193 
3194     g_free(spapr->kvm_type);
3195     spapr->kvm_type = g_strdup(value);
3196 }
3197 
3198 static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
3199 {
3200     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3201 
3202     return spapr->use_hotplug_event_source;
3203 }
3204 
3205 static void spapr_set_modern_hotplug_events(Object *obj, bool value,
3206                                             Error **errp)
3207 {
3208     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3209 
3210     spapr->use_hotplug_event_source = value;
3211 }
3212 
3213 static bool spapr_get_msix_emulation(Object *obj, Error **errp)
3214 {
3215     return true;
3216 }
3217 
3218 static char *spapr_get_resize_hpt(Object *obj, Error **errp)
3219 {
3220     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3221 
3222     switch (spapr->resize_hpt) {
3223     case SPAPR_RESIZE_HPT_DEFAULT:
3224         return g_strdup("default");
3225     case SPAPR_RESIZE_HPT_DISABLED:
3226         return g_strdup("disabled");
3227     case SPAPR_RESIZE_HPT_ENABLED:
3228         return g_strdup("enabled");
3229     case SPAPR_RESIZE_HPT_REQUIRED:
3230         return g_strdup("required");
3231     }
3232     g_assert_not_reached();
3233 }
3234 
3235 static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
3236 {
3237     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3238 
3239     if (strcmp(value, "default") == 0) {
3240         spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
3241     } else if (strcmp(value, "disabled") == 0) {
3242         spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
3243     } else if (strcmp(value, "enabled") == 0) {
3244         spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
3245     } else if (strcmp(value, "required") == 0) {
3246         spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
3247     } else {
3248         error_setg(errp, "Bad value for \"resize-hpt\" property");
3249     }
3250 }
3251 
3252 static void spapr_get_vsmt(Object *obj, Visitor *v, const char *name,
3253                                    void *opaque, Error **errp)
3254 {
3255     visit_type_uint32(v, name, (uint32_t *)opaque, errp);
3256 }
3257 
3258 static void spapr_set_vsmt(Object *obj, Visitor *v, const char *name,
3259                                    void *opaque, Error **errp)
3260 {
3261     visit_type_uint32(v, name, (uint32_t *)opaque, errp);
3262 }
3263 
3264 static char *spapr_get_ic_mode(Object *obj, Error **errp)
3265 {
3266     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3267 
3268     if (spapr->irq == &spapr_irq_xics_legacy) {
3269         return g_strdup("legacy");
3270     } else if (spapr->irq == &spapr_irq_xics) {
3271         return g_strdup("xics");
3272     } else if (spapr->irq == &spapr_irq_xive) {
3273         return g_strdup("xive");
3274     } else if (spapr->irq == &spapr_irq_dual) {
3275         return g_strdup("dual");
3276     }
3277     g_assert_not_reached();
3278 }
3279 
3280 static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
3281 {
3282     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3283 
3284     if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
3285         error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
3286         return;
3287     }
3288 
3289     /* The legacy XICS IRQ backend cannot be selected via this property */
3290     if (strcmp(value, "xics") == 0) {
3291         spapr->irq = &spapr_irq_xics;
3292     } else if (strcmp(value, "xive") == 0) {
3293         spapr->irq = &spapr_irq_xive;
3294     } else if (strcmp(value, "dual") == 0) {
3295         spapr->irq = &spapr_irq_dual;
3296     } else {
3297         error_setg(errp, "Bad value for \"ic-mode\" property");
3298     }
3299 }
3300 
3301 static char *spapr_get_host_model(Object *obj, Error **errp)
3302 {
3303     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3304 
3305     return g_strdup(spapr->host_model);
3306 }
3307 
3308 static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
3309 {
3310     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3311 
3312     g_free(spapr->host_model);
3313     spapr->host_model = g_strdup(value);
3314 }
3315 
3316 static char *spapr_get_host_serial(Object *obj, Error **errp)
3317 {
3318     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3319 
3320     return g_strdup(spapr->host_serial);
3321 }
3322 
3323 static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
3324 {
3325     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3326 
3327     g_free(spapr->host_serial);
3328     spapr->host_serial = g_strdup(value);
3329 }
3330 
3331 static void spapr_instance_init(Object *obj)
3332 {
3333     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3334     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3335 
3336     spapr->htab_fd = -1;
3337     spapr->use_hotplug_event_source = true;
3338     object_property_add_str(obj, "kvm-type",
3339                             spapr_get_kvm_type, spapr_set_kvm_type, NULL);
3340     object_property_set_description(obj, "kvm-type",
3341                                     "Specifies the KVM virtualization mode (HV, PR)",
3342                                     NULL);
3343     object_property_add_bool(obj, "modern-hotplug-events",
3344                             spapr_get_modern_hotplug_events,
3345                             spapr_set_modern_hotplug_events,
3346                             NULL);
3347     object_property_set_description(obj, "modern-hotplug-events",
3348                                     "Use dedicated hotplug event mechanism in"
3349                                     " place of standard EPOW events when possible"
3350                                     " (required for memory hot-unplug support)",
3351                                     NULL);
3352     ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
3353                             "Maximum permitted CPU compatibility mode",
3354                             &error_fatal);
3355 
3356     object_property_add_str(obj, "resize-hpt",
3357                             spapr_get_resize_hpt, spapr_set_resize_hpt, NULL);
3358     object_property_set_description(obj, "resize-hpt",
3359                                     "Resizing of the Hash Page Table (enabled, disabled, required)",
3360                                     NULL);
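         /* Illustrative usage: "-machine pseries,resize-hpt=required" makes
          * the machine fail to start unless HPT resizing is available to
          * the guest */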
3361     object_property_add(obj, "vsmt", "uint32", spapr_get_vsmt,
3362                         spapr_set_vsmt, NULL, &spapr->vsmt, &error_abort);
3363     object_property_set_description(obj, "vsmt",
3364                                     "Virtual SMT: KVM behaves as if this were"
3365                                     " the host's SMT mode", &error_abort);
3366     object_property_add_bool(obj, "vfio-no-msix-emulation",
3367                              spapr_get_msix_emulation, NULL, NULL);
3368 
3369     /* The machine class defines the default interrupt controller mode */
3370     spapr->irq = smc->irq;
3371     object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
3372                             spapr_set_ic_mode, NULL);
3373     object_property_set_description(obj, "ic-mode",
3374                  "Specifies the interrupt controller mode (xics, xive, dual)",
3375                  NULL);
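         /* Illustrative usage: "-machine pseries,ic-mode=xive" forces the
          * XIVE interrupt controller, while "dual" lets the guest choose
          * between XICS and XIVE at CAS time */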
3376 
3377     object_property_add_str(obj, "host-model",
3378         spapr_get_host_model, spapr_set_host_model,
3379         &error_abort);
3380     object_property_set_description(obj, "host-model",
3381         "Host model to advertise in guest device tree", &error_abort);
3382     object_property_add_str(obj, "host-serial",
3383         spapr_get_host_serial, spapr_set_host_serial,
3384         &error_abort);
3385     object_property_set_description(obj, "host-serial",
3386         "Host serial number to advertise in guest device tree", &error_abort);
3387 }
3388 
3389 static void spapr_machine_finalizefn(Object *obj)
3390 {
3391     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3392 
3393     g_free(spapr->kvm_type);
3394 }
3395 
3396 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
3397 {
3398     cpu_synchronize_state(cs);
3399     ppc_cpu_do_system_reset(cs);
3400 }
3401 
3402 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
3403 {
3404     CPUState *cs;
3405 
3406     CPU_FOREACH(cs) {
3407         async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
3408     }
3409 }
3410 
3411 int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3412                           void *fdt, int *fdt_start_offset, Error **errp)
3413 {
3414     uint64_t addr;
3415     uint32_t node;
3416 
3417     addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
3418     node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
3419                                     &error_abort);
3420     *fdt_start_offset = spapr_populate_memory_node(fdt, node, addr,
3421                                                    SPAPR_MEMORY_BLOCK_SIZE);
3422     return 0;
3423 }
3424 
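     /*
      * Illustrative example: a 1 GiB DIMM plugged at address 0x100000000
      * is split into four 256 MiB (SPAPR_MEMORY_BLOCK_SIZE) LMBs, attached
      * to the LMB DRCs with ids addr / SPAPR_MEMORY_BLOCK_SIZE, i.e. 16..19.
      */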
3425 static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
3426                            bool dedicated_hp_event_source, Error **errp)
3427 {
3428     SpaprDrc *drc;
3429     uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3430     int i;
3431     uint64_t addr = addr_start;
3432     bool hotplugged = spapr_drc_hotplugged(dev);
3433     Error *local_err = NULL;
3434 
3435     for (i = 0; i < nr_lmbs; i++) {
3436         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3437                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3438         g_assert(drc);
3439 
3440         spapr_drc_attach(drc, dev, &local_err);
3441         if (local_err) {
3442             while (addr > addr_start) {
3443                 addr -= SPAPR_MEMORY_BLOCK_SIZE;
3444                 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3445                                       addr / SPAPR_MEMORY_BLOCK_SIZE);
3446                 spapr_drc_detach(drc);
3447             }
3448             error_propagate(errp, local_err);
3449             return;
3450         }
3451         if (!hotplugged) {
3452             spapr_drc_reset(drc);
3453         }
3454         addr += SPAPR_MEMORY_BLOCK_SIZE;
3455     }
3456     /* Send the hotplug notification to the guest only for memory
3457      * that was actually hotplugged, not for cold-plugged DIMMs
3458      */
3459     if (hotplugged) {
3460         if (dedicated_hp_event_source) {
3461             drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3462                                   addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3463             spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3464                                                    nr_lmbs,
3465                                                    spapr_drc_index(drc));
3466         } else {
3467             spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
3468                                            nr_lmbs);
3469         }
3470     }
3471 }
3472 
3473 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3474                               Error **errp)
3475 {
3476     Error *local_err = NULL;
3477     SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
3478     PCDIMMDevice *dimm = PC_DIMM(dev);
3479     uint64_t size, addr;
3480 
3481     size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
3482 
3483     pc_dimm_plug(dimm, MACHINE(ms), &local_err);
3484     if (local_err) {
3485         goto out;
3486     }
3487 
3488     addr = object_property_get_uint(OBJECT(dimm),
3489                                     PC_DIMM_ADDR_PROP, &local_err);
3490     if (local_err) {
3491         goto out_unplug;
3492     }
3493 
3494     spapr_add_lmbs(dev, addr, size, spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT),
3495                    &local_err);
3496     if (local_err) {
3497         goto out_unplug;
3498     }
3499 
3500     return;
3501 
3502 out_unplug:
3503     pc_dimm_unplug(dimm, MACHINE(ms));
3504 out:
3505     error_propagate(errp, local_err);
3506 }
3507 
3508 static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3509                                   Error **errp)
3510 {
3511     const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
3512     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3513     PCDIMMDevice *dimm = PC_DIMM(dev);
3514     Error *local_err = NULL;
3515     uint64_t size;
3516     Object *memdev;
3517     hwaddr pagesize;
3518 
3519     if (!smc->dr_lmb_enabled) {
3520         error_setg(errp, "Memory hotplug not supported for this machine");
3521         return;
3522     }
3523 
3524     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
3525     if (local_err) {
3526         error_propagate(errp, local_err);
3527         return;
3528     }
3529 
3530     if (size % SPAPR_MEMORY_BLOCK_SIZE) {
3531         error_setg(errp, "Hotplugged memory size must be a multiple of "
3532                       "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
3533         return;
3534     }
3535 
3536     memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
3537                                       &error_abort);
3538     pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
3539     spapr_check_pagesize(spapr, pagesize, &local_err);
3540     if (local_err) {
3541         error_propagate(errp, local_err);
3542         return;
3543     }
3544 
3545     pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), NULL, errp);
3546 }
3547 
3548 struct SpaprDimmState {
3549     PCDIMMDevice *dimm;
3550     uint32_t nr_lmbs;
3551     QTAILQ_ENTRY(SpaprDimmState) next;
3552 };
3553 
3554 static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
3555                                                        PCDIMMDevice *dimm)
3556 {
3557     SpaprDimmState *dimm_state = NULL;
3558 
3559     QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
3560         if (dimm_state->dimm == dimm) {
3561             break;
3562         }
3563     }
3564     return dimm_state;
3565 }
3566 
3567 static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
3568                                                       uint32_t nr_lmbs,
3569                                                       PCDIMMDevice *dimm)
3570 {
3571     SpaprDimmState *ds = NULL;
3572 
3573     /*
3574      * If this request is for a DIMM whose removal had failed earlier
3575      * (due to the guest's refusal to remove the LMBs), this DIMM would
3576      * already be in the pending_dimm_unplugs list. In that case, don't
3577      * add it again.
3578      */
3579     ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3580     if (!ds) {
3581         ds = g_malloc0(sizeof(SpaprDimmState));
3582         ds->nr_lmbs = nr_lmbs;
3583         ds->dimm = dimm;
3584         QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
3585     }
3586     return ds;
3587 }
3588 
3589 static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
3590                                               SpaprDimmState *dimm_state)
3591 {
3592     QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
3593     g_free(dimm_state);
3594 }
3595 
3596 static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
3597                                                         PCDIMMDevice *dimm)
3598 {
3599     SpaprDrc *drc;
3600     uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
3601                                                   &error_abort);
3602     uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3603     uint32_t avail_lmbs = 0;
3604     uint64_t addr_start, addr;
3605     int i;
3606 
3607     addr_start = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3608                                          &error_abort);
3609 
3610     addr = addr_start;
3611     for (i = 0; i < nr_lmbs; i++) {
3612         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3613                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3614         g_assert(drc);
3615         if (drc->dev) {
3616             avail_lmbs++;
3617         }
3618         addr += SPAPR_MEMORY_BLOCK_SIZE;
3619     }
3620 
3621     return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
3622 }
3623 
3624 /* Callback to be called during DRC release. */
3625 void spapr_lmb_release(DeviceState *dev)
3626 {
3627     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3628     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
3629     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3630 
3631     /* This information is lost if a migration occurs during the
3632      * unplug process. In that case, recover it. */
3633     if (ds == NULL) {
3634         ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
3635         g_assert(ds);
3636         /* The DRC being examined by the caller at least must be counted */
3637         g_assert(ds->nr_lmbs);
3638     }
3639 
3640     if (--ds->nr_lmbs) {
3641         return;
3642     }
3643 
3644     /*
3645      * Now that all the LMBs have been removed by the guest, call the
3646      * unplug handler chain. This can never fail.
3647      */
3648     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3649     object_unparent(OBJECT(dev));
3650 }
3651 
3652 static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3653 {
3654     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3655     SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3656 
3657     pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
3658     object_property_set_bool(OBJECT(dev), false, "realized", NULL);
3659     spapr_pending_dimm_unplugs_remove(spapr, ds);
3660 }
3661 
3662 static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
3663                                         DeviceState *dev, Error **errp)
3664 {
3665     SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3666     Error *local_err = NULL;
3667     PCDIMMDevice *dimm = PC_DIMM(dev);
3668     uint32_t nr_lmbs;
3669     uint64_t size, addr_start, addr;
3670     int i;
3671     SpaprDrc *drc;
3672 
3673     size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3674     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3675 
3676     addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3677                                          &local_err);
3678     if (local_err) {
3679         goto out;
3680     }
3681 
3682     /*
3683      * An existing pending dimm state for this DIMM means that there is an
3684      * unplug operation in progress, waiting for the spapr_lmb_release
3685      * callback to complete the job (BQL can't cover that far). In this case,
3686      * bail out to avoid detaching DRCs that were already released.
3687      */
3688     if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3689         error_setg(&local_err,
3690                    "Memory unplug already in progress for device %s",
3691                    dev->id);
3692         goto out;
3693     }
3694 
3695     spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3696 
3697     addr = addr_start;
3698     for (i = 0; i < nr_lmbs; i++) {
3699         drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3700                               addr / SPAPR_MEMORY_BLOCK_SIZE);
3701         g_assert(drc);
3702 
3703         spapr_drc_detach(drc);
3704         addr += SPAPR_MEMORY_BLOCK_SIZE;
3705     }
3706 
3707     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3708                           addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3709     spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3710                                               nr_lmbs, spapr_drc_index(drc));
3711 out:
3712     error_propagate(errp, local_err);
3713 }
3714 
3715 /* Callback to be called during DRC release. */
3716 void spapr_core_release(DeviceState *dev)
3717 {
3718     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3719 
3720     /* Call the unplug handler chain. This can never fail. */
3721     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3722     object_unparent(OBJECT(dev));
3723 }
3724 
3725 static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3726 {
3727     MachineState *ms = MACHINE(hotplug_dev);
3728     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
3729     CPUCore *cc = CPU_CORE(dev);
3730     CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3731 
3732     if (smc->pre_2_10_has_unused_icps) {
3733         SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
3734         int i;
3735 
3736         for (i = 0; i < cc->nr_threads; i++) {
3737             CPUState *cs = CPU(sc->threads[i]);
3738 
3739             pre_2_10_vmstate_register_dummy_icp(cs->cpu_index);
3740         }
3741     }
3742 
3743     assert(core_slot);
3744     core_slot->cpu = NULL;
3745     object_property_set_bool(OBJECT(dev), false, "realized", NULL);
3746 }
3747 
3748 static
3749 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3750                                Error **errp)
3751 {
3752     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3753     int index;
3754     SpaprDrc *drc;
3755     CPUCore *cc = CPU_CORE(dev);
3756 
3757     if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3758         error_setg(errp, "Unable to find CPU core with core-id: %d",
3759                    cc->core_id);
3760         return;
3761     }
3762     if (index == 0) {
3763         error_setg(errp, "Boot CPU core may not be unplugged");
3764         return;
3765     }
3766 
3767     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3768                           spapr_vcpu_id(spapr, cc->core_id));
3769     g_assert(drc);
3770 
3771     spapr_drc_detach(drc);
3772 
3773     spapr_hotplug_req_remove_by_index(drc);
3774 }
3775 
3776 int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3777                            void *fdt, int *fdt_start_offset, Error **errp)
3778 {
3779     SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3780     CPUState *cs = CPU(core->threads[0]);
3781     PowerPCCPU *cpu = POWERPC_CPU(cs);
3782     DeviceClass *dc = DEVICE_GET_CLASS(cs);
3783     int id = spapr_get_vcpu_id(cpu);
3784     char *nodename;
3785     int offset;
3786 
3787     nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3788     offset = fdt_add_subnode(fdt, 0, nodename);
3789     g_free(nodename);
3790 
3791     spapr_populate_cpu_dt(cs, fdt, offset, spapr);
3792 
3793     *fdt_start_offset = offset;
3794     return 0;
3795 }
3796 
3797 static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3798                             Error **errp)
3799 {
3800     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3801     MachineClass *mc = MACHINE_GET_CLASS(spapr);
3802     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
3803     SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
3804     CPUCore *cc = CPU_CORE(dev);
3805     CPUState *cs;
3806     SpaprDrc *drc;
3807     Error *local_err = NULL;
3808     CPUArchId *core_slot;
3809     int index;
3810     bool hotplugged = spapr_drc_hotplugged(dev);
3811 
3812     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3813     if (!core_slot) {
3814         error_setg(errp, "Unable to find CPU core with core-id: %d",
3815                    cc->core_id);
3816         return;
3817     }
3818     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3819                           spapr_vcpu_id(spapr, cc->core_id));
3820 
3821     g_assert(drc || !mc->has_hotpluggable_cpus);
3822 
3823     if (drc) {
3824         spapr_drc_attach(drc, dev, &local_err);
3825         if (local_err) {
3826             error_propagate(errp, local_err);
3827             return;
3828         }
3829 
3830         if (hotplugged) {
3831             /*
3832              * Send hotplug notification interrupt to the guest only
3833              * in case of hotplugged CPUs.
3834              */
3835             spapr_hotplug_req_add_by_index(drc);
3836         } else {
3837             spapr_drc_reset(drc);
3838         }
3839     }
3840 
3841     core_slot->cpu = OBJECT(dev);
3842 
3843     if (smc->pre_2_10_has_unused_icps) {
3844         int i;
3845 
3846         for (i = 0; i < cc->nr_threads; i++) {
3847             cs = CPU(core->threads[i]);
3848             pre_2_10_vmstate_unregister_dummy_icp(cs->cpu_index);
3849         }
3850     }
3851 }
3852 
3853 static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3854                                 Error **errp)
3855 {
3856     MachineState *machine = MACHINE(OBJECT(hotplug_dev));
3857     MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
3858     Error *local_err = NULL;
3859     CPUCore *cc = CPU_CORE(dev);
3860     const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
3861     const char *type = object_get_typename(OBJECT(dev));
3862     CPUArchId *core_slot;
3863     int index;
3864 
3865     if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
3866         error_setg(&local_err, "CPU hotplug not supported for this machine");
3867         goto out;
3868     }
3869 
3870     if (strcmp(base_core_type, type)) {
3871         error_setg(&local_err, "CPU core type should be %s", base_core_type);
3872         goto out;
3873     }
3874 
3875     if (cc->core_id % smp_threads) {
3876         error_setg(&local_err, "invalid core id %d", cc->core_id);
3877         goto out;
3878     }
3879 
3880     /*
3881      * In general we should have homogeneous threads-per-core, but old
3882      * (pre-hotplug-support) machine types allow the last core to have
3883      * reduced threads, as a compatibility hack from when we allowed a
3884      * total vCPU count that was not a multiple of threads-per-core.
3885      */
3886     if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
3887         error_setg(&local_err, "invalid nr-threads %d, must be %d",
3888                    cc->nr_threads, smp_threads);
3889         goto out;
3890     }
3891 
3892     core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3893     if (!core_slot) {
3894         error_setg(&local_err, "core id %d out of range", cc->core_id);
3895         goto out;
3896     }
3897 
3898     if (core_slot->cpu) {
3899         error_setg(&local_err, "core %d already populated", cc->core_id);
3900         goto out;
3901     }
3902 
3903     numa_cpu_pre_plug(core_slot, dev, &local_err);
3904 
3905 out:
3906     error_propagate(errp, local_err);
3907 }
3908 
3909 int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3910                           void *fdt, int *fdt_start_offset, Error **errp)
3911 {
3912     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
3913     int intc_phandle;
3914 
3915     intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
3916     if (intc_phandle <= 0) {
3917         return -1;
3918     }
3919 
3920     if (spapr_populate_pci_dt(sphb, intc_phandle, fdt, spapr->irq->nr_msis,
3921                               fdt_start_offset)) {
3922         error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
3923         return -1;
3924     }
3925 
3926     /* Generally SLOF creates these; for hotplug it's up to QEMU */
3927     _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
3928 
3929     return 0;
3930 }
3931 
3932 static void spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3933                                Error **errp)
3934 {
3935     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3936     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
3937     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3938     const unsigned windows_supported = spapr_phb_windows_supported(sphb);
3939 
3940     if (dev->hotplugged && !smc->dr_phb_enabled) {
3941         error_setg(errp, "PHB hotplug not supported for this machine");
3942         return;
3943     }
3944 
3945     if (sphb->index == (uint32_t)-1) {
3946         error_setg(errp, "\"index\" for PAPR PHB is mandatory");
3947         return;
3948     }
3949 
3950     /*
3951      * This will check that sphb->index doesn't exceed the maximum number of
3952      * PHBs for the current machine type.
3953      */
3954     smc->phb_placement(spapr, sphb->index,
3955                        &sphb->buid, &sphb->io_win_addr,
3956                        &sphb->mem_win_addr, &sphb->mem64_win_addr,
3957                        windows_supported, sphb->dma_liobn,
3958                        &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
3959                        errp);
3960 }
3961 
3962 static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3963                            Error **errp)
3964 {
3965     SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3966     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3967     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
3968     SpaprDrc *drc;
3969     bool hotplugged = spapr_drc_hotplugged(dev);
3970     Error *local_err = NULL;
3971 
3972     if (!smc->dr_phb_enabled) {
3973         return;
3974     }
3975 
3976     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
3977     /* hotplug hooks should check it's enabled before getting this far */
3978     assert(drc);
3979 
3980     spapr_drc_attach(drc, DEVICE(dev), &local_err);
3981     if (local_err) {
3982         error_propagate(errp, local_err);
3983         return;
3984     }
3985 
3986     if (hotplugged) {
3987         spapr_hotplug_req_add_by_index(drc);
3988     } else {
3989         spapr_drc_reset(drc);
3990     }
3991 }
3992 
3993 void spapr_phb_release(DeviceState *dev)
3994 {
3995     HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3996 
3997     hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3998     object_unparent(OBJECT(dev));
3999 }
4000 
4001 static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4002 {
4003     object_property_set_bool(OBJECT(dev), false, "realized", NULL);
4004 }
4005 
4006 static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
4007                                      DeviceState *dev, Error **errp)
4008 {
4009     SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4010     SpaprDrc *drc;
4011 
4012     drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4013     assert(drc);
4014 
4015     if (!spapr_drc_unplug_requested(drc)) {
4016         spapr_drc_detach(drc);
4017         spapr_hotplug_req_remove_by_index(drc);
4018     }
4019 }
4020 
4021 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
4022                                       DeviceState *dev, Error **errp)
4023 {
4024     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4025         spapr_memory_plug(hotplug_dev, dev, errp);
4026     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4027         spapr_core_plug(hotplug_dev, dev, errp);
4028     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4029         spapr_phb_plug(hotplug_dev, dev, errp);
4030     }
4031 }
4032 
4033 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
4034                                         DeviceState *dev, Error **errp)
4035 {
4036     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4037         spapr_memory_unplug(hotplug_dev, dev);
4038     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4039         spapr_core_unplug(hotplug_dev, dev);
4040     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4041         spapr_phb_unplug(hotplug_dev, dev);
4042     }
4043 }
4044 
4045 static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
4046                                                 DeviceState *dev, Error **errp)
4047 {
4048     SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
4049     MachineClass *mc = MACHINE_GET_CLASS(sms);
4050     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4051 
4052     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4053         if (spapr_ovec_test(sms->ov5_cas, OV5_HP_EVT)) {
4054             spapr_memory_unplug_request(hotplug_dev, dev, errp);
4055         } else {
4056             /* NOTE: this means there is a window after guest reset, prior to
4057              * CAS negotiation, where unplug requests will fail because the
4058              * capability has not been detected yet. This is a bit different
4059              * from the case of PCI unplug, where the events are queued and
4060              * eventually handled by the guest after boot.
4061              */
4062             error_setg(errp, "Memory hot unplug not supported for this guest");
4063         }
4064     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4065         if (!mc->has_hotpluggable_cpus) {
4066             error_setg(errp, "CPU hot unplug not supported on this machine");
4067             return;
4068         }
4069         spapr_core_unplug_request(hotplug_dev, dev, errp);
4070     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4071         if (!smc->dr_phb_enabled) {
4072             error_setg(errp, "PHB hot unplug not supported on this machine");
4073             return;
4074         }
4075         spapr_phb_unplug_request(hotplug_dev, dev, errp);
4076     }
4077 }
4078 
4079 static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
4080                                           DeviceState *dev, Error **errp)
4081 {
4082     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4083         spapr_memory_pre_plug(hotplug_dev, dev, errp);
4084     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4085         spapr_core_pre_plug(hotplug_dev, dev, errp);
4086     } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4087         spapr_phb_pre_plug(hotplug_dev, dev, errp);
4088     }
4089 }
4090 
4091 static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
4092                                                  DeviceState *dev)
4093 {
4094     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
4095         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
4096         object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4097         return HOTPLUG_HANDLER(machine);
4098     }
4099     return NULL;
4100 }
4101 
4102 static CpuInstanceProperties
4103 spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
4104 {
4105     CPUArchId *core_slot;
4106     MachineClass *mc = MACHINE_GET_CLASS(machine);
4107 
4108     /* make sure possible_cpus is initialized */
4109     mc->possible_cpu_arch_ids(machine);
4110     /* get CPU core slot containing thread that matches cpu_index */
4111     core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
4112     assert(core_slot);
4113     return core_slot->props;
4114 }
4115 
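     /*
      * Assign sockets round-robin to NUMA nodes: e.g. (illustrative
      * values) with smp_cores=2 and two nodes, core slots 0-1 map to
      * node 0, slots 2-3 to node 1, and slots 4-5 wrap back to node 0.
      */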
4116 static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
4117 {
4118     return (idx / smp_cores) % nb_numa_nodes;
4119 }
4120 
4121 static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
4122 {
4123     int i;
4124     const char *core_type;
4125     int spapr_max_cores = max_cpus / smp_threads;
4126     MachineClass *mc = MACHINE_GET_CLASS(machine);
4127 
4128     if (!mc->has_hotpluggable_cpus) {
4129         spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
4130     }
4131     if (machine->possible_cpus) {
4132         assert(machine->possible_cpus->len == spapr_max_cores);
4133         return machine->possible_cpus;
4134     }
4135 
4136     core_type = spapr_get_cpu_core_type(machine->cpu_type);
4137     if (!core_type) {
4138         error_report("Unable to find sPAPR CPU Core definition");
4139         exit(1);
4140     }
4141 
4142     machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
4143                              sizeof(CPUArchId) * spapr_max_cores);
4144     machine->possible_cpus->len = spapr_max_cores;
4145     for (i = 0; i < machine->possible_cpus->len; i++) {
4146         int core_id = i * smp_threads;
4147 
4148         machine->possible_cpus->cpus[i].type = core_type;
4149         machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
4150         machine->possible_cpus->cpus[i].arch_id = core_id;
4151         machine->possible_cpus->cpus[i].props.has_core_id = true;
4152         machine->possible_cpus->cpus[i].props.core_id = core_id;
4153     }
4154     return machine->possible_cpus;
4155 }
4156 
4157 static void spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
4158                                 uint64_t *buid, hwaddr *pio,
4159                                 hwaddr *mmio32, hwaddr *mmio64,
4160                                 unsigned n_dma, uint32_t *liobns,
4161                                 hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4162 {
4163     /*
4164      * New-style PHB window placement.
4165      *
4166      * Goal: give each PHB a large (1TiB), naturally aligned 64-bit MMIO
4167      * window, in addition to a 2GiB 32-bit MMIO window and a 64kiB PIO
4168      * window.
4169      *
4170      * Some guest kernels can't work with MMIO windows above 1<<46
4171      * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
4172      *
4173      * 32TiB..(32TiB+1984kiB) contains the 64kiB PIO windows for each
4174      * PHB stacked together.  (32TiB+2GiB)..(32TiB+64GiB) contains the
4175      * 2GiB 32-bit MMIO windows for each PHB.  Then 33..64TiB has the
4176      * 1TiB 64-bit MMIO windows for each PHB.
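          *
          * e.g. index 0 gets PIO at 32TiB, 32-bit MMIO at 32TiB+2GiB and
          * 64-bit MMIO at 33TiB; index 1 gets PIO at 32TiB+64kiB, 32-bit
          * MMIO at 32TiB+4GiB and 64-bit MMIO at 34TiB (values follow
          * from the window sizes above).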
4177      */
4178     const uint64_t base_buid = 0x800000020000000ULL;
4179     int i;
4180 
4181     /* Sanity check natural alignments */
4182     QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4183     QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4184     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
4185     QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
4186     /* Sanity check bounds */
4187     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
4188                       SPAPR_PCI_MEM32_WIN_SIZE);
4189     QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
4190                       SPAPR_PCI_MEM64_WIN_SIZE);
4191 
4192     if (index >= SPAPR_MAX_PHBS) {
4193         error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
4194                    SPAPR_MAX_PHBS - 1);
4195         return;
4196     }
4197 
4198     *buid = base_buid + index;
4199     for (i = 0; i < n_dma; ++i) {
4200         liobns[i] = SPAPR_PCI_LIOBN(index, i);
4201     }
4202 
4203     *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
4204     *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
4205     *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
4206 
4207     *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
4208     *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
4209 }
4210 
4211 static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
4212 {
4213     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4214 
4215     return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
4216 }
4217 
4218 static void spapr_ics_resend(XICSFabric *dev)
4219 {
4220     SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4221 
4222     ics_resend(spapr->ics);
4223 }
4224 
4225 static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
4226 {
4227     PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
4228 
4229     return cpu ? spapr_cpu_state(cpu)->icp : NULL;
4230 }
4231 
4232 static void spapr_pic_print_info(InterruptStatsProvider *obj,
4233                                  Monitor *mon)
4234 {
4235     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
4236 
4237     spapr->irq->print_info(spapr, mon);
4238 }
4239 
4240 int spapr_get_vcpu_id(PowerPCCPU *cpu)
4241 {
4242     return cpu->vcpu_id;
4243 }
4244 
4245 void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
4246 {
4247     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
4248     int vcpu_id;
4249 
4250     vcpu_id = spapr_vcpu_id(spapr, cpu_index);
4251 
4252     if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
4253         error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
4254         error_append_hint(errp, "Adjust the number of cpus to %d "
4255                           "or try to raise the number of threads per core\n",
4256                           vcpu_id * smp_threads / spapr->vsmt);
4257         return;
4258     }
4259 
4260     cpu->vcpu_id = vcpu_id;
4261 }
4262 
4263 PowerPCCPU *spapr_find_cpu(int vcpu_id)
4264 {
4265     CPUState *cs;
4266 
4267     CPU_FOREACH(cs) {
4268         PowerPCCPU *cpu = POWERPC_CPU(cs);
4269 
4270         if (spapr_get_vcpu_id(cpu) == vcpu_id) {
4271             return cpu;
4272         }
4273     }
4274 
4275     return NULL;
4276 }
4277 
4278 static void spapr_machine_class_init(ObjectClass *oc, void *data)
4279 {
4280     MachineClass *mc = MACHINE_CLASS(oc);
4281     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
4282     FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
4283     NMIClass *nc = NMI_CLASS(oc);
4284     HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
4285     PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
4286     XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
4287     InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
4288 
4289     mc->desc = "pSeries Logical Partition (PAPR compliant)";
4290     mc->ignore_boot_device_suffixes = true;
4291 
4292     /*
4293      * We set up the default / latest behaviour here.  The class_init
4294      * functions for the specific versioned machine types can override
4295      * these details for backwards compatibility
4296      */
4297     mc->init = spapr_machine_init;
4298     mc->reset = spapr_machine_reset;
4299     mc->block_default_type = IF_SCSI;
4300     mc->max_cpus = 1024;
4301     mc->no_parallel = 1;
4302     mc->default_boot_order = "";
4303     mc->default_ram_size = 512 * MiB;
4304     mc->default_display = "std";
4305     mc->kvm_type = spapr_kvm_type;
4306     machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
4307     mc->pci_allow_0_address = true;
4308     assert(!mc->get_hotplug_handler);
4309     mc->get_hotplug_handler = spapr_get_hotplug_handler;
4310     hc->pre_plug = spapr_machine_device_pre_plug;
4311     hc->plug = spapr_machine_device_plug;
4312     mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
4313     mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
4314     mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
4315     hc->unplug_request = spapr_machine_device_unplug_request;
4316     hc->unplug = spapr_machine_device_unplug;
4317 
4318     smc->dr_lmb_enabled = true;
4319     smc->update_dt_enabled = true;
4320     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.0");
4321     mc->has_hotpluggable_cpus = true;
4322     smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
4323     fwc->get_dev_path = spapr_get_fw_dev_path;
4324     nc->nmi_monitor_handler = spapr_nmi;
4325     smc->phb_placement = spapr_phb_placement;
4326     vhc->hypercall = emulate_spapr_hypercall;
4327     vhc->hpt_mask = spapr_hpt_mask;
4328     vhc->map_hptes = spapr_map_hptes;
4329     vhc->unmap_hptes = spapr_unmap_hptes;
4330     vhc->hpte_set_c = spapr_hpte_set_c;
4331     vhc->hpte_set_r = spapr_hpte_set_r;
4332     vhc->get_pate = spapr_get_pate;
4333     vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
4334     xic->ics_get = spapr_ics_get;
4335     xic->ics_resend = spapr_ics_resend;
4336     xic->icp_get = spapr_icp_get;
4337     ispc->print_info = spapr_pic_print_info;
4338     /* Force NUMA node memory size to be a multiple of
4339      * SPAPR_MEMORY_BLOCK_SIZE (256MiB, i.e. 1 << 28) since that's the
4340      * granularity at which LMBs are represented and hot-added
4341      */
4342     mc->numa_mem_align_shift = 28;
4343 
4344     smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
4345     smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
4346     smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
4347     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
4348     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
4349     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
4350     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
4351     smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
4352     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
4353     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
4354     spapr_caps_add_properties(smc, &error_abort);
4355     smc->irq = &spapr_irq_dual;
4356     smc->dr_phb_enabled = true;
4357 }
4358 
4359 static const TypeInfo spapr_machine_info = {
4360     .name          = TYPE_SPAPR_MACHINE,
4361     .parent        = TYPE_MACHINE,
4362     .abstract      = true,
4363     .instance_size = sizeof(SpaprMachineState),
4364     .instance_init = spapr_instance_init,
4365     .instance_finalize = spapr_machine_finalizefn,
4366     .class_size    = sizeof(SpaprMachineClass),
4367     .class_init    = spapr_machine_class_init,
4368     .interfaces = (InterfaceInfo[]) {
4369         { TYPE_FW_PATH_PROVIDER },
4370         { TYPE_NMI },
4371         { TYPE_HOTPLUG_HANDLER },
4372         { TYPE_PPC_VIRTUAL_HYPERVISOR },
4373         { TYPE_XICS_FABRIC },
4374         { TYPE_INTERRUPT_STATS_PROVIDER },
4375         { }
4376     },
4377 };
4378 
4379 #define DEFINE_SPAPR_MACHINE(suffix, verstr, latest)                 \
4380     static void spapr_machine_##suffix##_class_init(ObjectClass *oc, \
4381                                                     void *data)      \
4382     {                                                                \
4383         MachineClass *mc = MACHINE_CLASS(oc);                        \
4384         spapr_machine_##suffix##_class_options(mc);                  \
4385         if (latest) {                                                \
4386             mc->alias = "pseries";                                   \
4387             mc->is_default = 1;                                      \
4388         }                                                            \
4389     }                                                                \
4390     static const TypeInfo spapr_machine_##suffix##_info = {          \
4391         .name = MACHINE_TYPE_NAME("pseries-" verstr),                \
4392         .parent = TYPE_SPAPR_MACHINE,                                \
4393         .class_init = spapr_machine_##suffix##_class_init,           \
4394     };                                                               \
4395     static void spapr_machine_register_##suffix(void)                \
4396     {                                                                \
4397         type_register(&spapr_machine_##suffix##_info);               \
4398     }                                                                \
4399     type_init(spapr_machine_register_##suffix)
4400 
4401 /*
4402  * pseries-4.1
4403  */
4404 static void spapr_machine_4_1_class_options(MachineClass *mc)
4405 {
4406     /* Defaults for the latest behaviour inherited from the base class */
4407 }
4408 
4409 DEFINE_SPAPR_MACHINE(4_1, "4.1", true);
4410 
4411 /*
4412  * pseries-4.0
4413  */
4414 static void phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
4415                               uint64_t *buid, hwaddr *pio,
4416                               hwaddr *mmio32, hwaddr *mmio64,
4417                               unsigned n_dma, uint32_t *liobns,
4418                               hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4419 {
4420     spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma, liobns,
4421                         nv2gpa, nv2atsd, errp);
4422     *nv2gpa = 0;
4423     *nv2atsd = 0;
4424 }
4425 
4426 static void spapr_machine_4_0_class_options(MachineClass *mc)
4427 {
4428     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4429 
4430     spapr_machine_4_1_class_options(mc);
4431     compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
4432     smc->phb_placement = phb_placement_4_0;
4433     smc->irq = &spapr_irq_xics;
4434     smc->pre_4_1_migration = true;
4435 }
4436 
4437 DEFINE_SPAPR_MACHINE(4_0, "4.0", false);
4438 
4439 /*
4440  * pseries-3.1
4441  */
4442 static void spapr_machine_3_1_class_options(MachineClass *mc)
4443 {
4444     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4445 
4446     spapr_machine_4_0_class_options(mc);
4447     compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);
4448 
4449     mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
4450     smc->update_dt_enabled = false;
4451     smc->dr_phb_enabled = false;
4452     smc->broken_host_serial_model = true;
4453     smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
4454     smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
4455     smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
4456     smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
4457 }
4458 
4459 DEFINE_SPAPR_MACHINE(3_1, "3.1", false);
4460 
4461 /*
4462  * pseries-3.0
4463  */
4464 
4465 static void spapr_machine_3_0_class_options(MachineClass *mc)
4466 {
4467     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4468 
4469     spapr_machine_3_1_class_options(mc);
4470     compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);
4471 
4472     smc->legacy_irq_allocation = true;
4473     smc->irq = &spapr_irq_xics_legacy;
4474 }
4475 
4476 DEFINE_SPAPR_MACHINE(3_0, "3.0", false);
4477 
4478 /*
4479  * pseries-2.12
4480  */
4481 static void spapr_machine_2_12_class_options(MachineClass *mc)
4482 {
4483     SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4484     static GlobalProperty compat[] = {
4485         { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" },
4486         { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" },
4487     };
4488 
4489     spapr_machine_3_0_class_options(mc);
4490     compat_props_add(mc->compat_props, hw_compat_2_12, hw_compat_2_12_len);
4491     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
4492 
4493     /* We depend on kvm_enabled() to choose a default value for the
4494      * hpt-max-page-size capability. Of course we can't do it here
4495      * because this is too early and the HW accelerator isn't initialized
4496      * yet. Postpone this to machine init (see default_caps_with_cpu()).
4497      */
4498     smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0;
4499 }
4500 
4501 DEFINE_SPAPR_MACHINE(2_12, "2.12", false);
4502 
static void spapr_machine_2_12_sxxm_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_12_class_options(mc);
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD;
}

DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);

/*
 * pseries-2.11
 */

static void spapr_machine_2_11_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_12_class_options(mc);
    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON;
    compat_props_add(mc->compat_props, hw_compat_2_11, hw_compat_2_11_len);
}

DEFINE_SPAPR_MACHINE(2_11, "2.11", false);

/*
 * pseries-2.10
 */

static void spapr_machine_2_10_class_options(MachineClass *mc)
{
    spapr_machine_2_11_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_10, hw_compat_2_10_len);
}

DEFINE_SPAPR_MACHINE(2_10, "2.10", false);

/*
 * pseries-2.9
 */

static void spapr_machine_2_9_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_POWERPC_CPU, "pre-2.10-migration", "on" },
    };

    spapr_machine_2_10_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->numa_auto_assign_ram = numa_legacy_auto_assign_ram;
    smc->pre_2_10_has_unused_icps = true;
    smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
}

DEFINE_SPAPR_MACHINE(2_9, "2.9", false);

/*
 * pseries-2.8
 */

static void spapr_machine_2_8_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" },
    };

    spapr_machine_2_9_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_8, hw_compat_2_8_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->numa_mem_align_shift = 23;
}

DEFINE_SPAPR_MACHINE(2_8, "2.8", false);

/*
 * pseries-2.7
 */

static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
                              uint64_t *buid, hwaddr *pio,
                              hwaddr *mmio32, hwaddr *mmio64,
                              unsigned n_dma, uint32_t *liobns,
                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
    /* Legacy PHB placement for pseries-2.7 and earlier machine types */
    const uint64_t base_buid = 0x800000020000000ULL;
    const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */
    const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */
    const hwaddr pio_offset = 0x80000000; /* 2 GiB */
    const uint32_t max_index = 255;
    const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */

    uint64_t ram_top = MACHINE(spapr)->ram_size;
    hwaddr phb0_base, phb_base;
    int i;

    /* Do we have device memory? */
    if (MACHINE(spapr)->maxram_size > ram_top) {
        /* Can't just use maxram_size, because there may be an
         * alignment gap between normal and device memory regions
         */
        ram_top = MACHINE(spapr)->device_memory->base +
            memory_region_size(&MACHINE(spapr)->device_memory->mr);
    }

    phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);

    if (index > max_index) {
        error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
                   max_index);
        return;
    }

    *buid = base_buid + index;
    for (i = 0; i < n_dma; ++i) {
        liobns[i] = SPAPR_PCI_LIOBN(index, i);
    }

    phb_base = phb0_base + index * phb_spacing;
    *pio = phb_base + pio_offset;
    *mmio32 = phb_base + mmio_offset;
    /*
     * We don't set the 64-bit MMIO window, relying on the PHB's
     * fallback behaviour of automatically splitting a large "32-bit"
     * window into contiguous 32-bit and 64-bit windows
     */

    *nv2gpa = 0;
    *nv2atsd = 0;
}
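
/*
 * Worked example of the placement above (numbers derived from the
 * constants in phb_placement_2_7(), assuming a guest with 4 GiB of
 * RAM and no device memory):
 *
 *     ram_top   = 0x100000000      (4 GiB)
 *     phb0_base = 0x10000000000    (ram_top aligned up to 1 TiB)
 *
 * For index 0:
 *     *buid   = 0x800000020000000
 *     *pio    = 0x10080000000      (phb_base + 2 GiB)
 *     *mmio32 = 0x100a0000000      (phb_base + 2 GiB + 512 MiB)
 *
 * Each subsequent index moves phb_base up by 64 GiB.
 */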

static void spapr_machine_2_7_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000", },
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0", },
        { TYPE_POWERPC_CPU, "pre-2.8-migration", "on", },
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on", },
    };

    spapr_machine_2_8_class_options(mc);
    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3");
    mc->default_machine_opts = "modern-hotplug-events=off";
    compat_props_add(mc->compat_props, hw_compat_2_7, hw_compat_2_7_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    smc->phb_placement = phb_placement_2_7;
}

DEFINE_SPAPR_MACHINE(2_7, "2.7", false);

/*
 * pseries-2.6
 */

static void spapr_machine_2_6_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" },
    };

    spapr_machine_2_7_class_options(mc);
    mc->has_hotpluggable_cpus = false;
    compat_props_add(mc->compat_props, hw_compat_2_6, hw_compat_2_6_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_6, "2.6", false);

/*
 * pseries-2.5
 */

static void spapr_machine_2_5_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { "spapr-vlan", "use-rx-buffer-pools", "off" },
    };

    spapr_machine_2_6_class_options(mc);
    smc->use_ohci_by_default = true;
    compat_props_add(mc->compat_props, hw_compat_2_5, hw_compat_2_5_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_5, "2.5", false);

/*
 * pseries-2.4
 */

static void spapr_machine_2_4_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_5_class_options(mc);
    smc->dr_lmb_enabled = false;
    compat_props_add(mc->compat_props, hw_compat_2_4, hw_compat_2_4_len);
}

DEFINE_SPAPR_MACHINE(2_4, "2.4", false);

/*
 * pseries-2.3
 */

static void spapr_machine_2_3_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { "spapr-pci-host-bridge", "dynamic-reconfiguration", "off" },
    };

    spapr_machine_2_4_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_3, hw_compat_2_3_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_3, "2.3", false);

/*
 * pseries-2.2
 */

static void spapr_machine_2_2_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0x20000000" },
    };

    spapr_machine_2_3_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_2, hw_compat_2_2_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->default_machine_opts = "modern-hotplug-events=off,suppress-vmdesc=on";
}

DEFINE_SPAPR_MACHINE(2_2, "2.2", false);

/*
 * pseries-2.1
 */

static void spapr_machine_2_1_class_options(MachineClass *mc)
{
    spapr_machine_2_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len);
}

DEFINE_SPAPR_MACHINE(2_1, "2.1", false);

static void spapr_machine_register_types(void)
{
    type_register_static(&spapr_machine_info);
}

type_init(spapr_machine_register_types)