1 /* 2 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator 3 * 4 * Copyright (c) 2004-2007 Fabrice Bellard 5 * Copyright (c) 2007 Jocelyn Mayer 6 * Copyright (c) 2010 David Gibson, IBM Corporation. 7 * Copyright (c) 2010-2024, IBM Corporation.. 8 * 9 * SPDX-License-Identifier: GPL-2.0-or-later 10 * 11 * Permission is hereby granted, free of charge, to any person obtaining a copy 12 * of this software and associated documentation files (the "Software"), to deal 13 * in the Software without restriction, including without limitation the rights 14 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 * copies of the Software, and to permit persons to whom the Software is 16 * furnished to do so, subject to the following conditions: 17 * 18 * The above copyright notice and this permission notice shall be included in 19 * all copies or substantial portions of the Software. 20 * 21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 24 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 27 * THE SOFTWARE. 28 */ 29 30 #include "qemu/osdep.h" 31 #include "qemu/datadir.h" 32 #include "qemu/memalign.h" 33 #include "qemu/guest-random.h" 34 #include "qapi/error.h" 35 #include "qapi/qapi-events-machine.h" 36 #include "qapi/qapi-events-qdev.h" 37 #include "qapi/visitor.h" 38 #include "system/system.h" 39 #include "system/hostmem.h" 40 #include "system/numa.h" 41 #include "system/tcg.h" 42 #include "system/qtest.h" 43 #include "system/reset.h" 44 #include "system/runstate.h" 45 #include "qemu/log.h" 46 #include "hw/fw-path-provider.h" 47 #include "elf.h" 48 #include "net/net.h" 49 #include "system/device_tree.h" 50 #include "system/cpus.h" 51 #include "system/hw_accel.h" 52 #include "kvm_ppc.h" 53 #include "migration/misc.h" 54 #include "migration/qemu-file-types.h" 55 #include "migration/global_state.h" 56 #include "migration/register.h" 57 #include "migration/blocker.h" 58 #include "mmu-hash64.h" 59 #include "mmu-book3s-v3.h" 60 #include "cpu-models.h" 61 #include "hw/core/cpu.h" 62 63 #include "hw/ppc/ppc.h" 64 #include "hw/loader.h" 65 66 #include "hw/ppc/fdt.h" 67 #include "hw/ppc/spapr.h" 68 #include "hw/ppc/spapr_nested.h" 69 #include "hw/ppc/spapr_vio.h" 70 #include "hw/ppc/vof.h" 71 #include "hw/qdev-properties.h" 72 #include "hw/pci-host/spapr.h" 73 #include "hw/pci/msi.h" 74 75 #include "hw/pci/pci.h" 76 #include "hw/scsi/scsi.h" 77 #include "hw/virtio/virtio-scsi.h" 78 #include "hw/virtio/vhost-scsi-common.h" 79 80 #include "system/ram_addr.h" 81 #include "system/confidential-guest-support.h" 82 #include "hw/usb.h" 83 #include "qemu/config-file.h" 84 #include "qemu/error-report.h" 85 #include "trace.h" 86 #include "hw/nmi.h" 87 #include "hw/intc/intc.h" 88 89 #include "hw/ppc/spapr_cpu_core.h" 90 #include "hw/mem/memory-device.h" 91 #include "hw/ppc/spapr_tpm_proxy.h" 92 #include "hw/ppc/spapr_nvdimm.h" 93 #include "hw/ppc/spapr_numa.h" 94 95 #include <libfdt.h> 96 97 /* SLOF memory layout: 98 * 99 * SLOF raw image loaded at 0, copies its romfs right below the flat 100 * device-tree, then position SLOF itself 31M below that 101 * 102 * So we set FW_OVERHEAD to 40MB which should 
account for all of that 103 * and more 104 * 105 * We load our kernel at 4M, leaving space for SLOF initial image 106 */ 107 #define FDT_MAX_ADDR 0x80000000 /* FDT must stay below that */ 108 #define FW_MAX_SIZE 0x400000 109 #define FW_FILE_NAME "slof.bin" 110 #define FW_FILE_NAME_VOF "vof.bin" 111 #define FW_OVERHEAD 0x2800000 112 #define KERNEL_LOAD_ADDR FW_MAX_SIZE 113 114 #define MIN_RMA_SLOF (128 * MiB) 115 116 #define PHANDLE_INTC 0x00001111 117 118 /* These two functions implement the VCPU id numbering: one to compute them 119 * all and one to identify thread 0 of a VCORE. Any change to the first one 120 * is likely to have an impact on the second one, so let's keep them close. 121 */ 122 static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index) 123 { 124 MachineState *ms = MACHINE(spapr); 125 unsigned int smp_threads = ms->smp.threads; 126 127 assert(spapr->vsmt); 128 return 129 (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads; 130 } 131 static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr, 132 PowerPCCPU *cpu) 133 { 134 assert(spapr->vsmt); 135 return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0; 136 } 137 138 int spapr_max_server_number(SpaprMachineState *spapr) 139 { 140 MachineState *ms = MACHINE(spapr); 141 142 assert(spapr->vsmt); 143 return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads); 144 } 145 146 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu, 147 int smt_threads) 148 { 149 int i, ret = 0; 150 g_autofree uint32_t *servers_prop = g_new(uint32_t, smt_threads); 151 g_autofree uint32_t *gservers_prop = g_new(uint32_t, smt_threads * 2); 152 int index = spapr_get_vcpu_id(cpu); 153 154 if (cpu->compat_pvr) { 155 ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr); 156 if (ret < 0) { 157 return ret; 158 } 159 } 160 161 /* Build interrupt servers and gservers properties */ 162 for (i = 0; i < smt_threads; i++) { 163 servers_prop[i] = cpu_to_be32(index + i); 164 /* Hack, direct the group queues back to cpu 0 */ 165 gservers_prop[i*2] = cpu_to_be32(index + i); 166 gservers_prop[i*2 + 1] = 0; 167 } 168 ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s", 169 servers_prop, sizeof(*servers_prop) * smt_threads); 170 if (ret < 0) { 171 return ret; 172 } 173 ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s", 174 gservers_prop, sizeof(*gservers_prop) * smt_threads * 2); 175 176 return ret; 177 } 178 179 static void spapr_dt_pa_features(SpaprMachineState *spapr, 180 PowerPCCPU *cpu, 181 void *fdt, int offset) 182 { 183 /* 184 * SSO (SAO) ordering is supported on KVM and thread=single hosts, 185 * but not MTTCG, so disable it. To advertise it, a cap would have 186 * to be added, or support implemented for MTTCG. 187 * 188 * Copy/paste is not supported by TCG, so it is not advertised. KVM 189 * can execute them but it has no accelerator drivers which are usable, 190 * so there isn't much need for it anyway. 
191 */ 192 193 /* These should be kept in sync with pnv */ 194 uint8_t pa_features_206[] = { 6, 0, 195 0xf6, 0x1f, 0xc7, 0x00, 0x00, 0xc0 }; 196 uint8_t pa_features_207[] = { 24, 0, 197 0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0, 198 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 199 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 200 0x80, 0x00, 0x80, 0x00, 0x00, 0x00 }; 201 uint8_t pa_features_300[] = { 66, 0, 202 /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */ 203 /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, 5: LE|CFAR|EB|LSQ */ 204 0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0, /* 0 - 5 */ 205 /* 6: DS207 */ 206 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */ 207 /* 16: Vector */ 208 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */ 209 /* 18: Vec. Scalar, 20: Vec. XOR */ 210 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */ 211 /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */ 212 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */ 213 /* 32: LE atomic, 34: EBB + ext EBB */ 214 0x00, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */ 215 /* 40: Radix MMU */ 216 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 36 - 41 */ 217 /* 42: PM, 44: PC RA, 46: SC vec'd */ 218 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */ 219 /* 48: SIMD, 50: QP BFP, 52: String */ 220 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */ 221 /* 54: DecFP, 56: DecI, 58: SHA */ 222 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */ 223 /* 60: NM atomic, 62: RNG */ 224 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */ 225 }; 226 /* 3.1 removes SAO, HTM support */ 227 uint8_t pa_features_31[] = { 74, 0, 228 /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */ 229 /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, 5: LE|CFAR|EB|LSQ */ 230 0xf6, 0x1f, 0xc7, 0xc0, 0x00, 0xf0, /* 0 - 5 */ 231 /* 6: DS207 */ 232 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */ 233 /* 16: Vector */ 234 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */ 235 /* 18: Vec. Scalar, 20: Vec. XOR */ 236 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */ 237 /* 24: Ext. 
Dec, 26: 64 bit ftrs, 28: PM ftrs */ 238 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */ 239 /* 32: LE atomic, 34: EBB + ext EBB */ 240 0x00, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */ 241 /* 40: Radix MMU */ 242 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 36 - 41 */ 243 /* 42: PM, 44: PC RA, 46: SC vec'd */ 244 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */ 245 /* 48: SIMD, 50: QP BFP, 52: String */ 246 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */ 247 /* 54: DecFP, 56: DecI, 58: SHA */ 248 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */ 249 /* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */ 250 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */ 251 /* 68: DEXCR[SBHE|IBRTPDUS|SRAPD|NPHIE|PHIE] */ 252 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 66 - 71 */ 253 /* 72: [P]HASHST/[P]HASHCHK */ 254 0x80, 0x00, /* 72 - 73 */ 255 }; 256 uint8_t *pa_features = NULL; 257 size_t pa_size; 258 259 if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) { 260 pa_features = pa_features_206; 261 pa_size = sizeof(pa_features_206); 262 } 263 if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) { 264 pa_features = pa_features_207; 265 pa_size = sizeof(pa_features_207); 266 } 267 if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) { 268 pa_features = pa_features_300; 269 pa_size = sizeof(pa_features_300); 270 } 271 if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_10, 0, cpu->compat_pvr)) { 272 pa_features = pa_features_31; 273 pa_size = sizeof(pa_features_31); 274 } 275 if (!pa_features) { 276 return; 277 } 278 279 if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) { 280 /* 281 * Note: we keep CI large pages off by default because a 64K capable 282 * guest provisioned with large pages might otherwise try to map a qemu 283 * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages 284 * even if that qemu runs on a 4k host. 285 * We dd this bit back here if we are confident this is not an issue 286 */ 287 pa_features[3] |= 0x20; 288 } 289 if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) { 290 pa_features[24] |= 0x80; /* Transactional memory support */ 291 } 292 if (spapr->cas_pre_isa3_guest && pa_size > 40) { 293 /* Workaround for broken kernels that attempt (guest) radix 294 * mode when they can't handle it, if they see the radix bit set 295 * in pa-features. So hide it from them. */ 296 pa_features[40 + 2] &= ~0x80; /* Radix MMU */ 297 } 298 if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) { 299 g_assert(pa_size > 66); 300 pa_features[66] |= 0x80; 301 } 302 303 _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size))); 304 } 305 306 static void spapr_dt_pi_features(SpaprMachineState *spapr, 307 PowerPCCPU *cpu, 308 void *fdt, int offset) 309 { 310 uint8_t pi_features[] = { 1, 0, 311 0x00 }; 312 313 if (kvm_enabled() && ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 314 0, cpu->compat_pvr)) { 315 /* 316 * POWER9 and later CPUs with KVM run in LPAR-per-thread mode where 317 * all threads are essentially independent CPUs, and msgsndp does not 318 * work (because it is physically-addressed) and therefore is 319 * emulated by KVM, so disable it here to ensure XIVE will be used. 320 * This is both KVM and CPU implementation-specific behaviour so a KVM 321 * cap would be cleanest, but for now this works. If KVM ever permits 322 * native msgsndp execution by guests, a cap could be added at that 323 * time. 
324 */ 325 pi_features[2] |= 0x08; /* 4: No msgsndp */ 326 } 327 328 _FDT((fdt_setprop(fdt, offset, "ibm,pi-features", pi_features, 329 sizeof(pi_features)))); 330 } 331 332 static hwaddr spapr_node0_size(MachineState *machine) 333 { 334 if (machine->numa_state->num_nodes) { 335 int i; 336 for (i = 0; i < machine->numa_state->num_nodes; ++i) { 337 if (machine->numa_state->nodes[i].node_mem) { 338 return MIN(pow2floor(machine->numa_state->nodes[i].node_mem), 339 machine->ram_size); 340 } 341 } 342 } 343 return machine->ram_size; 344 } 345 346 static void add_str(GString *s, const gchar *s1) 347 { 348 g_string_append_len(s, s1, strlen(s1) + 1); 349 } 350 351 static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid, 352 hwaddr start, hwaddr size) 353 { 354 char mem_name[32]; 355 uint64_t mem_reg_property[2]; 356 int off; 357 358 mem_reg_property[0] = cpu_to_be64(start); 359 mem_reg_property[1] = cpu_to_be64(size); 360 361 sprintf(mem_name, "memory@%" HWADDR_PRIx, start); 362 off = fdt_add_subnode(fdt, 0, mem_name); 363 _FDT(off); 364 _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); 365 _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property, 366 sizeof(mem_reg_property)))); 367 spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid); 368 return off; 369 } 370 371 static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr) 372 { 373 MemoryDeviceInfoList *info; 374 375 for (info = list; info; info = info->next) { 376 MemoryDeviceInfo *value = info->value; 377 378 if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) { 379 PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data; 380 381 if (addr >= pcdimm_info->addr && 382 addr < (pcdimm_info->addr + pcdimm_info->size)) { 383 return pcdimm_info->node; 384 } 385 } 386 } 387 388 return -1; 389 } 390 391 struct sPAPRDrconfCellV2 { 392 uint32_t seq_lmbs; 393 uint64_t base_addr; 394 uint32_t drc_index; 395 uint32_t aa_index; 396 uint32_t flags; 397 } QEMU_PACKED; 398 399 typedef struct DrconfCellQueue { 400 struct sPAPRDrconfCellV2 cell; 401 QSIMPLEQ_ENTRY(DrconfCellQueue) entry; 402 } DrconfCellQueue; 403 404 static DrconfCellQueue * 405 spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr, 406 uint32_t drc_index, uint32_t aa_index, 407 uint32_t flags) 408 { 409 DrconfCellQueue *elem; 410 411 elem = g_malloc0(sizeof(*elem)); 412 elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs); 413 elem->cell.base_addr = cpu_to_be64(base_addr); 414 elem->cell.drc_index = cpu_to_be32(drc_index); 415 elem->cell.aa_index = cpu_to_be32(aa_index); 416 elem->cell.flags = cpu_to_be32(flags); 417 418 return elem; 419 } 420 421 static int spapr_dt_dynamic_memory_v2(SpaprMachineState *spapr, void *fdt, 422 int offset, MemoryDeviceInfoList *dimms) 423 { 424 MachineState *machine = MACHINE(spapr); 425 uint8_t *int_buf, *cur_index; 426 int ret; 427 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE; 428 uint64_t addr, cur_addr, size; 429 uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size); 430 uint64_t mem_end = machine->device_memory->base + 431 memory_region_size(&machine->device_memory->mr); 432 uint32_t node, buf_len, nr_entries = 0; 433 SpaprDrc *drc; 434 DrconfCellQueue *elem, *next; 435 MemoryDeviceInfoList *info; 436 QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue 437 = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue); 438 439 /* Entry to cover RAM and the gap area */ 440 elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1, 441 SPAPR_LMB_FLAGS_RESERVED | 442 SPAPR_LMB_FLAGS_DRC_INVALID); 443 
QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry); 444 nr_entries++; 445 446 cur_addr = machine->device_memory->base; 447 for (info = dimms; info; info = info->next) { 448 PCDIMMDeviceInfo *di = info->value->u.dimm.data; 449 450 addr = di->addr; 451 size = di->size; 452 node = di->node; 453 454 /* 455 * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The 456 * area is marked hotpluggable in the next iteration for the bigger 457 * chunk including the NVDIMM occupied area. 458 */ 459 if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) 460 continue; 461 462 /* Entry for hot-pluggable area */ 463 if (cur_addr < addr) { 464 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size); 465 g_assert(drc); 466 elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size, 467 cur_addr, spapr_drc_index(drc), -1, 0); 468 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry); 469 nr_entries++; 470 } 471 472 /* Entry for DIMM */ 473 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size); 474 g_assert(drc); 475 elem = spapr_get_drconf_cell(size / lmb_size, addr, 476 spapr_drc_index(drc), node, 477 (SPAPR_LMB_FLAGS_ASSIGNED | 478 SPAPR_LMB_FLAGS_HOTREMOVABLE)); 479 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry); 480 nr_entries++; 481 cur_addr = addr + size; 482 } 483 484 /* Entry for remaining hotpluggable area */ 485 if (cur_addr < mem_end) { 486 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size); 487 g_assert(drc); 488 elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size, 489 cur_addr, spapr_drc_index(drc), -1, 0); 490 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry); 491 nr_entries++; 492 } 493 494 buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t); 495 int_buf = cur_index = g_malloc0(buf_len); 496 *(uint32_t *)int_buf = cpu_to_be32(nr_entries); 497 cur_index += sizeof(nr_entries); 498 499 QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) { 500 memcpy(cur_index, &elem->cell, sizeof(elem->cell)); 501 cur_index += sizeof(elem->cell); 502 QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry); 503 g_free(elem); 504 } 505 506 ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len); 507 g_free(int_buf); 508 if (ret < 0) { 509 return -1; 510 } 511 return 0; 512 } 513 514 static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt, 515 int offset, MemoryDeviceInfoList *dimms) 516 { 517 MachineState *machine = MACHINE(spapr); 518 int i, ret; 519 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE; 520 uint32_t device_lmb_start = machine->device_memory->base / lmb_size; 521 uint32_t nr_lmbs = (machine->device_memory->base + 522 memory_region_size(&machine->device_memory->mr)) / 523 lmb_size; 524 uint32_t *int_buf, *cur_index, buf_len; 525 526 /* 527 * Allocate enough buffer size to fit in ibm,dynamic-memory 528 */ 529 buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t); 530 cur_index = int_buf = g_malloc0(buf_len); 531 int_buf[0] = cpu_to_be32(nr_lmbs); 532 cur_index++; 533 for (i = 0; i < nr_lmbs; i++) { 534 uint64_t addr = i * lmb_size; 535 uint32_t *dynamic_memory = cur_index; 536 537 if (i >= device_lmb_start) { 538 SpaprDrc *drc; 539 540 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i); 541 g_assert(drc); 542 543 dynamic_memory[0] = cpu_to_be32(addr >> 32); 544 dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff); 545 dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc)); 546 dynamic_memory[3] = cpu_to_be32(0); /* reserved */ 547 dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr)); 
548 if (memory_region_present(get_system_memory(), addr)) { 549 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED); 550 } else { 551 dynamic_memory[5] = cpu_to_be32(0); 552 } 553 } else { 554 /* 555 * LMB information for RMA, boot time RAM and gap b/n RAM and 556 * device memory region -- all these are marked as reserved 557 * and as having no valid DRC. 558 */ 559 dynamic_memory[0] = cpu_to_be32(addr >> 32); 560 dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff); 561 dynamic_memory[2] = cpu_to_be32(0); 562 dynamic_memory[3] = cpu_to_be32(0); /* reserved */ 563 dynamic_memory[4] = cpu_to_be32(-1); 564 dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED | 565 SPAPR_LMB_FLAGS_DRC_INVALID); 566 } 567 568 cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE; 569 } 570 ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len); 571 g_free(int_buf); 572 if (ret < 0) { 573 return -1; 574 } 575 return 0; 576 } 577 578 /* 579 * Adds ibm,dynamic-reconfiguration-memory node. 580 * Refer to docs/specs/ppc-spapr-hotplug.rst for the documentation 581 * of this device tree node. 582 */ 583 static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr, 584 void *fdt) 585 { 586 MachineState *machine = MACHINE(spapr); 587 int ret, offset; 588 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE; 589 uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32), 590 cpu_to_be32(lmb_size & 0xffffffff)}; 591 MemoryDeviceInfoList *dimms = NULL; 592 593 /* Don't create the node if there is no device memory. */ 594 if (!machine->device_memory) { 595 return 0; 596 } 597 598 offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory"); 599 600 ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size, 601 sizeof(prop_lmb_size)); 602 if (ret < 0) { 603 return ret; 604 } 605 606 ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff); 607 if (ret < 0) { 608 return ret; 609 } 610 611 ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0); 612 if (ret < 0) { 613 return ret; 614 } 615 616 /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */ 617 dimms = qmp_memory_device_list(); 618 if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) { 619 ret = spapr_dt_dynamic_memory_v2(spapr, fdt, offset, dimms); 620 } else { 621 ret = spapr_dt_dynamic_memory(spapr, fdt, offset, dimms); 622 } 623 qapi_free_MemoryDeviceInfoList(dimms); 624 625 if (ret < 0) { 626 return ret; 627 } 628 629 ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset); 630 631 return ret; 632 } 633 634 static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt) 635 { 636 MachineState *machine = MACHINE(spapr); 637 hwaddr mem_start, node_size; 638 int i, nb_nodes = machine->numa_state->num_nodes; 639 NodeInfo *nodes = machine->numa_state->nodes; 640 641 for (i = 0, mem_start = 0; i < nb_nodes; ++i) { 642 if (!nodes[i].node_mem) { 643 continue; 644 } 645 if (mem_start >= machine->ram_size) { 646 node_size = 0; 647 } else { 648 node_size = nodes[i].node_mem; 649 if (node_size > machine->ram_size - mem_start) { 650 node_size = machine->ram_size - mem_start; 651 } 652 } 653 if (!mem_start) { 654 /* spapr_machine_init() checks for rma_size <= node0_size 655 * already */ 656 spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size); 657 mem_start += spapr->rma_size; 658 node_size -= spapr->rma_size; 659 } 660 for ( ; node_size; ) { 661 hwaddr sizetmp = pow2floor(node_size); 662 663 /* mem_start != 0 here */ 664 if (ctzl(mem_start) < ctzl(sizetmp)) { 665 sizetmp = 1ULL << ctzl(mem_start); 666 } 667 668 
spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp); 669 node_size -= sizetmp; 670 mem_start += sizetmp; 671 } 672 } 673 674 /* Generate ibm,dynamic-reconfiguration-memory node if required */ 675 if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) { 676 int ret; 677 678 ret = spapr_dt_dynamic_reconfiguration_memory(spapr, fdt); 679 if (ret) { 680 return ret; 681 } 682 } 683 684 return 0; 685 } 686 687 static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset, 688 SpaprMachineState *spapr) 689 { 690 MachineState *ms = MACHINE(spapr); 691 PowerPCCPU *cpu = POWERPC_CPU(cs); 692 CPUPPCState *env = &cpu->env; 693 PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs); 694 int index = spapr_get_vcpu_id(cpu); 695 uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40), 696 0xffffffff, 0xffffffff}; 697 uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() 698 : SPAPR_TIMEBASE_FREQ; 699 uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000; 700 uint32_t page_sizes_prop[64]; 701 size_t page_sizes_prop_size; 702 unsigned int smp_threads = ms->smp.threads; 703 uint32_t vcpus_per_socket = smp_threads * ms->smp.cores; 704 uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)}; 705 int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu)); 706 SpaprDrc *drc; 707 int drc_index; 708 uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ]; 709 int i; 710 711 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, env->core_index); 712 if (drc) { 713 drc_index = spapr_drc_index(drc); 714 _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index))); 715 } 716 717 _FDT((fdt_setprop_cell(fdt, offset, "reg", index))); 718 _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu"))); 719 720 _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR]))); 721 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size", 722 env->dcache_line_size))); 723 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size", 724 env->dcache_line_size))); 725 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size", 726 env->icache_line_size))); 727 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size", 728 env->icache_line_size))); 729 730 if (pcc->l1_dcache_size) { 731 _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size", 732 pcc->l1_dcache_size))); 733 } else { 734 warn_report("Unknown L1 dcache size for cpu"); 735 } 736 if (pcc->l1_icache_size) { 737 _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size", 738 pcc->l1_icache_size))); 739 } else { 740 warn_report("Unknown L1 icache size for cpu"); 741 } 742 743 _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq))); 744 _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq))); 745 _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size))); 746 _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size))); 747 _FDT((fdt_setprop_string(fdt, offset, "status", "okay"))); 748 _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0))); 749 750 if (ppc_has_spr(cpu, SPR_PURR)) { 751 _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1))); 752 } 753 if (ppc_has_spr(cpu, SPR_PURR)) { 754 _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1))); 755 } 756 757 if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) { 758 _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes", 759 segs, sizeof(segs)))); 760 } 761 762 /* Advertise VSX (vector extensions) if available 763 * 1 == VMX / Altivec available 764 * 2 == VSX available 765 * 766 * Only CPUs for which we create core types in spapr_cpu_core.c 767 * are possible, and all of 
those have VMX */ 768 if (env->insns_flags & PPC_ALTIVEC) { 769 if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) { 770 _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2))); 771 } else { 772 _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1))); 773 } 774 } 775 776 /* Advertise DFP (Decimal Floating Point) if available 777 * 0 / no property == no DFP 778 * 1 == DFP available */ 779 if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) { 780 _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1))); 781 } 782 783 page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop, 784 sizeof(page_sizes_prop)); 785 if (page_sizes_prop_size) { 786 _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes", 787 page_sizes_prop, page_sizes_prop_size))); 788 } 789 790 spapr_dt_pa_features(spapr, cpu, fdt, offset); 791 792 spapr_dt_pi_features(spapr, cpu, fdt, offset); 793 794 _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id", 795 cs->cpu_index / vcpus_per_socket))); 796 797 _FDT((fdt_setprop(fdt, offset, "ibm,pft-size", 798 pft_size_prop, sizeof(pft_size_prop)))); 799 800 if (ms->numa_state->num_nodes > 1) { 801 _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu)); 802 } 803 804 _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt)); 805 806 if (pcc->radix_page_info) { 807 for (i = 0; i < pcc->radix_page_info->count; i++) { 808 radix_AP_encodings[i] = 809 cpu_to_be32(pcc->radix_page_info->entries[i]); 810 } 811 _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings", 812 radix_AP_encodings, 813 pcc->radix_page_info->count * 814 sizeof(radix_AP_encodings[0])))); 815 } 816 817 /* 818 * We set this property to let the guest know that it can use the large 819 * decrementer and its width in bits. 820 */ 821 if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF) 822 _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits", 823 pcc->lrg_decr_bits))); 824 } 825 826 static void spapr_dt_one_cpu(void *fdt, SpaprMachineState *spapr, CPUState *cs, 827 int cpus_offset) 828 { 829 PowerPCCPU *cpu = POWERPC_CPU(cs); 830 int index = spapr_get_vcpu_id(cpu); 831 DeviceClass *dc = DEVICE_GET_CLASS(cs); 832 g_autofree char *nodename = NULL; 833 int offset; 834 835 if (!spapr_is_thread0_in_vcore(spapr, cpu)) { 836 return; 837 } 838 839 nodename = g_strdup_printf("%s@%x", dc->fw_name, index); 840 offset = fdt_add_subnode(fdt, cpus_offset, nodename); 841 _FDT(offset); 842 spapr_dt_cpu(cs, fdt, offset, spapr); 843 } 844 845 846 static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr) 847 { 848 CPUState **rev; 849 CPUState *cs; 850 int n_cpus; 851 int cpus_offset; 852 int i; 853 854 cpus_offset = fdt_add_subnode(fdt, 0, "cpus"); 855 _FDT(cpus_offset); 856 _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1))); 857 _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0))); 858 859 /* 860 * We walk the CPUs in reverse order to ensure that CPU DT nodes 861 * created by fdt_add_subnode() end up in the right order in FDT 862 * for the guest kernel the enumerate the CPUs correctly. 863 * 864 * The CPU list cannot be traversed in reverse order, so we need 865 * to do extra work. 
866 */ 867 n_cpus = 0; 868 rev = NULL; 869 CPU_FOREACH(cs) { 870 rev = g_renew(CPUState *, rev, n_cpus + 1); 871 rev[n_cpus++] = cs; 872 } 873 874 for (i = n_cpus - 1; i >= 0; i--) { 875 spapr_dt_one_cpu(fdt, spapr, rev[i], cpus_offset); 876 } 877 878 g_free(rev); 879 } 880 881 static int spapr_dt_rng(void *fdt) 882 { 883 int node; 884 int ret; 885 886 node = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities"); 887 if (node <= 0) { 888 return -1; 889 } 890 ret = fdt_setprop_string(fdt, node, "device_type", 891 "ibm,platform-facilities"); 892 ret |= fdt_setprop_cell(fdt, node, "#address-cells", 0x1); 893 ret |= fdt_setprop_cell(fdt, node, "#size-cells", 0x0); 894 895 node = fdt_add_subnode(fdt, node, "ibm,random-v1"); 896 if (node <= 0) { 897 return -1; 898 } 899 ret |= fdt_setprop_string(fdt, node, "compatible", "ibm,random"); 900 901 return ret ? -1 : 0; 902 } 903 904 static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) 905 { 906 MachineState *ms = MACHINE(spapr); 907 int rtas; 908 GString *hypertas = g_string_sized_new(256); 909 GString *qemu_hypertas = g_string_sized_new(256); 910 uint64_t max_device_addr = 0; 911 uint32_t lrdr_capacity[] = { 912 0, 913 0, 914 cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE >> 32), 915 cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff), 916 cpu_to_be32(ms->smp.max_cpus / ms->smp.threads), 917 }; 918 919 /* Do we have device memory? */ 920 if (MACHINE(spapr)->device_memory) { 921 max_device_addr = MACHINE(spapr)->device_memory->base + 922 memory_region_size(&MACHINE(spapr)->device_memory->mr); 923 } else if (ms->ram_size == ms->maxram_size) { 924 max_device_addr = ms->ram_size; 925 } 926 927 lrdr_capacity[0] = cpu_to_be32(max_device_addr >> 32); 928 lrdr_capacity[1] = cpu_to_be32(max_device_addr & 0xffffffff); 929 930 _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas")); 931 932 /* hypertas */ 933 add_str(hypertas, "hcall-pft"); 934 add_str(hypertas, "hcall-term"); 935 add_str(hypertas, "hcall-dabr"); 936 add_str(hypertas, "hcall-interrupt"); 937 add_str(hypertas, "hcall-tce"); 938 add_str(hypertas, "hcall-vio"); 939 add_str(hypertas, "hcall-splpar"); 940 add_str(hypertas, "hcall-join"); 941 add_str(hypertas, "hcall-bulk"); 942 add_str(hypertas, "hcall-set-mode"); 943 add_str(hypertas, "hcall-sprg0"); 944 add_str(hypertas, "hcall-copy"); 945 add_str(hypertas, "hcall-debug"); 946 add_str(hypertas, "hcall-vphn"); 947 if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) { 948 add_str(hypertas, "hcall-rpt-invalidate"); 949 } 950 951 add_str(qemu_hypertas, "hcall-memop1"); 952 953 if (!kvm_enabled() || kvmppc_spapr_use_multitce()) { 954 add_str(hypertas, "hcall-multi-tce"); 955 } 956 957 if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) { 958 add_str(hypertas, "hcall-hpt-resize"); 959 } 960 961 add_str(hypertas, "hcall-watchdog"); 962 963 _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions", 964 hypertas->str, hypertas->len)); 965 g_string_free(hypertas, TRUE); 966 _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions", 967 qemu_hypertas->str, qemu_hypertas->len)); 968 g_string_free(qemu_hypertas, TRUE); 969 970 spapr_numa_write_rtas_dt(spapr, fdt, rtas); 971 972 /* 973 * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log, 974 * and 16 bytes per CPU for system reset error log plus an extra 8 bytes. 
975 * 976 * The system reset requirements are driven by existing Linux and PowerVM 977 * implementation which (contrary to PAPR) saves r3 in the error log 978 * structure like machine check, so Linux expects to find the saved r3 979 * value at the address in r3 upon FWNMI-enabled sreset interrupt (and 980 * does not look at the error value). 981 * 982 * System reset interrupts are not subject to interlock like machine 983 * check, so this memory area could be corrupted if the sreset is 984 * interrupted by a machine check (or vice versa) if it was shared. To 985 * prevent this, system reset uses per-CPU areas for the sreset save 986 * area. A system reset that interrupts a system reset handler could 987 * still overwrite this area, but Linux doesn't try to recover in that 988 * case anyway. 989 * 990 * The extra 8 bytes is required because Linux's FWNMI error log check 991 * is off-by-one. 992 * 993 * RTAS_MIN_SIZE is required for the RTAS blob itself. 994 */ 995 _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE + 996 RTAS_ERROR_LOG_MAX + 997 ms->smp.max_cpus * sizeof(uint64_t) * 2 + 998 sizeof(uint64_t))); 999 _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max", 1000 RTAS_ERROR_LOG_MAX)); 1001 _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate", 1002 RTAS_EVENT_SCAN_RATE)); 1003 1004 g_assert(msi_nonbroken); 1005 _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0)); 1006 1007 /* 1008 * According to PAPR, rtas ibm,os-term does not guarantee a return 1009 * back to the guest cpu. 1010 * 1011 * While an additional ibm,extended-os-term property indicates 1012 * that rtas call return will always occur. Set this property. 1013 */ 1014 _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0)); 1015 1016 _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity", 1017 lrdr_capacity, sizeof(lrdr_capacity))); 1018 1019 spapr_dt_rtas_tokens(fdt, rtas); 1020 } 1021 1022 /* 1023 * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU 1024 * and the XIVE features that the guest may request and thus the valid 1025 * values for bytes 23..26 of option vector 5: 1026 */ 1027 static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt, 1028 int chosen) 1029 { 1030 PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu); 1031 1032 char val[2 * 4] = { 1033 23, 0x00, /* XICS / XIVE mode */ 1034 24, 0x00, /* Hash/Radix, filled in below. */ 1035 25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */ 1036 26, 0x40, /* Radix options: GTSE == yes. 
*/ 1037 }; 1038 1039 if (spapr->irq->xics && spapr->irq->xive) { 1040 val[1] = SPAPR_OV5_XIVE_BOTH; 1041 } else if (spapr->irq->xive) { 1042 val[1] = SPAPR_OV5_XIVE_EXPLOIT; 1043 } else { 1044 assert(spapr->irq->xics); 1045 val[1] = SPAPR_OV5_XIVE_LEGACY; 1046 } 1047 1048 if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0, 1049 first_ppc_cpu->compat_pvr)) { 1050 /* 1051 * If we're in a pre POWER9 compat mode then the guest should 1052 * do hash and use the legacy interrupt mode 1053 */ 1054 val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */ 1055 val[3] = 0x00; /* Hash */ 1056 spapr_check_mmu_mode(false); 1057 } else if (kvm_enabled()) { 1058 if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) { 1059 val[3] = 0x80; /* OV5_MMU_BOTH */ 1060 } else if (kvmppc_has_cap_mmu_radix()) { 1061 val[3] = 0x40; /* OV5_MMU_RADIX_300 */ 1062 } else { 1063 val[3] = 0x00; /* Hash */ 1064 } 1065 } else { 1066 /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */ 1067 val[3] = 0xC0; 1068 } 1069 _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support", 1070 val, sizeof(val))); 1071 } 1072 1073 static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset) 1074 { 1075 MachineState *machine = MACHINE(spapr); 1076 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); 1077 int chosen; 1078 1079 _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen")); 1080 1081 if (reset) { 1082 const char *boot_device = spapr->boot_device; 1083 g_autofree char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus); 1084 size_t cb = 0; 1085 g_autofree char *bootlist = get_boot_devices_list(&cb); 1086 1087 if (machine->kernel_cmdline && machine->kernel_cmdline[0]) { 1088 _FDT(fdt_setprop_string(fdt, chosen, "bootargs", 1089 machine->kernel_cmdline)); 1090 } 1091 1092 if (spapr->initrd_size) { 1093 _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start", 1094 spapr->initrd_base)); 1095 _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end", 1096 spapr->initrd_base + spapr->initrd_size)); 1097 } 1098 1099 if (spapr->kernel_size) { 1100 uint64_t kprop[2] = { cpu_to_be64(spapr->kernel_addr), 1101 cpu_to_be64(spapr->kernel_size) }; 1102 1103 _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel", 1104 &kprop, sizeof(kprop))); 1105 if (spapr->kernel_le) { 1106 _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0)); 1107 } 1108 } 1109 if (machine->boot_config.has_menu && machine->boot_config.menu) { 1110 _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", true))); 1111 } 1112 _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width)); 1113 _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height)); 1114 _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth)); 1115 1116 if (cb && bootlist) { 1117 int i; 1118 1119 for (i = 0; i < cb; i++) { 1120 if (bootlist[i] == '\n') { 1121 bootlist[i] = ' '; 1122 } 1123 } 1124 _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist)); 1125 } 1126 1127 if (boot_device && strlen(boot_device)) { 1128 _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device)); 1129 } 1130 1131 if (spapr->want_stdout_path && stdout_path) { 1132 /* 1133 * "linux,stdout-path" and "stdout" properties are 1134 * deprecated by linux kernel. New platforms should only 1135 * use the "stdout-path" property. Set the new property 1136 * and continue using older property to remain compatible 1137 * with the existing firmware. 
1138 */ 1139 _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path)); 1140 _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path)); 1141 } 1142 1143 /* 1144 * We can deal with BAR reallocation just fine, advertise it 1145 * to the guest 1146 */ 1147 if (smc->linux_pci_probe) { 1148 _FDT(fdt_setprop_cell(fdt, chosen, "linux,pci-probe-only", 0)); 1149 } 1150 1151 spapr_dt_ov5_platform_support(spapr, fdt, chosen); 1152 } 1153 1154 _FDT(fdt_setprop(fdt, chosen, "rng-seed", spapr->fdt_rng_seed, 32)); 1155 1156 _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5")); 1157 } 1158 1159 static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt) 1160 { 1161 /* The /hypervisor node isn't in PAPR - this is a hack to allow PR 1162 * KVM to work under pHyp with some guest co-operation */ 1163 int hypervisor; 1164 uint8_t hypercall[16]; 1165 1166 _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor")); 1167 /* indicate KVM hypercall interface */ 1168 _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm")); 1169 if (kvmppc_has_cap_fixup_hcalls()) { 1170 /* 1171 * Older KVM versions with older guest kernels were broken 1172 * with the magic page, don't allow the guest to map it. 1173 */ 1174 if (!kvmppc_get_hypercall(cpu_env(first_cpu), hypercall, 1175 sizeof(hypercall))) { 1176 _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions", 1177 hypercall, sizeof(hypercall))); 1178 } 1179 } 1180 } 1181 1182 void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space) 1183 { 1184 MachineState *machine = MACHINE(spapr); 1185 MachineClass *mc = MACHINE_GET_CLASS(machine); 1186 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); 1187 uint32_t root_drc_type_mask = 0; 1188 int ret; 1189 void *fdt; 1190 SpaprPhbState *phb; 1191 char *buf; 1192 1193 fdt = g_malloc0(space); 1194 _FDT((fdt_create_empty_tree(fdt, space))); 1195 1196 /* Root node */ 1197 _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp")); 1198 _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)")); 1199 _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries")); 1200 1201 /* Guest UUID & Name*/ 1202 buf = qemu_uuid_unparse_strdup(&qemu_uuid); 1203 _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf)); 1204 if (qemu_uuid_set) { 1205 _FDT(fdt_setprop_string(fdt, 0, "system-id", buf)); 1206 } 1207 g_free(buf); 1208 1209 if (qemu_get_vm_name()) { 1210 _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name", 1211 qemu_get_vm_name())); 1212 } 1213 1214 /* Host Model & Serial Number */ 1215 if (spapr->host_model) { 1216 _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model)); 1217 } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) { 1218 _FDT(fdt_setprop_string(fdt, 0, "host-model", buf)); 1219 g_free(buf); 1220 } 1221 1222 if (spapr->host_serial) { 1223 _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial)); 1224 } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) { 1225 _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf)); 1226 g_free(buf); 1227 } 1228 1229 _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2)); 1230 _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2)); 1231 1232 /* /interrupt controller */ 1233 spapr_irq_dt(spapr, spapr_max_server_number(spapr), fdt, PHANDLE_INTC); 1234 1235 ret = spapr_dt_memory(spapr, fdt); 1236 if (ret < 0) { 1237 error_report("couldn't setup memory nodes in fdt"); 1238 exit(1); 1239 } 1240 1241 /* /vdevice */ 1242 spapr_dt_vdevice(spapr->vio_bus, fdt); 1243 
1244 if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) { 1245 ret = spapr_dt_rng(fdt); 1246 if (ret < 0) { 1247 error_report("could not set up rng device in the fdt"); 1248 exit(1); 1249 } 1250 } 1251 1252 QLIST_FOREACH(phb, &spapr->phbs, list) { 1253 ret = spapr_dt_phb(spapr, phb, PHANDLE_INTC, fdt, NULL); 1254 if (ret < 0) { 1255 error_report("couldn't setup PCI devices in fdt"); 1256 exit(1); 1257 } 1258 } 1259 1260 spapr_dt_cpus(fdt, spapr); 1261 1262 /* ibm,drc-indexes and friends */ 1263 root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_LMB; 1264 if (smc->dr_phb_enabled) { 1265 root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PHB; 1266 } 1267 if (mc->nvdimm_supported) { 1268 root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PMEM; 1269 } 1270 if (root_drc_type_mask) { 1271 _FDT(spapr_dt_drc(fdt, 0, NULL, root_drc_type_mask)); 1272 } 1273 1274 if (mc->has_hotpluggable_cpus) { 1275 int offset = fdt_path_offset(fdt, "/cpus"); 1276 ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU); 1277 if (ret < 0) { 1278 error_report("Couldn't set up CPU DR device tree properties"); 1279 exit(1); 1280 } 1281 } 1282 1283 /* /event-sources */ 1284 spapr_dt_events(spapr, fdt); 1285 1286 /* /rtas */ 1287 spapr_dt_rtas(spapr, fdt); 1288 1289 /* /chosen */ 1290 spapr_dt_chosen(spapr, fdt, reset); 1291 1292 /* /hypervisor */ 1293 if (kvm_enabled()) { 1294 spapr_dt_hypervisor(spapr, fdt); 1295 } 1296 1297 /* Build memory reserve map */ 1298 if (reset) { 1299 if (spapr->kernel_size) { 1300 _FDT((fdt_add_mem_rsv(fdt, spapr->kernel_addr, 1301 spapr->kernel_size))); 1302 } 1303 if (spapr->initrd_size) { 1304 _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base, 1305 spapr->initrd_size))); 1306 } 1307 } 1308 1309 /* NVDIMM devices */ 1310 if (mc->nvdimm_supported) { 1311 spapr_dt_persistent_memory(spapr, fdt); 1312 } 1313 1314 return fdt; 1315 } 1316 1317 static uint64_t translate_kernel_address(void *opaque, uint64_t addr) 1318 { 1319 SpaprMachineState *spapr = opaque; 1320 1321 return (addr & 0x0fffffff) + spapr->kernel_addr; 1322 } 1323 1324 static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp, 1325 PowerPCCPU *cpu) 1326 { 1327 CPUPPCState *env = &cpu->env; 1328 1329 /* The TCG path should also be holding the BQL at this point */ 1330 g_assert(bql_locked()); 1331 1332 g_assert(!vhyp_cpu_in_nested(cpu)); 1333 1334 if (FIELD_EX64(env->msr, MSR, PR)) { 1335 hcall_dprintf("Hypercall made with MSR[PR]=1\n"); 1336 env->gpr[3] = H_PRIVILEGE; 1337 } else { 1338 env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]); 1339 } 1340 } 1341 1342 struct LPCRSyncState { 1343 target_ulong value; 1344 target_ulong mask; 1345 }; 1346 1347 static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg) 1348 { 1349 struct LPCRSyncState *s = arg.host_ptr; 1350 PowerPCCPU *cpu = POWERPC_CPU(cs); 1351 CPUPPCState *env = &cpu->env; 1352 target_ulong lpcr; 1353 1354 cpu_synchronize_state(cs); 1355 lpcr = env->spr[SPR_LPCR]; 1356 lpcr &= ~s->mask; 1357 lpcr |= s->value; 1358 ppc_store_lpcr(cpu, lpcr); 1359 } 1360 1361 void spapr_set_all_lpcrs(target_ulong value, target_ulong mask) 1362 { 1363 CPUState *cs; 1364 struct LPCRSyncState s = { 1365 .value = value, 1366 .mask = mask 1367 }; 1368 CPU_FOREACH(cs) { 1369 run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s)); 1370 } 1371 } 1372 1373 /* May be used when the machine is not running */ 1374 void spapr_init_all_lpcrs(target_ulong value, target_ulong mask) 1375 { 1376 CPUState *cs; 1377 CPU_FOREACH(cs) { 1378 PowerPCCPU *cpu = POWERPC_CPU(cs); 1379 CPUPPCState *env = 
&cpu->env; 1380 target_ulong lpcr; 1381 1382 lpcr = env->spr[SPR_LPCR]; 1383 lpcr &= ~(LPCR_HR | LPCR_UPRT); 1384 ppc_store_lpcr(cpu, lpcr); 1385 } 1386 } 1387 1388 static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu, 1389 target_ulong lpid, ppc_v3_pate_t *entry) 1390 { 1391 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); 1392 SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); 1393 1394 if (!spapr_cpu->in_nested) { 1395 assert(lpid == 0); 1396 1397 /* Copy PATE1:GR into PATE0:HR */ 1398 entry->dw0 = spapr->patb_entry & PATE0_HR; 1399 entry->dw1 = spapr->patb_entry; 1400 return true; 1401 } else { 1402 if (spapr_nested_api(spapr) == NESTED_API_KVM_HV) { 1403 return spapr_get_pate_nested_hv(spapr, cpu, lpid, entry); 1404 } else if (spapr_nested_api(spapr) == NESTED_API_PAPR) { 1405 return spapr_get_pate_nested_papr(spapr, cpu, lpid, entry); 1406 } else { 1407 g_assert_not_reached(); 1408 } 1409 } 1410 } 1411 1412 static uint64_t *hpte_get_ptr(SpaprMachineState *s, unsigned index) 1413 { 1414 uint64_t *table = s->htab; 1415 1416 return &table[2 * index]; 1417 } 1418 1419 static bool hpte_is_valid(SpaprMachineState *s, unsigned index) 1420 { 1421 return ldq_be_p(hpte_get_ptr(s, index)) & HPTE64_V_VALID; 1422 } 1423 1424 static bool hpte_is_dirty(SpaprMachineState *s, unsigned index) 1425 { 1426 return ldq_be_p(hpte_get_ptr(s, index)) & HPTE64_V_HPTE_DIRTY; 1427 } 1428 1429 static void hpte_set_clean(SpaprMachineState *s, unsigned index) 1430 { 1431 stq_be_p(hpte_get_ptr(s, index), 1432 ldq_be_p(hpte_get_ptr(s, index)) & ~HPTE64_V_HPTE_DIRTY); 1433 } 1434 1435 static void hpte_set_dirty(SpaprMachineState *s, unsigned index) 1436 { 1437 stq_be_p(hpte_get_ptr(s, index), 1438 ldq_be_p(hpte_get_ptr(s, index)) | HPTE64_V_HPTE_DIRTY); 1439 } 1440 1441 /* 1442 * Get the fd to access the kernel htab, re-opening it if necessary 1443 */ 1444 static int get_htab_fd(SpaprMachineState *spapr) 1445 { 1446 Error *local_err = NULL; 1447 1448 if (spapr->htab_fd >= 0) { 1449 return spapr->htab_fd; 1450 } 1451 1452 spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err); 1453 if (spapr->htab_fd < 0) { 1454 error_report_err(local_err); 1455 } 1456 1457 return spapr->htab_fd; 1458 } 1459 1460 void close_htab_fd(SpaprMachineState *spapr) 1461 { 1462 if (spapr->htab_fd >= 0) { 1463 close(spapr->htab_fd); 1464 } 1465 spapr->htab_fd = -1; 1466 } 1467 1468 static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp) 1469 { 1470 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); 1471 1472 return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1; 1473 } 1474 1475 static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp) 1476 { 1477 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); 1478 1479 assert(kvm_enabled()); 1480 1481 if (!spapr->htab) { 1482 return 0; 1483 } 1484 1485 return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18); 1486 } 1487 1488 static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp, 1489 hwaddr ptex, int n) 1490 { 1491 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); 1492 hwaddr pte_offset = ptex * HASH_PTE_SIZE_64; 1493 1494 if (!spapr->htab) { 1495 /* 1496 * HTAB is controlled by KVM. Fetch into temporary buffer 1497 */ 1498 ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64); 1499 kvmppc_read_hptes(hptes, ptex, n); 1500 return hptes; 1501 } 1502 1503 /* 1504 * HTAB is controlled by QEMU. Just point to the internally 1505 * accessible PTEG. 
1506 */ 1507 return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset); 1508 } 1509 1510 static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp, 1511 const ppc_hash_pte64_t *hptes, 1512 hwaddr ptex, int n) 1513 { 1514 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); 1515 1516 if (!spapr->htab) { 1517 g_free((void *)hptes); 1518 } 1519 1520 /* Nothing to do for qemu managed HPT */ 1521 } 1522 1523 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex, 1524 uint64_t pte0, uint64_t pte1) 1525 { 1526 SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp); 1527 hwaddr offset = ptex * HASH_PTE_SIZE_64; 1528 1529 if (!spapr->htab) { 1530 kvmppc_write_hpte(ptex, pte0, pte1); 1531 } else { 1532 if (pte0 & HPTE64_V_VALID) { 1533 stq_p(spapr->htab + offset + HPTE64_DW1, pte1); 1534 /* 1535 * When setting valid, we write PTE1 first. This ensures 1536 * proper synchronization with the reading code in 1537 * ppc_hash64_pteg_search() 1538 */ 1539 smp_wmb(); 1540 stq_p(spapr->htab + offset, pte0); 1541 } else { 1542 stq_p(spapr->htab + offset, pte0); 1543 /* 1544 * When clearing it we set PTE0 first. This ensures proper 1545 * synchronization with the reading code in 1546 * ppc_hash64_pteg_search() 1547 */ 1548 smp_wmb(); 1549 stq_p(spapr->htab + offset + HPTE64_DW1, pte1); 1550 } 1551 } 1552 } 1553 1554 static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex, 1555 uint64_t pte1) 1556 { 1557 hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_C; 1558 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); 1559 1560 if (!spapr->htab) { 1561 /* There should always be a hash table when this is called */ 1562 error_report("spapr_hpte_set_c called with no hash table !"); 1563 return; 1564 } 1565 1566 /* The HW performs a non-atomic byte update */ 1567 stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80); 1568 } 1569 1570 static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex, 1571 uint64_t pte1) 1572 { 1573 hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_R; 1574 SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); 1575 1576 if (!spapr->htab) { 1577 /* There should always be a hash table when this is called */ 1578 error_report("spapr_hpte_set_r called with no hash table !"); 1579 return; 1580 } 1581 1582 /* The HW performs a non-atomic byte update */ 1583 stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01); 1584 } 1585 1586 int spapr_hpt_shift_for_ramsize(uint64_t ramsize) 1587 { 1588 int shift; 1589 1590 /* We aim for a hash table of size 1/128 the size of RAM (rounded 1591 * up). 
The PAPR recommendation is actually 1/64 of RAM size, but 1592 * that's much more than is needed for Linux guests */ 1593 shift = ctz64(pow2ceil(ramsize)) - 7; 1594 shift = MAX(shift, 18); /* Minimum architected size */ 1595 shift = MIN(shift, 46); /* Maximum architected size */ 1596 return shift; 1597 } 1598 1599 void spapr_free_hpt(SpaprMachineState *spapr) 1600 { 1601 qemu_vfree(spapr->htab); 1602 spapr->htab = NULL; 1603 spapr->htab_shift = 0; 1604 close_htab_fd(spapr); 1605 } 1606 1607 int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp) 1608 { 1609 ERRP_GUARD(); 1610 long rc; 1611 1612 /* Clean up any HPT info from a previous boot */ 1613 spapr_free_hpt(spapr); 1614 1615 rc = kvmppc_reset_htab(shift); 1616 1617 if (rc == -EOPNOTSUPP) { 1618 error_setg(errp, "HPT not supported in nested guests"); 1619 return -EOPNOTSUPP; 1620 } 1621 1622 if (rc < 0) { 1623 /* kernel-side HPT needed, but couldn't allocate one */ 1624 error_setg_errno(errp, errno, "Failed to allocate KVM HPT of order %d", 1625 shift); 1626 error_append_hint(errp, "Try smaller maxmem?\n"); 1627 return -errno; 1628 } else if (rc > 0) { 1629 /* kernel-side HPT allocated */ 1630 if (rc != shift) { 1631 error_setg(errp, 1632 "Requested order %d HPT, but kernel allocated order %ld", 1633 shift, rc); 1634 error_append_hint(errp, "Try smaller maxmem?\n"); 1635 return -ENOSPC; 1636 } 1637 1638 spapr->htab_shift = shift; 1639 spapr->htab = NULL; 1640 } else { 1641 /* kernel-side HPT not needed, allocate in userspace instead */ 1642 size_t size = 1ULL << shift; 1643 int i; 1644 1645 spapr->htab = qemu_memalign(size, size); 1646 memset(spapr->htab, 0, size); 1647 spapr->htab_shift = shift; 1648 1649 for (i = 0; i < size / HASH_PTE_SIZE_64; i++) { 1650 hpte_set_dirty(spapr, i); 1651 } 1652 } 1653 /* We're setting up a hash table, so that means we're not radix */ 1654 spapr->patb_entry = 0; 1655 spapr_init_all_lpcrs(0, LPCR_HR | LPCR_UPRT); 1656 return 0; 1657 } 1658 1659 void spapr_setup_hpt(SpaprMachineState *spapr) 1660 { 1661 int hpt_shift; 1662 1663 if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) { 1664 hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size); 1665 } else { 1666 uint64_t current_ram_size; 1667 1668 current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size(); 1669 hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size); 1670 } 1671 spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal); 1672 1673 if (kvm_enabled()) { 1674 hwaddr vrma_limit = kvmppc_vrma_limit(spapr->htab_shift); 1675 1676 /* Check our RMA fits in the possible VRMA */ 1677 if (vrma_limit < spapr->rma_size) { 1678 error_report("Unable to create %" HWADDR_PRIu 1679 "MiB RMA (VRMA only allows %" HWADDR_PRIu "MiB", 1680 spapr->rma_size / MiB, vrma_limit / MiB); 1681 exit(EXIT_FAILURE); 1682 } 1683 } 1684 } 1685 1686 void spapr_check_mmu_mode(bool guest_radix) 1687 { 1688 if (guest_radix) { 1689 if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) { 1690 error_report("Guest requested unavailable MMU mode (radix)."); 1691 exit(EXIT_FAILURE); 1692 } 1693 } else { 1694 if (kvm_enabled() && kvmppc_has_cap_mmu_radix() 1695 && !kvmppc_has_cap_mmu_hash_v3()) { 1696 error_report("Guest requested unavailable MMU mode (hash)."); 1697 exit(EXIT_FAILURE); 1698 } 1699 } 1700 } 1701 1702 static void spapr_machine_reset(MachineState *machine, ResetType type) 1703 { 1704 SpaprMachineState *spapr = SPAPR_MACHINE(machine); 1705 PowerPCCPU *first_ppc_cpu; 1706 hwaddr fdt_addr; 1707 void *fdt; 1708 int rc; 1709 1710 if (type 
!= RESET_TYPE_SNAPSHOT_LOAD) { 1711 /* 1712 * Record-replay snapshot load must not consume random, this was 1713 * already replayed from initial machine reset. 1714 */ 1715 qemu_guest_getrandom_nofail(spapr->fdt_rng_seed, 32); 1716 } 1717 1718 if (machine->cgs) { 1719 confidential_guest_kvm_reset(machine->cgs, &error_fatal); 1720 } 1721 spapr_caps_apply(spapr); 1722 spapr_nested_reset(spapr); 1723 1724 first_ppc_cpu = POWERPC_CPU(first_cpu); 1725 if (kvm_enabled() && kvmppc_has_cap_mmu_radix() && 1726 ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0, 1727 spapr->max_compat_pvr)) { 1728 /* 1729 * If using KVM with radix mode available, VCPUs can be started 1730 * without a HPT because KVM will start them in radix mode. 1731 * Set the GR bit in PATE so that we know there is no HPT. 1732 */ 1733 spapr->patb_entry = PATE1_GR; 1734 spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT); 1735 } else { 1736 spapr_setup_hpt(spapr); 1737 } 1738 1739 qemu_devices_reset(type); 1740 1741 spapr_ovec_cleanup(spapr->ov5_cas); 1742 spapr->ov5_cas = spapr_ovec_new(); 1743 1744 ppc_init_compat_all(spapr->max_compat_pvr, &error_fatal); 1745 1746 /* 1747 * This is fixing some of the default configuration of the XIVE 1748 * devices. To be called after the reset of the machine devices. 1749 */ 1750 spapr_irq_reset(spapr, &error_fatal); 1751 1752 /* 1753 * There is no CAS under qtest. Simulate one to please the code that 1754 * depends on spapr->ov5_cas. This is especially needed to test device 1755 * unplug, so we do that before resetting the DRCs. 1756 */ 1757 if (qtest_enabled()) { 1758 spapr_ovec_cleanup(spapr->ov5_cas); 1759 spapr->ov5_cas = spapr_ovec_clone(spapr->ov5); 1760 } 1761 1762 spapr_nvdimm_finish_flushes(); 1763 1764 /* DRC reset may cause a device to be unplugged. This will cause troubles 1765 * if this device is used by another device (eg, a running vhost backend 1766 * will crash QEMU if the DIMM holding the vring goes away). To avoid such 1767 * situations, we reset DRCs after all devices have been reset. 1768 */ 1769 spapr_drc_reset_all(spapr); 1770 1771 spapr_clear_pending_events(spapr); 1772 1773 /* 1774 * We place the device tree just below either the top of the RMA, 1775 * or just below 2GB, whichever is lower, so that it can be 1776 * processed with 32-bit real mode code if necessary 1777 */ 1778 fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE; 1779 1780 fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE); 1781 if (spapr->vof) { 1782 spapr_vof_reset(spapr, fdt, &error_fatal); 1783 /* 1784 * Do not pack the FDT as the client may change properties. 1785 * VOF client does not expect the FDT so we do not load it to the VM. 
1786 */ 1787 } else { 1788 rc = fdt_pack(fdt); 1789 /* Should only fail if we've built a corrupted tree */ 1790 assert(rc == 0); 1791 1792 spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 1793 0, fdt_addr, 0); 1794 cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt)); 1795 } 1796 1797 g_free(spapr->fdt_blob); 1798 spapr->fdt_size = fdt_totalsize(fdt); 1799 spapr->fdt_initial_size = spapr->fdt_size; 1800 spapr->fdt_blob = fdt; 1801 1802 /* Set machine->fdt for 'dumpdtb' QMP/HMP command */ 1803 machine->fdt = fdt; 1804 1805 /* Set up the entry state */ 1806 first_ppc_cpu->env.gpr[5] = 0; 1807 1808 spapr->fwnmi_system_reset_addr = -1; 1809 spapr->fwnmi_machine_check_addr = -1; 1810 spapr->fwnmi_machine_check_interlock = -1; 1811 1812 /* Signal all vCPUs waiting on this condition */ 1813 qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond); 1814 1815 migrate_del_blocker(&spapr->fwnmi_migration_blocker); 1816 } 1817 1818 static void spapr_create_nvram(SpaprMachineState *spapr) 1819 { 1820 DeviceState *dev = qdev_new("spapr-nvram"); 1821 DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0); 1822 1823 if (dinfo) { 1824 qdev_prop_set_drive_err(dev, "drive", blk_by_legacy_dinfo(dinfo), 1825 &error_fatal); 1826 } 1827 1828 qdev_realize_and_unref(dev, &spapr->vio_bus->bus, &error_fatal); 1829 1830 spapr->nvram = (struct SpaprNvram *)dev; 1831 } 1832 1833 static void spapr_rtc_create(SpaprMachineState *spapr) 1834 { 1835 object_initialize_child_with_props(OBJECT(spapr), "rtc", &spapr->rtc, 1836 sizeof(spapr->rtc), TYPE_SPAPR_RTC, 1837 &error_fatal, NULL); 1838 qdev_realize(DEVICE(&spapr->rtc), NULL, &error_fatal); 1839 object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc), 1840 "date"); 1841 } 1842 1843 /* Returns whether we want to use VGA or not */ 1844 static bool spapr_vga_init(PCIBus *pci_bus, Error **errp) 1845 { 1846 vga_interface_created = true; 1847 switch (vga_interface_type) { 1848 case VGA_NONE: 1849 return false; 1850 case VGA_DEVICE: 1851 return true; 1852 case VGA_STD: 1853 case VGA_VIRTIO: 1854 case VGA_CIRRUS: 1855 return pci_vga_init(pci_bus) != NULL; 1856 default: 1857 error_setg(errp, 1858 "Unsupported VGA mode, only -vga std or -vga virtio is supported"); 1859 return false; 1860 } 1861 } 1862 1863 static int spapr_pre_load(void *opaque) 1864 { 1865 int rc; 1866 1867 rc = spapr_caps_pre_load(opaque); 1868 if (rc) { 1869 return rc; 1870 } 1871 1872 return 0; 1873 } 1874 1875 static int spapr_post_load(void *opaque, int version_id) 1876 { 1877 SpaprMachineState *spapr = (SpaprMachineState *)opaque; 1878 int err = 0; 1879 1880 err = spapr_caps_post_migration(spapr); 1881 if (err) { 1882 return err; 1883 } 1884 1885 /* 1886 * In earlier versions, there was no separate qdev for the PAPR 1887 * RTC, so the RTC offset was stored directly in sPAPREnvironment. 1888 * So when migrating from those versions, poke the incoming offset 1889 * value into the RTC device 1890 */ 1891 if (version_id < 3) { 1892 err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset); 1893 if (err) { 1894 return err; 1895 } 1896 } 1897 1898 if (kvm_enabled() && spapr->patb_entry) { 1899 PowerPCCPU *cpu = POWERPC_CPU(first_cpu); 1900 bool radix = !!(spapr->patb_entry & PATE1_GR); 1901 bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE); 1902 1903 /* 1904 * Update LPCR:HR and UPRT as they may not be set properly in 1905 * the stream 1906 */ 1907 spapr_set_all_lpcrs(radix ? 
(LPCR_HR | LPCR_UPRT) : 0, 1908 LPCR_HR | LPCR_UPRT); 1909 1910 err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry); 1911 if (err) { 1912 error_report("Process table config unsupported by the host"); 1913 return -EINVAL; 1914 } 1915 } 1916 1917 err = spapr_irq_post_load(spapr, version_id); 1918 if (err) { 1919 return err; 1920 } 1921 1922 return err; 1923 } 1924 1925 static int spapr_pre_save(void *opaque) 1926 { 1927 int rc; 1928 1929 rc = spapr_caps_pre_save(opaque); 1930 if (rc) { 1931 return rc; 1932 } 1933 1934 return 0; 1935 } 1936 1937 static bool version_before_3(void *opaque, int version_id) 1938 { 1939 return version_id < 3; 1940 } 1941 1942 static bool spapr_pending_events_needed(void *opaque) 1943 { 1944 SpaprMachineState *spapr = (SpaprMachineState *)opaque; 1945 return !QTAILQ_EMPTY(&spapr->pending_events); 1946 } 1947 1948 static const VMStateDescription vmstate_spapr_event_entry = { 1949 .name = "spapr_event_log_entry", 1950 .version_id = 1, 1951 .minimum_version_id = 1, 1952 .fields = (const VMStateField[]) { 1953 VMSTATE_UINT32(summary, SpaprEventLogEntry), 1954 VMSTATE_UINT32(extended_length, SpaprEventLogEntry), 1955 VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0, 1956 NULL, extended_length), 1957 VMSTATE_END_OF_LIST() 1958 }, 1959 }; 1960 1961 static const VMStateDescription vmstate_spapr_pending_events = { 1962 .name = "spapr_pending_events", 1963 .version_id = 1, 1964 .minimum_version_id = 1, 1965 .needed = spapr_pending_events_needed, 1966 .fields = (const VMStateField[]) { 1967 VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1, 1968 vmstate_spapr_event_entry, SpaprEventLogEntry, next), 1969 VMSTATE_END_OF_LIST() 1970 }, 1971 }; 1972 1973 static bool spapr_ov5_cas_needed(void *opaque) 1974 { 1975 SpaprMachineState *spapr = opaque; 1976 SpaprOptionVector *ov5_mask = spapr_ovec_new(); 1977 bool cas_needed; 1978 1979 /* Prior to the introduction of SpaprOptionVector, we had two option 1980 * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY. 1981 * Both of these options encode machine topology into the device-tree 1982 * in such a way that the now-booted OS should still be able to interact 1983 * appropriately with QEMU regardless of what options were actually 1984 * negotiatied on the source side. 1985 * 1986 * As such, we can avoid migrating the CAS-negotiated options if these 1987 * are the only options available on the current machine/platform. 1988 * Since these are the only options available for pseries-2.7 and 1989 * earlier, this allows us to maintain old->new/new->old migration 1990 * compatibility. 1991 * 1992 * For QEMU 2.8+, there are additional CAS-negotiatable options available 1993 * via default pseries-2.8 machines and explicit command-line parameters. 1994 * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware 1995 * of the actual CAS-negotiated values to continue working properly. For 1996 * example, availability of memory unplug depends on knowing whether 1997 * OV5_HP_EVT was negotiated via CAS. 1998 * 1999 * Thus, for any cases where the set of available CAS-negotiatable 2000 * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we 2001 * include the CAS-negotiated options in the migration stream, unless 2002 * if they affect boot time behaviour only. 
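     *
     * (Informal restatement of the check below: the subsection is skipped
     * only when spapr->ov5 is a subset of { OV5_FORM1_AFFINITY,
     * OV5_DRCONF_MEMORY, OV5_DRMEM_V2 }; a machine that also advertises,
     * e.g., OV5_HP_EVT or OV5_XIVE_EXPLOIT always migrates its
     * CAS-negotiated options.)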
2003 */ 2004 spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY); 2005 spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY); 2006 spapr_ovec_set(ov5_mask, OV5_DRMEM_V2); 2007 2008 /* We need extra information if we have any bits outside the mask 2009 * defined above */ 2010 cas_needed = !spapr_ovec_subset(spapr->ov5, ov5_mask); 2011 2012 spapr_ovec_cleanup(ov5_mask); 2013 2014 return cas_needed; 2015 } 2016 2017 static const VMStateDescription vmstate_spapr_ov5_cas = { 2018 .name = "spapr_option_vector_ov5_cas", 2019 .version_id = 1, 2020 .minimum_version_id = 1, 2021 .needed = spapr_ov5_cas_needed, 2022 .fields = (const VMStateField[]) { 2023 VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1, 2024 vmstate_spapr_ovec, SpaprOptionVector), 2025 VMSTATE_END_OF_LIST() 2026 }, 2027 }; 2028 2029 static bool spapr_patb_entry_needed(void *opaque) 2030 { 2031 SpaprMachineState *spapr = opaque; 2032 2033 return !!spapr->patb_entry; 2034 } 2035 2036 static const VMStateDescription vmstate_spapr_patb_entry = { 2037 .name = "spapr_patb_entry", 2038 .version_id = 1, 2039 .minimum_version_id = 1, 2040 .needed = spapr_patb_entry_needed, 2041 .fields = (const VMStateField[]) { 2042 VMSTATE_UINT64(patb_entry, SpaprMachineState), 2043 VMSTATE_END_OF_LIST() 2044 }, 2045 }; 2046 2047 static bool spapr_irq_map_needed(void *opaque) 2048 { 2049 SpaprMachineState *spapr = opaque; 2050 2051 return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr); 2052 } 2053 2054 static const VMStateDescription vmstate_spapr_irq_map = { 2055 .name = "spapr_irq_map", 2056 .version_id = 1, 2057 .minimum_version_id = 1, 2058 .needed = spapr_irq_map_needed, 2059 .fields = (const VMStateField[]) { 2060 VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr), 2061 VMSTATE_END_OF_LIST() 2062 }, 2063 }; 2064 2065 static bool spapr_dtb_needed(void *opaque) 2066 { 2067 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque); 2068 2069 return smc->update_dt_enabled; 2070 } 2071 2072 static int spapr_dtb_pre_load(void *opaque) 2073 { 2074 SpaprMachineState *spapr = (SpaprMachineState *)opaque; 2075 2076 g_free(spapr->fdt_blob); 2077 spapr->fdt_blob = NULL; 2078 spapr->fdt_size = 0; 2079 2080 return 0; 2081 } 2082 2083 static const VMStateDescription vmstate_spapr_dtb = { 2084 .name = "spapr_dtb", 2085 .version_id = 1, 2086 .minimum_version_id = 1, 2087 .needed = spapr_dtb_needed, 2088 .pre_load = spapr_dtb_pre_load, 2089 .fields = (const VMStateField[]) { 2090 VMSTATE_UINT32(fdt_initial_size, SpaprMachineState), 2091 VMSTATE_UINT32(fdt_size, SpaprMachineState), 2092 VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL, 2093 fdt_size), 2094 VMSTATE_END_OF_LIST() 2095 }, 2096 }; 2097 2098 static bool spapr_fwnmi_needed(void *opaque) 2099 { 2100 SpaprMachineState *spapr = (SpaprMachineState *)opaque; 2101 2102 return spapr->fwnmi_machine_check_addr != -1; 2103 } 2104 2105 static int spapr_fwnmi_pre_save(void *opaque) 2106 { 2107 SpaprMachineState *spapr = (SpaprMachineState *)opaque; 2108 2109 /* 2110 * Check if machine check handling is in progress and print a 2111 * warning message. 2112 */ 2113 if (spapr->fwnmi_machine_check_interlock != -1) { 2114 warn_report("A machine check is being handled during migration. 
The" 2115 "handler may run and log hardware error on the destination"); 2116 } 2117 2118 return 0; 2119 } 2120 2121 static const VMStateDescription vmstate_spapr_fwnmi = { 2122 .name = "spapr_fwnmi", 2123 .version_id = 1, 2124 .minimum_version_id = 1, 2125 .needed = spapr_fwnmi_needed, 2126 .pre_save = spapr_fwnmi_pre_save, 2127 .fields = (const VMStateField[]) { 2128 VMSTATE_UINT64(fwnmi_system_reset_addr, SpaprMachineState), 2129 VMSTATE_UINT64(fwnmi_machine_check_addr, SpaprMachineState), 2130 VMSTATE_INT32(fwnmi_machine_check_interlock, SpaprMachineState), 2131 VMSTATE_END_OF_LIST() 2132 }, 2133 }; 2134 2135 static const VMStateDescription vmstate_spapr = { 2136 .name = "spapr", 2137 .version_id = 3, 2138 .minimum_version_id = 1, 2139 .pre_load = spapr_pre_load, 2140 .post_load = spapr_post_load, 2141 .pre_save = spapr_pre_save, 2142 .fields = (const VMStateField[]) { 2143 /* used to be @next_irq */ 2144 VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4), 2145 2146 /* RTC offset */ 2147 VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3), 2148 2149 VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2), 2150 VMSTATE_END_OF_LIST() 2151 }, 2152 .subsections = (const VMStateDescription * const []) { 2153 &vmstate_spapr_ov5_cas, 2154 &vmstate_spapr_patb_entry, 2155 &vmstate_spapr_pending_events, 2156 &vmstate_spapr_cap_htm, 2157 &vmstate_spapr_cap_vsx, 2158 &vmstate_spapr_cap_dfp, 2159 &vmstate_spapr_cap_cfpc, 2160 &vmstate_spapr_cap_sbbc, 2161 &vmstate_spapr_cap_ibs, 2162 &vmstate_spapr_cap_hpt_maxpagesize, 2163 &vmstate_spapr_irq_map, 2164 &vmstate_spapr_cap_nested_kvm_hv, 2165 &vmstate_spapr_dtb, 2166 &vmstate_spapr_cap_large_decr, 2167 &vmstate_spapr_cap_ccf_assist, 2168 &vmstate_spapr_cap_fwnmi, 2169 &vmstate_spapr_fwnmi, 2170 &vmstate_spapr_cap_rpt_invalidate, 2171 &vmstate_spapr_cap_ail_mode_3, 2172 &vmstate_spapr_cap_nested_papr, 2173 &vmstate_spapr_cap_dawr1, 2174 NULL 2175 } 2176 }; 2177 2178 static int htab_save_setup(QEMUFile *f, void *opaque, Error **errp) 2179 { 2180 SpaprMachineState *spapr = opaque; 2181 2182 /* "Iteration" header */ 2183 if (!spapr->htab_shift) { 2184 qemu_put_be32(f, -1); 2185 } else { 2186 qemu_put_be32(f, spapr->htab_shift); 2187 } 2188 2189 if (spapr->htab) { 2190 spapr->htab_save_index = 0; 2191 spapr->htab_first_pass = true; 2192 } else { 2193 if (spapr->htab_shift) { 2194 assert(kvm_enabled()); 2195 } 2196 } 2197 2198 2199 return 0; 2200 } 2201 2202 static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr, 2203 int chunkstart, int n_valid, int n_invalid) 2204 { 2205 qemu_put_be32(f, chunkstart); 2206 qemu_put_be16(f, n_valid); 2207 qemu_put_be16(f, n_invalid); 2208 qemu_put_buffer(f, (void *)hpte_get_ptr(spapr, chunkstart), 2209 HASH_PTE_SIZE_64 * n_valid); 2210 } 2211 2212 static void htab_save_end_marker(QEMUFile *f) 2213 { 2214 qemu_put_be32(f, 0); 2215 qemu_put_be16(f, 0); 2216 qemu_put_be16(f, 0); 2217 } 2218 2219 static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr, 2220 int64_t max_ns) 2221 { 2222 bool has_timeout = max_ns != -1; 2223 int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64; 2224 int index = spapr->htab_save_index; 2225 int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2226 2227 assert(spapr->htab_first_pass); 2228 2229 do { 2230 int chunkstart; 2231 2232 /* Consume invalid HPTEs */ 2233 while ((index < htabslots) 2234 && !hpte_is_valid(spapr, index)) { 2235 hpte_set_clean(spapr, index); 2236 index++; 2237 } 2238 2239 /* Consume valid HPTEs */ 2240 chunkstart = index; 2241 while ((index < 
htabslots) && (index - chunkstart < USHRT_MAX) 2242 && hpte_is_valid(spapr, index)) { 2243 hpte_set_clean(spapr, index); 2244 index++; 2245 } 2246 2247 if (index > chunkstart) { 2248 int n_valid = index - chunkstart; 2249 2250 htab_save_chunk(f, spapr, chunkstart, n_valid, 0); 2251 2252 if (has_timeout && 2253 (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) { 2254 break; 2255 } 2256 } 2257 } while ((index < htabslots) && !migration_rate_exceeded(f)); 2258 2259 if (index >= htabslots) { 2260 assert(index == htabslots); 2261 index = 0; 2262 spapr->htab_first_pass = false; 2263 } 2264 spapr->htab_save_index = index; 2265 } 2266 2267 static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr, 2268 int64_t max_ns) 2269 { 2270 bool final = max_ns < 0; 2271 int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64; 2272 int examined = 0, sent = 0; 2273 int index = spapr->htab_save_index; 2274 int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 2275 2276 assert(!spapr->htab_first_pass); 2277 2278 do { 2279 int chunkstart, invalidstart; 2280 2281 /* Consume non-dirty HPTEs */ 2282 while ((index < htabslots) 2283 && !hpte_is_dirty(spapr, index)) { 2284 index++; 2285 examined++; 2286 } 2287 2288 chunkstart = index; 2289 /* Consume valid dirty HPTEs */ 2290 while ((index < htabslots) && (index - chunkstart < USHRT_MAX) 2291 && hpte_is_dirty(spapr, index) 2292 && hpte_is_valid(spapr, index)) { 2293 hpte_set_clean(spapr, index); 2294 index++; 2295 examined++; 2296 } 2297 2298 invalidstart = index; 2299 /* Consume invalid dirty HPTEs */ 2300 while ((index < htabslots) && (index - invalidstart < USHRT_MAX) 2301 && hpte_is_dirty(spapr, index) 2302 && !hpte_is_valid(spapr, index)) { 2303 hpte_set_clean(spapr, index); 2304 index++; 2305 examined++; 2306 } 2307 2308 if (index > chunkstart) { 2309 int n_valid = invalidstart - chunkstart; 2310 int n_invalid = index - invalidstart; 2311 2312 htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid); 2313 sent += index - chunkstart; 2314 2315 if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) { 2316 break; 2317 } 2318 } 2319 2320 if (examined >= htabslots) { 2321 break; 2322 } 2323 2324 if (index >= htabslots) { 2325 assert(index == htabslots); 2326 index = 0; 2327 } 2328 } while ((examined < htabslots) && (!migration_rate_exceeded(f) || final)); 2329 2330 if (index >= htabslots) { 2331 assert(index == htabslots); 2332 index = 0; 2333 } 2334 2335 spapr->htab_save_index = index; 2336 2337 return (examined >= htabslots) && (sent == 0) ? 
1 : 0; 2338 } 2339 2340 #define MAX_ITERATION_NS 5000000 /* 5 ms */ 2341 #define MAX_KVM_BUF_SIZE 2048 2342 2343 static int htab_save_iterate(QEMUFile *f, void *opaque) 2344 { 2345 SpaprMachineState *spapr = opaque; 2346 int fd; 2347 int rc = 0; 2348 2349 /* Iteration header */ 2350 if (!spapr->htab_shift) { 2351 qemu_put_be32(f, -1); 2352 return 1; 2353 } else { 2354 qemu_put_be32(f, 0); 2355 } 2356 2357 if (!spapr->htab) { 2358 assert(kvm_enabled()); 2359 2360 fd = get_htab_fd(spapr); 2361 if (fd < 0) { 2362 return fd; 2363 } 2364 2365 rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS); 2366 if (rc < 0) { 2367 return rc; 2368 } 2369 } else if (spapr->htab_first_pass) { 2370 htab_save_first_pass(f, spapr, MAX_ITERATION_NS); 2371 } else { 2372 rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS); 2373 } 2374 2375 htab_save_end_marker(f); 2376 2377 return rc; 2378 } 2379 2380 static int htab_save_complete(QEMUFile *f, void *opaque) 2381 { 2382 SpaprMachineState *spapr = opaque; 2383 int fd; 2384 2385 /* Iteration header */ 2386 if (!spapr->htab_shift) { 2387 qemu_put_be32(f, -1); 2388 return 0; 2389 } else { 2390 qemu_put_be32(f, 0); 2391 } 2392 2393 if (!spapr->htab) { 2394 int rc; 2395 2396 assert(kvm_enabled()); 2397 2398 fd = get_htab_fd(spapr); 2399 if (fd < 0) { 2400 return fd; 2401 } 2402 2403 rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1); 2404 if (rc < 0) { 2405 return rc; 2406 } 2407 } else { 2408 if (spapr->htab_first_pass) { 2409 htab_save_first_pass(f, spapr, -1); 2410 } 2411 htab_save_later_pass(f, spapr, -1); 2412 } 2413 2414 /* End marker */ 2415 htab_save_end_marker(f); 2416 2417 return 0; 2418 } 2419 2420 static int htab_load(QEMUFile *f, void *opaque, int version_id) 2421 { 2422 SpaprMachineState *spapr = opaque; 2423 uint32_t section_hdr; 2424 int fd = -1; 2425 Error *local_err = NULL; 2426 2427 if (version_id < 1 || version_id > 1) { 2428 error_report("htab_load() bad version"); 2429 return -EINVAL; 2430 } 2431 2432 section_hdr = qemu_get_be32(f); 2433 2434 if (section_hdr == -1) { 2435 spapr_free_hpt(spapr); 2436 return 0; 2437 } 2438 2439 if (section_hdr) { 2440 int ret; 2441 2442 /* First section gives the htab size */ 2443 ret = spapr_reallocate_hpt(spapr, section_hdr, &local_err); 2444 if (ret < 0) { 2445 error_report_err(local_err); 2446 return ret; 2447 } 2448 return 0; 2449 } 2450 2451 if (!spapr->htab) { 2452 assert(kvm_enabled()); 2453 2454 fd = kvmppc_get_htab_fd(true, 0, &local_err); 2455 if (fd < 0) { 2456 error_report_err(local_err); 2457 return fd; 2458 } 2459 } 2460 2461 while (true) { 2462 uint32_t index; 2463 uint16_t n_valid, n_invalid; 2464 2465 index = qemu_get_be32(f); 2466 n_valid = qemu_get_be16(f); 2467 n_invalid = qemu_get_be16(f); 2468 2469 if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) { 2470 /* End of Stream */ 2471 break; 2472 } 2473 2474 if ((index + n_valid + n_invalid) > 2475 (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) { 2476 /* Bad index in stream */ 2477 error_report( 2478 "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)", 2479 index, n_valid, n_invalid, spapr->htab_shift); 2480 return -EINVAL; 2481 } 2482 2483 if (spapr->htab) { 2484 if (n_valid) { 2485 qemu_get_buffer(f, (void *)hpte_get_ptr(spapr, index), 2486 HASH_PTE_SIZE_64 * n_valid); 2487 } 2488 if (n_invalid) { 2489 memset(hpte_get_ptr(spapr, index + n_valid), 0, 2490 HASH_PTE_SIZE_64 * n_invalid); 2491 } 2492 } else { 2493 int rc; 2494 2495 assert(fd >= 0); 2496 2497 rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid, 
2498 &local_err); 2499 if (rc < 0) { 2500 error_report_err(local_err); 2501 return rc; 2502 } 2503 } 2504 } 2505 2506 if (!spapr->htab) { 2507 assert(fd >= 0); 2508 close(fd); 2509 } 2510 2511 return 0; 2512 } 2513 2514 static void htab_save_cleanup(void *opaque) 2515 { 2516 SpaprMachineState *spapr = opaque; 2517 2518 close_htab_fd(spapr); 2519 } 2520 2521 static SaveVMHandlers savevm_htab_handlers = { 2522 .save_setup = htab_save_setup, 2523 .save_live_iterate = htab_save_iterate, 2524 .save_complete = htab_save_complete, 2525 .save_cleanup = htab_save_cleanup, 2526 .load_state = htab_load, 2527 }; 2528 2529 static void spapr_boot_set(void *opaque, const char *boot_device, 2530 Error **errp) 2531 { 2532 SpaprMachineState *spapr = SPAPR_MACHINE(opaque); 2533 2534 g_free(spapr->boot_device); 2535 spapr->boot_device = g_strdup(boot_device); 2536 } 2537 2538 static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr) 2539 { 2540 MachineState *machine = MACHINE(spapr); 2541 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE; 2542 uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size)/lmb_size; 2543 int i; 2544 2545 g_assert(!nr_lmbs || machine->device_memory); 2546 for (i = 0; i < nr_lmbs; i++) { 2547 uint64_t addr; 2548 2549 addr = i * lmb_size + machine->device_memory->base; 2550 spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB, 2551 addr / lmb_size); 2552 } 2553 } 2554 2555 /* 2556 * If RAM size, maxmem size and individual node mem sizes aren't aligned 2557 * to SPAPR_MEMORY_BLOCK_SIZE(256MB), then refuse to start the guest 2558 * since we can't support such unaligned sizes with DRCONF_MEMORY. 2559 */ 2560 static void spapr_validate_node_memory(MachineState *machine, Error **errp) 2561 { 2562 int i; 2563 2564 if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) { 2565 error_setg(errp, "Memory size 0x" RAM_ADDR_FMT 2566 " is not aligned to %" PRIu64 " MiB", 2567 machine->ram_size, 2568 SPAPR_MEMORY_BLOCK_SIZE / MiB); 2569 return; 2570 } 2571 2572 if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) { 2573 error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT 2574 " is not aligned to %" PRIu64 " MiB", 2575 machine->ram_size, 2576 SPAPR_MEMORY_BLOCK_SIZE / MiB); 2577 return; 2578 } 2579 2580 for (i = 0; i < machine->numa_state->num_nodes; i++) { 2581 if (machine->numa_state->nodes[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) { 2582 error_setg(errp, 2583 "Node %d memory size 0x%" PRIx64 2584 " is not aligned to %" PRIu64 " MiB", 2585 i, machine->numa_state->nodes[i].node_mem, 2586 SPAPR_MEMORY_BLOCK_SIZE / MiB); 2587 return; 2588 } 2589 } 2590 } 2591 2592 /* find cpu slot in machine->possible_cpus by core_id */ 2593 static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx) 2594 { 2595 int index = id / ms->smp.threads; 2596 2597 if (index >= ms->possible_cpus->len) { 2598 return NULL; 2599 } 2600 if (idx) { 2601 *idx = index; 2602 } 2603 return &ms->possible_cpus->cpus[index]; 2604 } 2605 2606 static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp) 2607 { 2608 MachineState *ms = MACHINE(spapr); 2609 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); 2610 Error *local_err = NULL; 2611 bool vsmt_user = !!spapr->vsmt; 2612 int kvm_smt = kvmppc_smt_threads(); 2613 int ret; 2614 unsigned int smp_threads = ms->smp.threads; 2615 2616 if (tcg_enabled()) { 2617 if (smp_threads > 1 && 2618 !ppc_type_check_compat(ms->cpu_type, CPU_POWERPC_LOGICAL_2_07, 0, 2619 spapr->max_compat_pvr)) { 2620 error_setg(errp, "TCG only supports SMT on POWER8 or newer CPUs"); 
2621 return; 2622 } 2623 2624 if (smp_threads > 8) { 2625 error_setg(errp, "TCG cannot support more than 8 threads/core " 2626 "on a pseries machine"); 2627 return; 2628 } 2629 } 2630 if (!is_power_of_2(smp_threads)) { 2631 error_setg(errp, "Cannot support %d threads/core on a pseries " 2632 "machine because it must be a power of 2", smp_threads); 2633 return; 2634 } 2635 2636 /* Determine the VSMT mode to use: */ 2637 if (vsmt_user) { 2638 if (spapr->vsmt < smp_threads) { 2639 error_setg(errp, "Cannot support VSMT mode %d" 2640 " because it must be >= threads/core (%d)", 2641 spapr->vsmt, smp_threads); 2642 return; 2643 } 2644 /* In this case, spapr->vsmt has been set by the command line */ 2645 } else if (!smc->smp_threads_vsmt) { 2646 /* 2647 * Default VSMT value is tricky, because we need it to be as 2648 * consistent as possible (for migration), but this requires 2649 * changing it for at least some existing cases. We pick 8 as 2650 * the value that we'd get with KVM on POWER8, the 2651 * overwhelmingly common case in production systems. 2652 */ 2653 spapr->vsmt = MAX(8, smp_threads); 2654 } else { 2655 spapr->vsmt = smp_threads; 2656 } 2657 2658 /* KVM: If necessary, set the SMT mode: */ 2659 if (kvm_enabled() && (spapr->vsmt != kvm_smt)) { 2660 ret = kvmppc_set_smt_threads(spapr->vsmt); 2661 if (ret) { 2662 /* Looks like KVM isn't able to change VSMT mode */ 2663 error_setg(&local_err, 2664 "Failed to set KVM's VSMT mode to %d (errno %d)", 2665 spapr->vsmt, ret); 2666 /* We can live with that if the default one is big enough 2667 * for the number of threads, and a submultiple of the one 2668 * we want. In this case we'll waste some vcpu ids, but 2669 * behaviour will be correct */ 2670 if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) { 2671 warn_report_err(local_err); 2672 } else { 2673 if (!vsmt_user) { 2674 error_append_hint(&local_err, 2675 "On PPC, a VM with %d threads/core" 2676 " on a host with %d threads/core" 2677 " requires the use of VSMT mode %d.\n", 2678 smp_threads, kvm_smt, spapr->vsmt); 2679 } 2680 kvmppc_error_append_smt_possible_hint(&local_err); 2681 error_propagate(errp, local_err); 2682 } 2683 } 2684 } 2685 /* else TCG: nothing to do currently */ 2686 } 2687 2688 static void spapr_init_cpus(SpaprMachineState *spapr) 2689 { 2690 MachineState *machine = MACHINE(spapr); 2691 MachineClass *mc = MACHINE_GET_CLASS(machine); 2692 const char *type = spapr_get_cpu_core_type(machine->cpu_type); 2693 const CPUArchIdList *possible_cpus; 2694 unsigned int smp_cpus = machine->smp.cpus; 2695 unsigned int smp_threads = machine->smp.threads; 2696 unsigned int max_cpus = machine->smp.max_cpus; 2697 int boot_cores_nr = smp_cpus / smp_threads; 2698 int i; 2699 2700 possible_cpus = mc->possible_cpu_arch_ids(machine); 2701 if (mc->has_hotpluggable_cpus) { 2702 if (smp_cpus % smp_threads) { 2703 error_report("smp_cpus (%u) must be multiple of threads (%u)", 2704 smp_cpus, smp_threads); 2705 exit(1); 2706 } 2707 if (max_cpus % smp_threads) { 2708 error_report("max_cpus (%u) must be multiple of threads (%u)", 2709 max_cpus, smp_threads); 2710 exit(1); 2711 } 2712 } else { 2713 if (max_cpus != smp_cpus) { 2714 error_report("This machine version does not support CPU hotplug"); 2715 exit(1); 2716 } 2717 boot_cores_nr = possible_cpus->len; 2718 } 2719 2720 for (i = 0; i < possible_cpus->len; i++) { 2721 int core_id = i * smp_threads; 2722 2723 if (mc->has_hotpluggable_cpus) { 2724 spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU, 2725 spapr_vcpu_id(spapr, core_id)); 
2726 } 2727 2728 if (i < boot_cores_nr) { 2729 Object *core = object_new(type); 2730 int nr_threads = smp_threads; 2731 2732 /* Handle the partially filled core for older machine types */ 2733 if ((i + 1) * smp_threads >= smp_cpus) { 2734 nr_threads = smp_cpus - i * smp_threads; 2735 } 2736 2737 object_property_set_int(core, "nr-threads", nr_threads, 2738 &error_fatal); 2739 object_property_set_int(core, CPU_CORE_PROP_CORE_ID, core_id, 2740 &error_fatal); 2741 qdev_realize(DEVICE(core), NULL, &error_fatal); 2742 2743 object_unref(core); 2744 } 2745 } 2746 } 2747 2748 static PCIHostState *spapr_create_default_phb(void) 2749 { 2750 DeviceState *dev; 2751 2752 dev = qdev_new(TYPE_SPAPR_PCI_HOST_BRIDGE); 2753 qdev_prop_set_uint32(dev, "index", 0); 2754 sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); 2755 2756 return PCI_HOST_BRIDGE(dev); 2757 } 2758 2759 static hwaddr spapr_rma_size(SpaprMachineState *spapr, Error **errp) 2760 { 2761 MachineState *machine = MACHINE(spapr); 2762 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); 2763 hwaddr rma_size = machine->ram_size; 2764 hwaddr node0_size = spapr_node0_size(machine); 2765 2766 /* RMA has to fit in the first NUMA node */ 2767 rma_size = MIN(rma_size, node0_size); 2768 2769 /* 2770 * VRMA access is via a special 1TiB SLB mapping, so the RMA can 2771 * never exceed that 2772 */ 2773 rma_size = MIN(rma_size, 1 * TiB); 2774 2775 /* 2776 * Clamp the RMA size based on machine type. This is for 2777 * migration compatibility with older qemu versions, which limited 2778 * the RMA size for complicated and mostly bad reasons. 2779 */ 2780 if (smc->rma_limit) { 2781 rma_size = MIN(rma_size, smc->rma_limit); 2782 } 2783 2784 if (rma_size < MIN_RMA_SLOF) { 2785 error_setg(errp, 2786 "pSeries SLOF firmware requires >= %" HWADDR_PRIx 2787 "ldMiB guest RMA (Real Mode Area memory)", 2788 MIN_RMA_SLOF / MiB); 2789 return 0; 2790 } 2791 2792 return rma_size; 2793 } 2794 2795 static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr) 2796 { 2797 MachineState *machine = MACHINE(spapr); 2798 int i; 2799 2800 for (i = 0; i < machine->ram_slots; i++) { 2801 spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i); 2802 } 2803 } 2804 2805 /* pSeries LPAR / sPAPR hardware init */ 2806 static void spapr_machine_init(MachineState *machine) 2807 { 2808 SpaprMachineState *spapr = SPAPR_MACHINE(machine); 2809 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); 2810 MachineClass *mc = MACHINE_GET_CLASS(machine); 2811 const char *bios_default = spapr->vof ? 
FW_FILE_NAME_VOF : FW_FILE_NAME; 2812 const char *bios_name = machine->firmware ?: bios_default; 2813 g_autofree char *filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name); 2814 const char *kernel_filename = machine->kernel_filename; 2815 const char *initrd_filename = machine->initrd_filename; 2816 PCIHostState *phb; 2817 bool has_vga; 2818 int i; 2819 MemoryRegion *sysmem = get_system_memory(); 2820 long load_limit, fw_size; 2821 Error *resize_hpt_err = NULL; 2822 NICInfo *nd; 2823 2824 if (!filename) { 2825 error_report("Could not find LPAR firmware '%s'", bios_name); 2826 exit(1); 2827 } 2828 fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE); 2829 if (fw_size <= 0) { 2830 error_report("Could not load LPAR firmware '%s'", filename); 2831 exit(1); 2832 } 2833 2834 /* 2835 * if Secure VM (PEF) support is configured, then initialize it 2836 */ 2837 if (machine->cgs) { 2838 confidential_guest_kvm_init(machine->cgs, &error_fatal); 2839 } 2840 2841 msi_nonbroken = true; 2842 2843 QLIST_INIT(&spapr->phbs); 2844 QTAILQ_INIT(&spapr->pending_dimm_unplugs); 2845 2846 /* Determine capabilities to run with */ 2847 spapr_caps_init(spapr); 2848 2849 kvmppc_check_papr_resize_hpt(&resize_hpt_err); 2850 if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) { 2851 /* 2852 * If the user explicitly requested a mode we should either 2853 * supply it, or fail completely (which we do below). But if 2854 * it's not set explicitly, we reset our mode to something 2855 * that works 2856 */ 2857 if (resize_hpt_err) { 2858 spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED; 2859 error_free(resize_hpt_err); 2860 resize_hpt_err = NULL; 2861 } else { 2862 spapr->resize_hpt = smc->resize_hpt_default; 2863 } 2864 } 2865 2866 assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT); 2867 2868 if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) { 2869 /* 2870 * User requested HPT resize, but this host can't supply it. Bail out 2871 */ 2872 error_report_err(resize_hpt_err); 2873 exit(1); 2874 } 2875 error_free(resize_hpt_err); 2876 2877 spapr->rma_size = spapr_rma_size(spapr, &error_fatal); 2878 2879 /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */ 2880 load_limit = MIN(spapr->rma_size, FDT_MAX_ADDR) - FW_OVERHEAD; 2881 2882 /* 2883 * VSMT must be set in order to be able to compute VCPU ids, ie to 2884 * call spapr_max_server_number() or spapr_vcpu_id(). 
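     *
     * (Illustration of what spapr_set_vsmt_mode() ends up picking, with
     * hypothetical -smp settings: without an explicit vsmt= option, and on
     * a machine class that does not pin VSMT to threads/core, a guest with
     * threads=4 gets vsmt = MAX(8, 4) = 8, i.e. the value KVM would use on
     * a POWER8 host.)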
2885 */ 2886 spapr_set_vsmt_mode(spapr, &error_fatal); 2887 2888 /* Set up Interrupt Controller before we create the VCPUs */ 2889 spapr_irq_init(spapr, &error_fatal); 2890 2891 /* Set up containers for ibm,client-architecture-support negotiated options 2892 */ 2893 spapr->ov5 = spapr_ovec_new(); 2894 spapr->ov5_cas = spapr_ovec_new(); 2895 2896 spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY); 2897 spapr_validate_node_memory(machine, &error_fatal); 2898 2899 spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY); 2900 2901 /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */ 2902 if (!smc->pre_6_2_numa_affinity) { 2903 spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY); 2904 } 2905 2906 /* advertise support for dedicated HP event source to guests */ 2907 if (spapr->use_hotplug_event_source) { 2908 spapr_ovec_set(spapr->ov5, OV5_HP_EVT); 2909 } 2910 2911 /* advertise support for HPT resizing */ 2912 if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) { 2913 spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE); 2914 } 2915 2916 /* advertise support for ibm,dyamic-memory-v2 */ 2917 spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2); 2918 2919 /* advertise XIVE on POWER9 machines */ 2920 if (spapr->irq->xive) { 2921 spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT); 2922 } 2923 2924 qemu_guest_getrandom_nofail(&spapr->hashpkey_val, 2925 sizeof(spapr->hashpkey_val)); 2926 2927 /* init CPUs */ 2928 spapr_init_cpus(spapr); 2929 2930 /* Init numa_assoc_array */ 2931 spapr_numa_associativity_init(spapr, machine); 2932 2933 if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) && 2934 ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0, 2935 spapr->max_compat_pvr)) { 2936 spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300); 2937 /* KVM and TCG always allow GTSE with radix... */ 2938 spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE); 2939 } 2940 /* ... but not with hash (currently). */ 2941 2942 if (kvm_enabled()) { 2943 /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */ 2944 kvmppc_enable_logical_ci_hcalls(); 2945 kvmppc_enable_set_mode_hcall(); 2946 2947 /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */ 2948 kvmppc_enable_clear_ref_mod_hcalls(); 2949 2950 /* Enable H_PAGE_INIT */ 2951 kvmppc_enable_h_page_init(); 2952 } 2953 2954 /* map RAM */ 2955 memory_region_add_subregion(sysmem, 0, machine->ram); 2956 2957 /* initialize hotplug memory address space */ 2958 if (machine->ram_size < machine->maxram_size) { 2959 ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; 2960 hwaddr device_mem_base; 2961 2962 /* 2963 * Limit the number of hotpluggable memory slots to half the number 2964 * slots that KVM supports, leaving the other half for PCI and other 2965 * devices. However ensure that number of slots doesn't drop below 32. 2966 */ 2967 int max_memslots = kvm_enabled() ? 
kvm_get_max_memslots() / 2 : 2968 SPAPR_MAX_RAM_SLOTS; 2969 2970 if (max_memslots < SPAPR_MAX_RAM_SLOTS) { 2971 max_memslots = SPAPR_MAX_RAM_SLOTS; 2972 } 2973 if (machine->ram_slots > max_memslots) { 2974 error_report("Specified number of memory slots %" 2975 PRIu64" exceeds max supported %d", 2976 machine->ram_slots, max_memslots); 2977 exit(1); 2978 } 2979 2980 device_mem_base = ROUND_UP(machine->ram_size, SPAPR_DEVICE_MEM_ALIGN); 2981 machine_memory_devices_init(machine, device_mem_base, device_mem_size); 2982 } 2983 2984 spapr_create_lmb_dr_connectors(spapr); 2985 2986 if (mc->nvdimm_supported) { 2987 spapr_create_nvdimm_dr_connectors(spapr); 2988 } 2989 2990 /* Set up RTAS event infrastructure */ 2991 spapr_events_init(spapr); 2992 2993 /* Set up the RTC RTAS interfaces */ 2994 spapr_rtc_create(spapr); 2995 2996 /* Set up VIO bus */ 2997 spapr->vio_bus = spapr_vio_bus_init(); 2998 2999 for (i = 0; serial_hd(i); i++) { 3000 spapr_vty_create(spapr->vio_bus, serial_hd(i)); 3001 } 3002 3003 /* We always have at least the nvram device on VIO */ 3004 spapr_create_nvram(spapr); 3005 3006 /* 3007 * Setup hotplug / dynamic-reconfiguration connectors. top-level 3008 * connectors (described in root DT node's "ibm,drc-types" property) 3009 * are pre-initialized here. additional child connectors (such as 3010 * connectors for a PHBs PCI slots) are added as needed during their 3011 * parent's realization. 3012 */ 3013 if (smc->dr_phb_enabled) { 3014 for (i = 0; i < SPAPR_MAX_PHBS; i++) { 3015 spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i); 3016 } 3017 } 3018 3019 /* Set up PCI */ 3020 spapr_pci_rtas_init(); 3021 3022 phb = spapr_create_default_phb(); 3023 3024 while ((nd = qemu_find_nic_info("spapr-vlan", true, "ibmveth"))) { 3025 spapr_vlan_create(spapr->vio_bus, nd); 3026 } 3027 3028 pci_init_nic_devices(phb->bus, NULL); 3029 3030 for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) { 3031 spapr_vscsi_create(spapr->vio_bus); 3032 } 3033 3034 /* Graphics */ 3035 has_vga = spapr_vga_init(phb->bus, &error_fatal); 3036 if (has_vga) { 3037 spapr->want_stdout_path = !machine->enable_graphics; 3038 machine->usb |= defaults_enabled() && !machine->usb_disabled; 3039 } else { 3040 spapr->want_stdout_path = true; 3041 } 3042 3043 if (machine->usb) { 3044 pci_create_simple(phb->bus, -1, "nec-usb-xhci"); 3045 3046 if (has_vga) { 3047 USBBus *usb_bus; 3048 3049 usb_bus = USB_BUS(object_resolve_type_unambiguous(TYPE_USB_BUS, 3050 &error_abort)); 3051 usb_create_simple(usb_bus, "usb-kbd"); 3052 usb_create_simple(usb_bus, "usb-mouse"); 3053 } 3054 } 3055 3056 if (kernel_filename) { 3057 uint64_t loaded_addr = 0; 3058 3059 spapr->kernel_size = load_elf(kernel_filename, NULL, 3060 translate_kernel_address, spapr, 3061 NULL, &loaded_addr, NULL, NULL, 3062 ELFDATA2MSB, PPC_ELF_MACHINE, 0, 0); 3063 if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) { 3064 spapr->kernel_size = load_elf(kernel_filename, NULL, 3065 translate_kernel_address, spapr, 3066 NULL, &loaded_addr, NULL, NULL, 3067 ELFDATA2LSB, PPC_ELF_MACHINE, 0, 0); 3068 spapr->kernel_le = spapr->kernel_size > 0; 3069 } 3070 if (spapr->kernel_size < 0) { 3071 error_report("error loading %s: %s", kernel_filename, 3072 load_elf_strerror(spapr->kernel_size)); 3073 exit(1); 3074 } 3075 3076 if (spapr->kernel_addr != loaded_addr) { 3077 warn_report("spapr: kernel_addr changed from 0x%"PRIx64 3078 " to 0x%"PRIx64, 3079 spapr->kernel_addr, loaded_addr); 3080 spapr->kernel_addr = loaded_addr; 3081 } 3082 3083 /* load initrd */ 3084 if (initrd_filename) { 
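        /*
         * (Worked example, with hypothetical sizes, of the base address
         * computed just below: "(kernel_addr + kernel_size + 0x1ffff) &
         * ~0xffff" leaves at least 64 KiB of slack above the end of the
         * kernel and aligns to 64 KiB, e.g. 0x400000 + 0x123456 gives an
         * initrd_base of 0x540000.)
         */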
3085 /* Try to locate the initrd in the gap between the kernel 3086 * and the firmware. Add a bit of space just in case 3087 */ 3088 spapr->initrd_base = (spapr->kernel_addr + spapr->kernel_size 3089 + 0x1ffff) & ~0xffff; 3090 spapr->initrd_size = load_image_targphys(initrd_filename, 3091 spapr->initrd_base, 3092 load_limit 3093 - spapr->initrd_base); 3094 if (spapr->initrd_size < 0) { 3095 error_report("could not load initial ram disk '%s'", 3096 initrd_filename); 3097 exit(1); 3098 } 3099 } 3100 } 3101 3102 /* FIXME: Should register things through the MachineState's qdev 3103 * interface, this is a legacy from the sPAPREnvironment structure 3104 * which predated MachineState but had a similar function */ 3105 vmstate_register(NULL, 0, &vmstate_spapr, spapr); 3106 register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1, 3107 &savevm_htab_handlers, spapr); 3108 3109 qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine)); 3110 3111 qemu_register_boot_set(spapr_boot_set, spapr); 3112 3113 /* 3114 * Nothing needs to be done to resume a suspended guest because 3115 * suspending does not change the machine state, so no need for 3116 * a ->wakeup method. 3117 */ 3118 qemu_register_wakeup_support(); 3119 3120 if (kvm_enabled()) { 3121 /* to stop and start vmclock */ 3122 qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change, 3123 &spapr->tb); 3124 3125 kvmppc_spapr_enable_inkernel_multitce(); 3126 } 3127 3128 qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); 3129 if (spapr->vof) { 3130 spapr->vof->fw_size = fw_size; /* for claim() on itself */ 3131 spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client); 3132 } 3133 3134 spapr_watchdog_init(spapr); 3135 } 3136 3137 #define DEFAULT_KVM_TYPE "auto" 3138 static int spapr_kvm_type(MachineState *machine, const char *vm_type) 3139 { 3140 /* 3141 * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to 3142 * accommodate the 'HV' and 'PV' formats that exists in the 3143 * wild. The 'auto' mode is being introduced already as 3144 * lower-case, thus we don't need to bother checking for 3145 * "AUTO". 3146 */ 3147 if (!vm_type || !strcmp(vm_type, DEFAULT_KVM_TYPE)) { 3148 return 0; 3149 } 3150 3151 if (!g_ascii_strcasecmp(vm_type, "hv")) { 3152 return 1; 3153 } 3154 3155 if (!g_ascii_strcasecmp(vm_type, "pr")) { 3156 return 2; 3157 } 3158 3159 error_report("Unknown kvm-type specified '%s'", vm_type); 3160 return -1; 3161 } 3162 3163 /* 3164 * Implementation of an interface to adjust firmware path 3165 * for the bootindex property handling. 
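 *
 * (Example outputs, with illustrative device ids: a spapr-vscsi disk at
 * id 1, channel 0, lun 0 is encoded below as 0x8000 | (1 << 8) = 0x8100,
 * giving "disk@8100000000000000", and a PCI host bridge becomes "pci@"
 * followed by its BUID, e.g. "pci@800000020000000".)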
3166 */ 3167 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, 3168 DeviceState *dev) 3169 { 3170 #define CAST(type, obj, name) \ 3171 ((type *)object_dynamic_cast(OBJECT(obj), (name))) 3172 SCSIDevice *d = CAST(SCSIDevice, dev, TYPE_SCSI_DEVICE); 3173 SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE); 3174 VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON); 3175 PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE); 3176 3177 if (d && bus) { 3178 void *spapr = CAST(void, bus->parent, "spapr-vscsi"); 3179 VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI); 3180 USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE); 3181 3182 if (spapr) { 3183 /* 3184 * Replace "channel@0/disk@0,0" with "disk@8000000000000000": 3185 * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form 3186 * 0x8000 | (target << 8) | (bus << 5) | lun 3187 * (see the "Logical unit addressing format" table in SAM5) 3188 */ 3189 unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun; 3190 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev), 3191 (uint64_t)id << 48); 3192 } else if (virtio) { 3193 /* 3194 * We use SRP luns of the form 01000000 | (target << 8) | lun 3195 * in the top 32 bits of the 64-bit LUN 3196 * Note: the quote above is from SLOF and it is wrong, 3197 * the actual binding is: 3198 * swap 0100 or 10 << or 20 << ( target lun-id -- srplun ) 3199 */ 3200 unsigned id = 0x1000000 | (d->id << 16) | d->lun; 3201 if (d->lun >= 256) { 3202 /* Use the LUN "flat space addressing method" */ 3203 id |= 0x4000; 3204 } 3205 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev), 3206 (uint64_t)id << 32); 3207 } else if (usb) { 3208 /* 3209 * We use SRP luns of the form 01000000 | (usb-port << 16) | lun 3210 * in the top 32 bits of the 64-bit LUN 3211 */ 3212 unsigned usb_port = atoi(usb->port->path); 3213 unsigned id = 0x1000000 | (usb_port << 16) | d->lun; 3214 return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev), 3215 (uint64_t)id << 32); 3216 } 3217 } 3218 3219 /* 3220 * SLOF probes the USB devices, and if it recognizes that the device is a 3221 * storage device, it changes its name to "storage" instead of "usb-host", 3222 * and additionally adds a child node for the SCSI LUN, so the correct 3223 * boot path in SLOF is something like .../storage@1/disk@xxx" instead. 
3224 */ 3225 if (strcmp("usb-host", qdev_fw_name(dev)) == 0) { 3226 USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE); 3227 if (usb_device_is_scsi_storage(usbdev)) { 3228 return g_strdup_printf("storage@%s/disk", usbdev->port->path); 3229 } 3230 } 3231 3232 if (phb) { 3233 /* Replace "pci" with "pci@800000020000000" */ 3234 return g_strdup_printf("pci@%"PRIX64, phb->buid); 3235 } 3236 3237 if (vsc) { 3238 /* Same logic as virtio above */ 3239 unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun; 3240 return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32); 3241 } 3242 3243 if (g_str_equal("pci-bridge", qdev_fw_name(dev))) { 3244 /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */ 3245 PCIDevice *pdev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE); 3246 return g_strdup_printf("pci@%x", PCI_SLOT(pdev->devfn)); 3247 } 3248 3249 if (pcidev) { 3250 return spapr_pci_fw_dev_name(pcidev); 3251 } 3252 3253 return NULL; 3254 } 3255 3256 static char *spapr_get_kvm_type(Object *obj, Error **errp) 3257 { 3258 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3259 3260 return g_strdup(spapr->kvm_type); 3261 } 3262 3263 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp) 3264 { 3265 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3266 3267 g_free(spapr->kvm_type); 3268 spapr->kvm_type = g_strdup(value); 3269 } 3270 3271 static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp) 3272 { 3273 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3274 3275 return spapr->use_hotplug_event_source; 3276 } 3277 3278 static void spapr_set_modern_hotplug_events(Object *obj, bool value, 3279 Error **errp) 3280 { 3281 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3282 3283 spapr->use_hotplug_event_source = value; 3284 } 3285 3286 static bool spapr_get_msix_emulation(Object *obj, Error **errp) 3287 { 3288 return true; 3289 } 3290 3291 static char *spapr_get_resize_hpt(Object *obj, Error **errp) 3292 { 3293 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3294 3295 switch (spapr->resize_hpt) { 3296 case SPAPR_RESIZE_HPT_DEFAULT: 3297 return g_strdup("default"); 3298 case SPAPR_RESIZE_HPT_DISABLED: 3299 return g_strdup("disabled"); 3300 case SPAPR_RESIZE_HPT_ENABLED: 3301 return g_strdup("enabled"); 3302 case SPAPR_RESIZE_HPT_REQUIRED: 3303 return g_strdup("required"); 3304 } 3305 g_assert_not_reached(); 3306 } 3307 3308 static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp) 3309 { 3310 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3311 3312 if (strcmp(value, "default") == 0) { 3313 spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT; 3314 } else if (strcmp(value, "disabled") == 0) { 3315 spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED; 3316 } else if (strcmp(value, "enabled") == 0) { 3317 spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED; 3318 } else if (strcmp(value, "required") == 0) { 3319 spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED; 3320 } else { 3321 error_setg(errp, "Bad value for \"resize-hpt\" property"); 3322 } 3323 } 3324 3325 static bool spapr_get_vof(Object *obj, Error **errp) 3326 { 3327 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3328 3329 return spapr->vof != NULL; 3330 } 3331 3332 static void spapr_set_vof(Object *obj, bool value, Error **errp) 3333 { 3334 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3335 3336 if (spapr->vof) { 3337 vof_cleanup(spapr->vof); 3338 g_free(spapr->vof); 3339 spapr->vof = NULL; 3340 } 3341 if (!value) { 3342 return; 3343 } 3344 spapr->vof = g_malloc0(sizeof(*spapr->vof)); 3345 } 3346 3347 static char 
*spapr_get_ic_mode(Object *obj, Error **errp) 3348 { 3349 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3350 3351 if (spapr->irq == &spapr_irq_xics_legacy) { 3352 return g_strdup("legacy"); 3353 } else if (spapr->irq == &spapr_irq_xics) { 3354 return g_strdup("xics"); 3355 } else if (spapr->irq == &spapr_irq_xive) { 3356 return g_strdup("xive"); 3357 } else if (spapr->irq == &spapr_irq_dual) { 3358 return g_strdup("dual"); 3359 } 3360 g_assert_not_reached(); 3361 } 3362 3363 static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp) 3364 { 3365 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3366 3367 if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) { 3368 error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode"); 3369 return; 3370 } 3371 3372 /* The legacy IRQ backend can not be set */ 3373 if (strcmp(value, "xics") == 0) { 3374 spapr->irq = &spapr_irq_xics; 3375 } else if (strcmp(value, "xive") == 0) { 3376 spapr->irq = &spapr_irq_xive; 3377 } else if (strcmp(value, "dual") == 0) { 3378 spapr->irq = &spapr_irq_dual; 3379 } else { 3380 error_setg(errp, "Bad value for \"ic-mode\" property"); 3381 } 3382 } 3383 3384 static char *spapr_get_host_model(Object *obj, Error **errp) 3385 { 3386 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3387 3388 return g_strdup(spapr->host_model); 3389 } 3390 3391 static void spapr_set_host_model(Object *obj, const char *value, Error **errp) 3392 { 3393 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3394 3395 g_free(spapr->host_model); 3396 spapr->host_model = g_strdup(value); 3397 } 3398 3399 static char *spapr_get_host_serial(Object *obj, Error **errp) 3400 { 3401 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3402 3403 return g_strdup(spapr->host_serial); 3404 } 3405 3406 static void spapr_set_host_serial(Object *obj, const char *value, Error **errp) 3407 { 3408 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3409 3410 g_free(spapr->host_serial); 3411 spapr->host_serial = g_strdup(value); 3412 } 3413 3414 static void spapr_instance_init(Object *obj) 3415 { 3416 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3417 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); 3418 MachineState *ms = MACHINE(spapr); 3419 MachineClass *mc = MACHINE_GET_CLASS(ms); 3420 3421 /* 3422 * NVDIMM support went live in 5.1 without considering that, in 3423 * other archs, the user needs to enable NVDIMM support with the 3424 * 'nvdimm' machine option and the default behavior is NVDIMM 3425 * support disabled. It is too late to roll back to the standard 3426 * behavior without breaking 5.1 guests. 3427 */ 3428 if (mc->nvdimm_supported) { 3429 ms->nvdimms_state->is_enabled = true; 3430 } 3431 3432 spapr->htab_fd = -1; 3433 spapr->use_hotplug_event_source = true; 3434 spapr->kvm_type = g_strdup(DEFAULT_KVM_TYPE); 3435 object_property_add_str(obj, "kvm-type", 3436 spapr_get_kvm_type, spapr_set_kvm_type); 3437 object_property_set_description(obj, "kvm-type", 3438 "Specifies the KVM virtualization mode (auto," 3439 " hv, pr). Defaults to 'auto'. 
This mode will use" 3440 " any available KVM module loaded in the host," 3441 " where kvm_hv takes precedence if both kvm_hv and" 3442 " kvm_pr are loaded."); 3443 object_property_add_bool(obj, "modern-hotplug-events", 3444 spapr_get_modern_hotplug_events, 3445 spapr_set_modern_hotplug_events); 3446 object_property_set_description(obj, "modern-hotplug-events", 3447 "Use dedicated hotplug event mechanism in" 3448 " place of standard EPOW events when possible" 3449 " (required for memory hot-unplug support)"); 3450 ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr, 3451 "Maximum permitted CPU compatibility mode"); 3452 3453 object_property_add_str(obj, "resize-hpt", 3454 spapr_get_resize_hpt, spapr_set_resize_hpt); 3455 object_property_set_description(obj, "resize-hpt", 3456 "Resizing of the Hash Page Table (enabled, disabled, required)"); 3457 object_property_add_uint32_ptr(obj, "vsmt", 3458 &spapr->vsmt, OBJ_PROP_FLAG_READWRITE); 3459 object_property_set_description(obj, "vsmt", 3460 "Virtual SMT: KVM behaves as if this were" 3461 " the host's SMT mode"); 3462 3463 object_property_add_bool(obj, "vfio-no-msix-emulation", 3464 spapr_get_msix_emulation, NULL); 3465 3466 object_property_add_uint64_ptr(obj, "kernel-addr", 3467 &spapr->kernel_addr, OBJ_PROP_FLAG_READWRITE); 3468 object_property_set_description(obj, "kernel-addr", 3469 stringify(KERNEL_LOAD_ADDR) 3470 " for -kernel is the default"); 3471 spapr->kernel_addr = KERNEL_LOAD_ADDR; 3472 3473 object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof); 3474 object_property_set_description(obj, "x-vof", 3475 "Enable Virtual Open Firmware (experimental)"); 3476 3477 /* The machine class defines the default interrupt controller mode */ 3478 spapr->irq = smc->irq; 3479 object_property_add_str(obj, "ic-mode", spapr_get_ic_mode, 3480 spapr_set_ic_mode); 3481 object_property_set_description(obj, "ic-mode", 3482 "Specifies the interrupt controller mode (xics, xive, dual)"); 3483 3484 object_property_add_str(obj, "host-model", 3485 spapr_get_host_model, spapr_set_host_model); 3486 object_property_set_description(obj, "host-model", 3487 "Host model to advertise in guest device tree"); 3488 object_property_add_str(obj, "host-serial", 3489 spapr_get_host_serial, spapr_set_host_serial); 3490 object_property_set_description(obj, "host-serial", 3491 "Host serial number to advertise in guest device tree"); 3492 } 3493 3494 static void spapr_machine_finalizefn(Object *obj) 3495 { 3496 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 3497 3498 g_free(spapr->kvm_type); 3499 } 3500 3501 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg) 3502 { 3503 SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); 3504 CPUPPCState *env = cpu_env(cs); 3505 3506 cpu_synchronize_state(cs); 3507 /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */ 3508 if (spapr->fwnmi_system_reset_addr != -1) { 3509 uint64_t rtas_addr, addr; 3510 3511 /* get rtas addr from fdt */ 3512 rtas_addr = spapr_get_rtas_addr(); 3513 if (!rtas_addr) { 3514 qemu_system_guest_panicked(NULL); 3515 return; 3516 } 3517 3518 addr = rtas_addr + RTAS_ERROR_LOG_MAX + cs->cpu_index * sizeof(uint64_t)*2; 3519 stq_be_phys(&address_space_memory, addr, env->gpr[3]); 3520 stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0); 3521 env->gpr[3] = addr; 3522 } 3523 ppc_cpu_do_system_reset(cs); 3524 if (spapr->fwnmi_system_reset_addr != -1) { 3525 env->nip = spapr->fwnmi_system_reset_addr; 3526 } 3527 } 3528 3529 static void 
spapr_nmi(NMIState *n, int cpu_index, Error **errp) 3530 { 3531 CPUState *cs; 3532 3533 CPU_FOREACH(cs) { 3534 async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL); 3535 } 3536 } 3537 3538 int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, 3539 void *fdt, int *fdt_start_offset, Error **errp) 3540 { 3541 uint64_t addr; 3542 uint32_t node; 3543 3544 addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE; 3545 node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP, 3546 &error_abort); 3547 *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr, 3548 SPAPR_MEMORY_BLOCK_SIZE); 3549 return 0; 3550 } 3551 3552 static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size, 3553 bool dedicated_hp_event_source) 3554 { 3555 SpaprDrc *drc; 3556 uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE; 3557 int i; 3558 uint64_t addr = addr_start; 3559 bool hotplugged = spapr_drc_hotplugged(dev); 3560 3561 for (i = 0; i < nr_lmbs; i++) { 3562 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, 3563 addr / SPAPR_MEMORY_BLOCK_SIZE); 3564 g_assert(drc); 3565 3566 /* 3567 * memory_device_get_free_addr() provided a range of free addresses 3568 * that doesn't overlap with any existing mapping at pre-plug. The 3569 * corresponding LMB DRCs are thus assumed to be all attachable. 3570 */ 3571 spapr_drc_attach(drc, dev); 3572 if (!hotplugged) { 3573 spapr_drc_reset(drc); 3574 } 3575 addr += SPAPR_MEMORY_BLOCK_SIZE; 3576 } 3577 /* send hotplug notification to the 3578 * guest only in case of hotplugged memory 3579 */ 3580 if (hotplugged) { 3581 if (dedicated_hp_event_source) { 3582 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, 3583 addr_start / SPAPR_MEMORY_BLOCK_SIZE); 3584 g_assert(drc); 3585 spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB, 3586 nr_lmbs, 3587 spapr_drc_index(drc)); 3588 } else { 3589 spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, 3590 nr_lmbs); 3591 } 3592 } 3593 } 3594 3595 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev) 3596 { 3597 SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev); 3598 PCDIMMDevice *dimm = PC_DIMM(dev); 3599 uint64_t size, addr; 3600 int64_t slot; 3601 bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); 3602 3603 size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort); 3604 3605 pc_dimm_plug(dimm, MACHINE(ms)); 3606 3607 if (!is_nvdimm) { 3608 addr = object_property_get_uint(OBJECT(dimm), 3609 PC_DIMM_ADDR_PROP, &error_abort); 3610 spapr_add_lmbs(dev, addr, size, 3611 spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT)); 3612 } else { 3613 slot = object_property_get_int(OBJECT(dimm), 3614 PC_DIMM_SLOT_PROP, &error_abort); 3615 /* We should have valid slot number at this point */ 3616 g_assert(slot >= 0); 3617 spapr_add_nvdimm(dev, slot); 3618 } 3619 } 3620 3621 static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, 3622 Error **errp) 3623 { 3624 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev); 3625 bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); 3626 PCDIMMDevice *dimm = PC_DIMM(dev); 3627 Error *local_err = NULL; 3628 uint64_t size; 3629 Object *memdev; 3630 hwaddr pagesize; 3631 3632 size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err); 3633 if (local_err) { 3634 error_propagate(errp, local_err); 3635 return; 3636 } 3637 3638 if (is_nvdimm) { 3639 if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) { 3640 return; 3641 } 3642 } else if (size % SPAPR_MEMORY_BLOCK_SIZE) { 3643 
error_setg(errp, "Hotplugged memory size must be a multiple of " 3644 "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB); 3645 return; 3646 } 3647 3648 memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP, 3649 &error_abort); 3650 pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev)); 3651 if (!spapr_check_pagesize(spapr, pagesize, errp)) { 3652 return; 3653 } 3654 3655 pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), errp); 3656 } 3657 3658 struct SpaprDimmState { 3659 PCDIMMDevice *dimm; 3660 uint32_t nr_lmbs; 3661 QTAILQ_ENTRY(SpaprDimmState) next; 3662 }; 3663 3664 static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s, 3665 PCDIMMDevice *dimm) 3666 { 3667 SpaprDimmState *dimm_state = NULL; 3668 3669 QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) { 3670 if (dimm_state->dimm == dimm) { 3671 break; 3672 } 3673 } 3674 return dimm_state; 3675 } 3676 3677 static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr, 3678 uint32_t nr_lmbs, 3679 PCDIMMDevice *dimm) 3680 { 3681 SpaprDimmState *ds = NULL; 3682 3683 /* 3684 * If this request is for a DIMM whose removal had failed earlier 3685 * (due to guest's refusal to remove the LMBs), we would have this 3686 * dimm already in the pending_dimm_unplugs list. In that 3687 * case don't add again. 3688 */ 3689 ds = spapr_pending_dimm_unplugs_find(spapr, dimm); 3690 if (!ds) { 3691 ds = g_new0(SpaprDimmState, 1); 3692 ds->nr_lmbs = nr_lmbs; 3693 ds->dimm = dimm; 3694 QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next); 3695 } 3696 return ds; 3697 } 3698 3699 static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr, 3700 SpaprDimmState *dimm_state) 3701 { 3702 QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next); 3703 g_free(dimm_state); 3704 } 3705 3706 static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms, 3707 PCDIMMDevice *dimm) 3708 { 3709 SpaprDrc *drc; 3710 uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm), 3711 &error_abort); 3712 uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE; 3713 uint32_t avail_lmbs = 0; 3714 uint64_t addr_start, addr; 3715 int i; 3716 3717 addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP, 3718 &error_abort); 3719 3720 addr = addr_start; 3721 for (i = 0; i < nr_lmbs; i++) { 3722 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, 3723 addr / SPAPR_MEMORY_BLOCK_SIZE); 3724 g_assert(drc); 3725 if (drc->dev) { 3726 avail_lmbs++; 3727 } 3728 addr += SPAPR_MEMORY_BLOCK_SIZE; 3729 } 3730 3731 return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm); 3732 } 3733 3734 void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev) 3735 { 3736 SpaprDimmState *ds; 3737 PCDIMMDevice *dimm; 3738 SpaprDrc *drc; 3739 uint32_t nr_lmbs; 3740 uint64_t size, addr_start, addr; 3741 int i; 3742 3743 if (!dev) { 3744 return; 3745 } 3746 3747 dimm = PC_DIMM(dev); 3748 ds = spapr_pending_dimm_unplugs_find(spapr, dimm); 3749 3750 /* 3751 * 'ds == NULL' would mean that the DIMM doesn't have a pending 3752 * unplug state, but one of its DRC is marked as unplug_requested. 3753 * This is bad and weird enough to g_assert() out. 
3754 */ 3755 g_assert(ds); 3756 3757 spapr_pending_dimm_unplugs_remove(spapr, ds); 3758 3759 size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort); 3760 nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE; 3761 3762 addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP, 3763 &error_abort); 3764 3765 addr = addr_start; 3766 for (i = 0; i < nr_lmbs; i++) { 3767 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, 3768 addr / SPAPR_MEMORY_BLOCK_SIZE); 3769 g_assert(drc); 3770 3771 drc->unplug_requested = false; 3772 addr += SPAPR_MEMORY_BLOCK_SIZE; 3773 } 3774 3775 /* 3776 * Tell QAPI that something happened and the memory 3777 * hotunplug wasn't successful. 3778 */ 3779 qapi_event_send_device_unplug_guest_error(dev->id, 3780 dev->canonical_path); 3781 } 3782 3783 /* Callback to be called during DRC release. */ 3784 void spapr_lmb_release(DeviceState *dev) 3785 { 3786 HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev); 3787 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl); 3788 SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev)); 3789 3790 /* This information will get lost if a migration occurs 3791 * during the unplug process. In this case recover it. */ 3792 if (ds == NULL) { 3793 ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev)); 3794 g_assert(ds); 3795 /* The DRC being examined by the caller at least must be counted */ 3796 g_assert(ds->nr_lmbs); 3797 } 3798 3799 if (--ds->nr_lmbs) { 3800 return; 3801 } 3802 3803 /* 3804 * Now that all the LMBs have been removed by the guest, call the 3805 * unplug handler chain. This can never fail. 3806 */ 3807 hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort); 3808 object_unparent(OBJECT(dev)); 3809 } 3810 3811 static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev) 3812 { 3813 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev); 3814 SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev)); 3815 3816 /* We really shouldn't get this far without anything to unplug */ 3817 g_assert(ds); 3818 3819 pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev)); 3820 qdev_unrealize(dev); 3821 spapr_pending_dimm_unplugs_remove(spapr, ds); 3822 } 3823 3824 static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev, 3825 DeviceState *dev, Error **errp) 3826 { 3827 SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev); 3828 PCDIMMDevice *dimm = PC_DIMM(dev); 3829 uint32_t nr_lmbs; 3830 uint64_t size, addr_start, addr; 3831 int i; 3832 SpaprDrc *drc; 3833 3834 if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { 3835 error_setg(errp, "nvdimm device hot unplug is not supported yet."); 3836 return; 3837 } 3838 3839 size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort); 3840 nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE; 3841 3842 addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP, 3843 &error_abort); 3844 3845 /* 3846 * An existing pending dimm state for this DIMM means that there is an 3847 * unplug operation in progress, waiting for the spapr_lmb_release 3848 * callback to complete the job (BQL can't cover that far). In this case, 3849 * bail out to avoid detaching DRCs that were already released. 
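* (Without this check, the loop below would call spapr_drc_unplug_request()
* again on LMB DRCs that the guest may have already released.)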
3850 */
3851 if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3852 error_setg(errp, "Memory unplug already in progress for device %s",
3853 dev->id);
3854 return;
3855 }
3856
3857 spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3858
3859 addr = addr_start;
3860 for (i = 0; i < nr_lmbs; i++) {
3861 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3862 addr / SPAPR_MEMORY_BLOCK_SIZE);
3863 g_assert(drc);
3864
3865 spapr_drc_unplug_request(drc);
3866 addr += SPAPR_MEMORY_BLOCK_SIZE;
3867 }
3868
3869 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3870 addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3871 spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3872 nr_lmbs, spapr_drc_index(drc));
3873 }
3874
3875 /* Callback to be called during DRC release. */
3876 void spapr_core_release(DeviceState *dev)
3877 {
3878 HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3879
3880 /* Call the unplug handler chain. This can never fail. */
3881 hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3882 object_unparent(OBJECT(dev));
3883 }
3884
3885 static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3886 {
3887 MachineState *ms = MACHINE(hotplug_dev);
3888 CPUCore *cc = CPU_CORE(dev);
3889 CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3890
3891 assert(core_slot);
3892 core_slot->cpu = NULL;
3893 qdev_unrealize(dev);
3894 }
3895
3896 static
3897 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3898 Error **errp)
3899 {
3900 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3901 int index;
3902 SpaprDrc *drc;
3903 CPUCore *cc = CPU_CORE(dev);
3904
3905 if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3906 error_setg(errp, "Unable to find CPU core with core-id: %d",
3907 cc->core_id);
3908 return;
3909 }
3910 if (index == 0) {
3911 error_setg(errp, "Boot CPU core may not be unplugged");
3912 return;
3913 }
3914
3915 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3916 spapr_vcpu_id(spapr, cc->core_id));
3917 g_assert(drc);
3918
3919 if (!spapr_drc_unplug_requested(drc)) {
3920 spapr_drc_unplug_request(drc);
3921 }
3922
3923 /*
3924 * spapr_hotplug_req_remove_by_index is left unguarded, outside the
3925 * "!spapr_drc_unplug_requested" check, to allow for multiple IRQ
3926 * pulses removing the same CPU. Otherwise, after a failed hotunplug
3927 * attempt (e.g. when the kernel refuses to remove the last online
3928 * CPU), we would never attempt it again because unplug_requested
3929 * would still be 'true' in that case.
3930 */
3931 spapr_hotplug_req_remove_by_index(drc);
3932 }
3933
3934 int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3935 void *fdt, int *fdt_start_offset, Error **errp)
3936 {
3937 SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3938 CPUState *cs = CPU(core->threads[0]);
3939 PowerPCCPU *cpu = POWERPC_CPU(cs);
3940 DeviceClass *dc = DEVICE_GET_CLASS(cs);
3941 int id = spapr_get_vcpu_id(cpu);
3942 g_autofree char *nodename = NULL;
3943 int offset;
3944
3945 nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3946 offset = fdt_add_subnode(fdt, 0, nodename);
3947
3948 spapr_dt_cpu(cs, fdt, offset, spapr);
3949
3950 /*
3951 * spapr_dt_cpu() does not fill the 'name' property in the
3952 * CPU node. The function is called during the boot process, before
3953 * and after CAS, and overwriting the 'name' property written
3954 * by SLOF is not allowed.
3955 *
3956 * Write it manually after spapr_dt_cpu().
This makes the hotplug 3957 * CPUs more compatible with the coldplugged ones, which have 3958 * the 'name' property. Linux Kernel also relies on this 3959 * property to identify CPU nodes. 3960 */ 3961 _FDT((fdt_setprop_string(fdt, offset, "name", nodename))); 3962 3963 *fdt_start_offset = offset; 3964 return 0; 3965 } 3966 3967 static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev) 3968 { 3969 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); 3970 MachineClass *mc = MACHINE_GET_CLASS(spapr); 3971 SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev)); 3972 CPUCore *cc = CPU_CORE(dev); 3973 SpaprDrc *drc; 3974 CPUArchId *core_slot; 3975 int index; 3976 bool hotplugged = spapr_drc_hotplugged(dev); 3977 int i; 3978 3979 core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index); 3980 g_assert(core_slot); /* Already checked in spapr_core_pre_plug() */ 3981 3982 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, 3983 spapr_vcpu_id(spapr, cc->core_id)); 3984 3985 g_assert(drc || !mc->has_hotpluggable_cpus); 3986 3987 if (drc) { 3988 /* 3989 * spapr_core_pre_plug() already buys us this is a brand new 3990 * core being plugged into a free slot. Nothing should already 3991 * be attached to the corresponding DRC. 3992 */ 3993 spapr_drc_attach(drc, dev); 3994 3995 if (hotplugged) { 3996 /* 3997 * Send hotplug notification interrupt to the guest only 3998 * in case of hotplugged CPUs. 3999 */ 4000 spapr_hotplug_req_add_by_index(drc); 4001 } else { 4002 spapr_drc_reset(drc); 4003 } 4004 } 4005 4006 core_slot->cpu = CPU(dev); 4007 4008 /* 4009 * Set compatibility mode to match the boot CPU, which was either set 4010 * by the machine reset code or by CAS. This really shouldn't fail at 4011 * this point. 4012 */ 4013 if (hotplugged) { 4014 for (i = 0; i < cc->nr_threads; i++) { 4015 ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr, 4016 &error_abort); 4017 } 4018 } 4019 4020 } 4021 4022 static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, 4023 Error **errp) 4024 { 4025 MachineState *machine = MACHINE(OBJECT(hotplug_dev)); 4026 MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); 4027 CPUCore *cc = CPU_CORE(dev); 4028 const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type); 4029 const char *type = object_get_typename(OBJECT(dev)); 4030 CPUArchId *core_slot; 4031 int index; 4032 unsigned int smp_threads = machine->smp.threads; 4033 4034 if (dev->hotplugged && !mc->has_hotpluggable_cpus) { 4035 error_setg(errp, "CPU hotplug not supported for this machine"); 4036 return; 4037 } 4038 4039 if (strcmp(base_core_type, type)) { 4040 error_setg(errp, "CPU core type should be %s", base_core_type); 4041 return; 4042 } 4043 4044 if (cc->core_id % smp_threads) { 4045 error_setg(errp, "invalid core id %d", cc->core_id); 4046 return; 4047 } 4048 4049 /* 4050 * In general we should have homogeneous threads-per-core, but old 4051 * (pre hotplug support) machine types allow the last core to have 4052 * reduced threads as a compatibility hack for when we allowed 4053 * total vcpus not a multiple of threads-per-core. 
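* (e.g. "-smp 6,threads=4" would give the last core only 2 threads; the
* check below rejects such cores on machine types with hotpluggable CPUs.)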
4054 */ 4055 if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) { 4056 error_setg(errp, "invalid nr-threads %d, must be %d", cc->nr_threads, 4057 smp_threads); 4058 return; 4059 } 4060 4061 core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index); 4062 if (!core_slot) { 4063 error_setg(errp, "core id %d out of range", cc->core_id); 4064 return; 4065 } 4066 4067 if (core_slot->cpu) { 4068 error_setg(errp, "core %d already populated", cc->core_id); 4069 return; 4070 } 4071 4072 numa_cpu_pre_plug(core_slot, dev, errp); 4073 } 4074 4075 int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, 4076 void *fdt, int *fdt_start_offset, Error **errp) 4077 { 4078 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev); 4079 int intc_phandle; 4080 4081 intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp); 4082 if (intc_phandle <= 0) { 4083 return -1; 4084 } 4085 4086 if (spapr_dt_phb(spapr, sphb, intc_phandle, fdt, fdt_start_offset)) { 4087 error_setg(errp, "unable to create FDT node for PHB %d", sphb->index); 4088 return -1; 4089 } 4090 4091 /* generally SLOF creates these, for hotplug it's up to QEMU */ 4092 _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci")); 4093 4094 return 0; 4095 } 4096 4097 static bool spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, 4098 Error **errp) 4099 { 4100 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); 4101 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev); 4102 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); 4103 const unsigned windows_supported = spapr_phb_windows_supported(sphb); 4104 SpaprDrc *drc; 4105 4106 if (dev->hotplugged && !smc->dr_phb_enabled) { 4107 error_setg(errp, "PHB hotplug not supported for this machine"); 4108 return false; 4109 } 4110 4111 if (sphb->index == (uint32_t)-1) { 4112 error_setg(errp, "\"index\" for PAPR PHB is mandatory"); 4113 return false; 4114 } 4115 4116 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index); 4117 if (drc && drc->dev) { 4118 error_setg(errp, "PHB %d already attached", sphb->index); 4119 return false; 4120 } 4121 4122 /* 4123 * This will check that sphb->index doesn't exceed the maximum number of 4124 * PHBs for the current machine type. 
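* It also assigns the PHB's BUID, its PIO/MMIO window addresses and the
* LIOBNs for its DMA windows.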
4125 */ 4126 return 4127 smc->phb_placement(spapr, sphb->index, 4128 &sphb->buid, &sphb->io_win_addr, 4129 &sphb->mem_win_addr, &sphb->mem64_win_addr, 4130 windows_supported, sphb->dma_liobn, 4131 errp); 4132 } 4133 4134 static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev) 4135 { 4136 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); 4137 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); 4138 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev); 4139 SpaprDrc *drc; 4140 bool hotplugged = spapr_drc_hotplugged(dev); 4141 4142 if (!smc->dr_phb_enabled) { 4143 return; 4144 } 4145 4146 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index); 4147 /* hotplug hooks should check it's enabled before getting this far */ 4148 assert(drc); 4149 4150 /* spapr_phb_pre_plug() already checked the DRC is attachable */ 4151 spapr_drc_attach(drc, dev); 4152 4153 if (hotplugged) { 4154 spapr_hotplug_req_add_by_index(drc); 4155 } else { 4156 spapr_drc_reset(drc); 4157 } 4158 } 4159 4160 void spapr_phb_release(DeviceState *dev) 4161 { 4162 HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev); 4163 4164 hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort); 4165 object_unparent(OBJECT(dev)); 4166 } 4167 4168 static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev) 4169 { 4170 qdev_unrealize(dev); 4171 } 4172 4173 static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev, 4174 DeviceState *dev, Error **errp) 4175 { 4176 SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev); 4177 SpaprDrc *drc; 4178 4179 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index); 4180 assert(drc); 4181 4182 if (!spapr_drc_unplug_requested(drc)) { 4183 spapr_drc_unplug_request(drc); 4184 spapr_hotplug_req_remove_by_index(drc); 4185 } else { 4186 error_setg(errp, 4187 "PCI Host Bridge unplug already in progress for device %s", 4188 dev->id); 4189 } 4190 } 4191 4192 static 4193 bool spapr_tpm_proxy_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, 4194 Error **errp) 4195 { 4196 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); 4197 4198 if (spapr->tpm_proxy != NULL) { 4199 error_setg(errp, "Only one TPM proxy can be specified for this machine"); 4200 return false; 4201 } 4202 4203 return true; 4204 } 4205 4206 static void spapr_tpm_proxy_plug(HotplugHandler *hotplug_dev, DeviceState *dev) 4207 { 4208 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); 4209 SpaprTpmProxy *tpm_proxy = SPAPR_TPM_PROXY(dev); 4210 4211 /* Already checked in spapr_tpm_proxy_pre_plug() */ 4212 g_assert(spapr->tpm_proxy == NULL); 4213 4214 spapr->tpm_proxy = tpm_proxy; 4215 } 4216 4217 static void spapr_tpm_proxy_unplug(HotplugHandler *hotplug_dev, DeviceState *dev) 4218 { 4219 SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); 4220 4221 qdev_unrealize(dev); 4222 object_unparent(OBJECT(dev)); 4223 spapr->tpm_proxy = NULL; 4224 } 4225 4226 static void spapr_machine_device_plug(HotplugHandler *hotplug_dev, 4227 DeviceState *dev, Error **errp) 4228 { 4229 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { 4230 spapr_memory_plug(hotplug_dev, dev); 4231 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) { 4232 spapr_core_plug(hotplug_dev, dev); 4233 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) { 4234 spapr_phb_plug(hotplug_dev, dev); 4235 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) { 4236 spapr_tpm_proxy_plug(hotplug_dev, dev); 4237 } 4238 } 4239 4240 static void 
spapr_machine_device_unplug(HotplugHandler *hotplug_dev, 4241 DeviceState *dev, Error **errp) 4242 { 4243 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { 4244 spapr_memory_unplug(hotplug_dev, dev); 4245 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) { 4246 spapr_core_unplug(hotplug_dev, dev); 4247 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) { 4248 spapr_phb_unplug(hotplug_dev, dev); 4249 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) { 4250 spapr_tpm_proxy_unplug(hotplug_dev, dev); 4251 } 4252 } 4253 4254 bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr) 4255 { 4256 return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) || 4257 /* 4258 * CAS will process all pending unplug requests. 4259 * 4260 * HACK: a guest could theoretically have cleared all bits in OV5, 4261 * but none of the guests we care for do. 4262 */ 4263 spapr_ovec_empty(spapr->ov5_cas); 4264 } 4265 4266 static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev, 4267 DeviceState *dev, Error **errp) 4268 { 4269 SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev)); 4270 MachineClass *mc = MACHINE_GET_CLASS(sms); 4271 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 4272 4273 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { 4274 if (spapr_memory_hot_unplug_supported(sms)) { 4275 spapr_memory_unplug_request(hotplug_dev, dev, errp); 4276 } else { 4277 error_setg(errp, "Memory hot unplug not supported for this guest"); 4278 } 4279 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) { 4280 if (!mc->has_hotpluggable_cpus) { 4281 error_setg(errp, "CPU hot unplug not supported on this machine"); 4282 return; 4283 } 4284 spapr_core_unplug_request(hotplug_dev, dev, errp); 4285 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) { 4286 if (!smc->dr_phb_enabled) { 4287 error_setg(errp, "PHB hot unplug not supported on this machine"); 4288 return; 4289 } 4290 spapr_phb_unplug_request(hotplug_dev, dev, errp); 4291 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) { 4292 spapr_tpm_proxy_unplug(hotplug_dev, dev); 4293 } 4294 } 4295 4296 static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev, 4297 DeviceState *dev, Error **errp) 4298 { 4299 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { 4300 spapr_memory_pre_plug(hotplug_dev, dev, errp); 4301 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) { 4302 spapr_core_pre_plug(hotplug_dev, dev, errp); 4303 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) { 4304 spapr_phb_pre_plug(hotplug_dev, dev, errp); 4305 } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) { 4306 spapr_tpm_proxy_pre_plug(hotplug_dev, dev, errp); 4307 } 4308 } 4309 4310 static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine, 4311 DeviceState *dev) 4312 { 4313 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) || 4314 object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) || 4315 object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE) || 4316 object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) { 4317 return HOTPLUG_HANDLER(machine); 4318 } 4319 if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { 4320 PCIDevice *pcidev = PCI_DEVICE(dev); 4321 PCIBus *root = pci_device_root_bus(pcidev); 4322 SpaprPhbState *phb = 4323 (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent), 4324 TYPE_SPAPR_PCI_HOST_BRIDGE); 4325 4326 if (phb) { 4327 return HOTPLUG_HANDLER(phb); 
4328 }
4329 }
4330 return NULL;
4331 }
4332
4333 static CpuInstanceProperties
4334 spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
4335 {
4336 CPUArchId *core_slot;
4337 MachineClass *mc = MACHINE_GET_CLASS(machine);
4338
4339 /* make sure possible_cpus is initialized */
4340 mc->possible_cpu_arch_ids(machine);
4341 /* get the CPU core slot containing the thread that matches cpu_index */
4342 core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
4343 assert(core_slot);
4344 return core_slot->props;
4345 }
4346
4347 static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
4348 {
4349 return idx / ms->smp.cores % ms->numa_state->num_nodes;
4350 }
4351
4352 static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
4353 {
4354 int i;
4355 unsigned int smp_threads = machine->smp.threads;
4356 unsigned int smp_cpus = machine->smp.cpus;
4357 const char *core_type;
4358 int spapr_max_cores = machine->smp.max_cpus / smp_threads;
4359 MachineClass *mc = MACHINE_GET_CLASS(machine);
4360
4361 if (!mc->has_hotpluggable_cpus) {
4362 spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
4363 }
4364 if (machine->possible_cpus) {
4365 assert(machine->possible_cpus->len == spapr_max_cores);
4366 return machine->possible_cpus;
4367 }
4368
4369 core_type = spapr_get_cpu_core_type(machine->cpu_type);
4370 if (!core_type) {
4371 error_report("Unable to find sPAPR CPU Core definition");
4372 exit(1);
4373 }
4374
4375 machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
4376 sizeof(CPUArchId) * spapr_max_cores);
4377 machine->possible_cpus->len = spapr_max_cores;
4378 for (i = 0; i < machine->possible_cpus->len; i++) {
4379 int core_id = i * smp_threads;
4380
4381 machine->possible_cpus->cpus[i].type = core_type;
4382 machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
4383 machine->possible_cpus->cpus[i].arch_id = core_id;
4384 machine->possible_cpus->cpus[i].props.has_core_id = true;
4385 machine->possible_cpus->cpus[i].props.core_id = core_id;
4386 }
4387 return machine->possible_cpus;
4388 }
4389
4390 static bool spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
4391 uint64_t *buid, hwaddr *pio,
4392 hwaddr *mmio32, hwaddr *mmio64,
4393 unsigned n_dma, uint32_t *liobns, Error **errp)
4394 {
4395 /*
4396 * New-style PHB window placement.
4397 *
4398 * Goals: Give each PHB a large (1TiB), naturally aligned 64-bit MMIO
4399 * window, in addition to 2GiB 32-bit MMIO and 64kiB PIO
4400 * windows.
4401 *
4402 * Some guest kernels can't work with MMIO windows above 1<<46
4403 * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB.
4404 *
4405 * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each
4406 * PHB stacked together. (32TiB+2GiB)..(32TiB+64GiB) contains the
4407 * 2GiB 32-bit MMIO windows for each PHB. Then 33..64TiB has the
4408 * 1TiB 64-bit MMIO windows for each PHB.
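* In other words, for PHB index i: PIO at 32TiB + i * 64kiB, 32-bit MMIO
* at 32TiB + (i + 1) * 2GiB and 64-bit MMIO at 32TiB + (i + 1) * 1TiB.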
4409 */ 4410 const uint64_t base_buid = 0x800000020000000ULL; 4411 int i; 4412 4413 /* Sanity check natural alignments */ 4414 QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0); 4415 QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0); 4416 QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0); 4417 QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0); 4418 /* Sanity check bounds */ 4419 QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) > 4420 SPAPR_PCI_MEM32_WIN_SIZE); 4421 QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) > 4422 SPAPR_PCI_MEM64_WIN_SIZE); 4423 4424 if (index >= SPAPR_MAX_PHBS) { 4425 error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)", 4426 SPAPR_MAX_PHBS - 1); 4427 return false; 4428 } 4429 4430 *buid = base_buid + index; 4431 for (i = 0; i < n_dma; ++i) { 4432 liobns[i] = SPAPR_PCI_LIOBN(index, i); 4433 } 4434 4435 *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE; 4436 *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE; 4437 *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE; 4438 return true; 4439 } 4440 4441 static ICSState *spapr_ics_get(XICSFabric *dev, int irq) 4442 { 4443 SpaprMachineState *spapr = SPAPR_MACHINE(dev); 4444 4445 return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL; 4446 } 4447 4448 static void spapr_ics_resend(XICSFabric *dev) 4449 { 4450 SpaprMachineState *spapr = SPAPR_MACHINE(dev); 4451 4452 ics_resend(spapr->ics); 4453 } 4454 4455 static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id) 4456 { 4457 PowerPCCPU *cpu = spapr_find_cpu(vcpu_id); 4458 4459 return cpu ? spapr_cpu_state(cpu)->icp : NULL; 4460 } 4461 4462 static void spapr_pic_print_info(InterruptStatsProvider *obj, GString *buf) 4463 { 4464 SpaprMachineState *spapr = SPAPR_MACHINE(obj); 4465 4466 spapr_irq_print_info(spapr, buf); 4467 g_string_append_printf(buf, "irqchip: %s\n", 4468 kvm_irqchip_in_kernel() ? "in-kernel" : "emulated"); 4469 } 4470 4471 /* 4472 * This is a XIVE only operation 4473 */ 4474 static bool spapr_match_nvt(XiveFabric *xfb, uint8_t format, 4475 uint8_t nvt_blk, uint32_t nvt_idx, 4476 bool crowd, bool cam_ignore, uint8_t priority, 4477 uint32_t logic_serv, XiveTCTXMatch *match) 4478 { 4479 SpaprMachineState *spapr = SPAPR_MACHINE(xfb); 4480 XivePresenter *xptr = XIVE_PRESENTER(spapr->active_intc); 4481 XivePresenterClass *xpc = XIVE_PRESENTER_GET_CLASS(xptr); 4482 4483 /* 4484 * When we implement the save and restore of the thread interrupt 4485 * contexts in the enter/exit CPU handlers of the machine and the 4486 * escalations in QEMU, we should be able to handle non dispatched 4487 * vCPUs. 4488 * 4489 * Until this is done, the sPAPR machine should find at least one 4490 * matching context always. 
4491 */ 4492 if (!xpc->match_nvt(xptr, format, nvt_blk, nvt_idx, crowd, cam_ignore, 4493 priority, logic_serv, match)) { 4494 qemu_log_mask(LOG_GUEST_ERROR, "XIVE: NVT %x/%x is not dispatched\n", 4495 nvt_blk, nvt_idx); 4496 return false; 4497 } 4498 4499 return true; 4500 } 4501 4502 int spapr_get_vcpu_id(PowerPCCPU *cpu) 4503 { 4504 return cpu->vcpu_id; 4505 } 4506 4507 bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp) 4508 { 4509 SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); 4510 MachineState *ms = MACHINE(spapr); 4511 int vcpu_id; 4512 4513 vcpu_id = spapr_vcpu_id(spapr, cpu_index); 4514 4515 if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) { 4516 error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id); 4517 error_append_hint(errp, "Adjust the number of cpus to %d " 4518 "or try to raise the number of threads per core\n", 4519 vcpu_id * ms->smp.threads / spapr->vsmt); 4520 return false; 4521 } 4522 4523 cpu->vcpu_id = vcpu_id; 4524 return true; 4525 } 4526 4527 PowerPCCPU *spapr_find_cpu(int vcpu_id) 4528 { 4529 CPUState *cs; 4530 4531 CPU_FOREACH(cs) { 4532 PowerPCCPU *cpu = POWERPC_CPU(cs); 4533 4534 if (spapr_get_vcpu_id(cpu) == vcpu_id) { 4535 return cpu; 4536 } 4537 } 4538 4539 return NULL; 4540 } 4541 4542 static bool spapr_cpu_in_nested(PowerPCCPU *cpu) 4543 { 4544 SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); 4545 4546 return spapr_cpu->in_nested; 4547 } 4548 4549 static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) 4550 { 4551 SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); 4552 4553 /* These are only called by TCG, KVM maintains dispatch state */ 4554 4555 spapr_cpu->prod = false; 4556 if (spapr_cpu->vpa_addr) { 4557 CPUState *cs = CPU(cpu); 4558 uint32_t dispatch; 4559 4560 dispatch = ldl_be_phys(cs->as, 4561 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER); 4562 dispatch++; 4563 if ((dispatch & 1) != 0) { 4564 qemu_log_mask(LOG_GUEST_ERROR, 4565 "VPA: incorrect dispatch counter value for " 4566 "dispatched partition %u, correcting.\n", dispatch); 4567 dispatch++; 4568 } 4569 stl_be_phys(cs->as, 4570 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch); 4571 } 4572 } 4573 4574 static void spapr_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) 4575 { 4576 SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); 4577 4578 if (spapr_cpu->vpa_addr) { 4579 CPUState *cs = CPU(cpu); 4580 uint32_t dispatch; 4581 4582 dispatch = ldl_be_phys(cs->as, 4583 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER); 4584 dispatch++; 4585 if ((dispatch & 1) != 1) { 4586 qemu_log_mask(LOG_GUEST_ERROR, 4587 "VPA: incorrect dispatch counter value for " 4588 "preempted partition %u, correcting.\n", dispatch); 4589 dispatch++; 4590 } 4591 stl_be_phys(cs->as, 4592 spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch); 4593 } 4594 } 4595 4596 static void spapr_machine_class_init(ObjectClass *oc, const void *data) 4597 { 4598 MachineClass *mc = MACHINE_CLASS(oc); 4599 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc); 4600 FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc); 4601 NMIClass *nc = NMI_CLASS(oc); 4602 HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); 4603 PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc); 4604 XICSFabricClass *xic = XICS_FABRIC_CLASS(oc); 4605 InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc); 4606 XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc); 4607 VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc); 4608 4609 mc->desc = "pSeries Logical Partition (PAPR compliant)"; 4610 
mc->ignore_boot_device_suffixes = true; 4611
4612 /*
4613 * We set up the default / latest behaviour here. The class_init
4614 * functions for the specific versioned machine types can override
4615 * these details for backwards compatibility.
4616 */
4617 mc->init = spapr_machine_init;
4618 mc->reset = spapr_machine_reset;
4619 mc->block_default_type = IF_SCSI;
4620
4621 /*
4622 * While KVM determines max cpus in kvm_init() using kvm_max_vcpus(),
4623 * in TCG the limit is restricted by the range of CPU IPIs available.
4624 */
4625 mc->max_cpus = SPAPR_IRQ_NR_IPIS;
4626
4627 mc->no_parallel = 1;
4628 mc->default_boot_order = "";
4629 mc->default_ram_size = 512 * MiB;
4630 mc->default_ram_id = "ppc_spapr.ram";
4631 mc->default_display = "std";
4632 mc->kvm_type = spapr_kvm_type;
4633 machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
4634 mc->pci_allow_0_address = true;
4635 assert(!mc->get_hotplug_handler);
4636 mc->get_hotplug_handler = spapr_get_hotplug_handler;
4637 hc->pre_plug = spapr_machine_device_pre_plug;
4638 hc->plug = spapr_machine_device_plug;
4639 mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
4640 mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
4641 mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
4642 hc->unplug_request = spapr_machine_device_unplug_request;
4643 hc->unplug = spapr_machine_device_unplug;
4644
4645 smc->update_dt_enabled = true;
4646 mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power10_v2.0");
4647 mc->has_hotpluggable_cpus = true;
4648 mc->nvdimm_supported = true;
4649 smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
4650 fwc->get_dev_path = spapr_get_fw_dev_path;
4651 nc->nmi_monitor_handler = spapr_nmi;
4652 smc->phb_placement = spapr_phb_placement;
4653 vhc->cpu_in_nested = spapr_cpu_in_nested;
4654 vhc->deliver_hv_excp = spapr_exit_nested;
4655 vhc->hypercall = emulate_spapr_hypercall;
4656 vhc->hpt_mask = spapr_hpt_mask;
4657 vhc->map_hptes = spapr_map_hptes;
4658 vhc->unmap_hptes = spapr_unmap_hptes;
4659 vhc->hpte_set_c = spapr_hpte_set_c;
4660 vhc->hpte_set_r = spapr_hpte_set_r;
4661 vhc->get_pate = spapr_get_pate;
4662 vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
4663 vhc->cpu_exec_enter = spapr_cpu_exec_enter;
4664 vhc->cpu_exec_exit = spapr_cpu_exec_exit;
4665 xic->ics_get = spapr_ics_get;
4666 xic->ics_resend = spapr_ics_resend;
4667 xic->icp_get = spapr_icp_get;
4668 ispc->print_info = spapr_pic_print_info;
4669 /* Force NUMA node memory size to be a multiple of
4670 * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
4671 * at which LMBs are represented and hot-added.
4672 */
4673 mc->numa_mem_align_shift = 28;
4674 mc->auto_enable_numa = true;
4675
4676 smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
4677 smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
4678 smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
4679 smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
4680 smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
4681 smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
4682 smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
4683 smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
4684 smc->default_caps.caps[SPAPR_CAP_NESTED_PAPR] = SPAPR_CAP_OFF;
4685 smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
4686 smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
4687 smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
4688
smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF; 4689 smc->default_caps.caps[SPAPR_CAP_DAWR1] = SPAPR_CAP_ON; 4690 4691 /* 4692 * This cap specifies whether the AIL 3 mode for 4693 * H_SET_RESOURCE is supported. The default is modified 4694 * by default_caps_with_cpu(). 4695 */ 4696 smc->default_caps.caps[SPAPR_CAP_AIL_MODE_3] = SPAPR_CAP_ON; 4697 spapr_caps_add_properties(smc); 4698 smc->irq = &spapr_irq_dual; 4699 smc->dr_phb_enabled = true; 4700 smc->linux_pci_probe = true; 4701 smc->smp_threads_vsmt = true; 4702 smc->nr_xirqs = SPAPR_NR_XIRQS; 4703 xfc->match_nvt = spapr_match_nvt; 4704 vmc->client_architecture_support = spapr_vof_client_architecture_support; 4705 vmc->quiesce = spapr_vof_quiesce; 4706 vmc->setprop = spapr_vof_setprop; 4707 } 4708 4709 static const TypeInfo spapr_machine_info = { 4710 .name = TYPE_SPAPR_MACHINE, 4711 .parent = TYPE_MACHINE, 4712 .abstract = true, 4713 .instance_size = sizeof(SpaprMachineState), 4714 .instance_init = spapr_instance_init, 4715 .instance_finalize = spapr_machine_finalizefn, 4716 .class_size = sizeof(SpaprMachineClass), 4717 .class_init = spapr_machine_class_init, 4718 .interfaces = (const InterfaceInfo[]) { 4719 { TYPE_FW_PATH_PROVIDER }, 4720 { TYPE_NMI }, 4721 { TYPE_HOTPLUG_HANDLER }, 4722 { TYPE_PPC_VIRTUAL_HYPERVISOR }, 4723 { TYPE_XICS_FABRIC }, 4724 { TYPE_INTERRUPT_STATS_PROVIDER }, 4725 { TYPE_XIVE_FABRIC }, 4726 { TYPE_VOF_MACHINE_IF }, 4727 { } 4728 }, 4729 }; 4730 4731 static void spapr_machine_latest_class_options(MachineClass *mc) 4732 { 4733 mc->alias = "pseries"; 4734 mc->is_default = true; 4735 } 4736 4737 #define DEFINE_SPAPR_MACHINE_IMPL(latest, ...) \ 4738 static void MACHINE_VER_SYM(class_init, spapr, __VA_ARGS__)( \ 4739 ObjectClass *oc, \ 4740 const void *data) \ 4741 { \ 4742 MachineClass *mc = MACHINE_CLASS(oc); \ 4743 MACHINE_VER_SYM(class_options, spapr, __VA_ARGS__)(mc); \ 4744 MACHINE_VER_DEPRECATION(__VA_ARGS__); \ 4745 if (latest) { \ 4746 spapr_machine_latest_class_options(mc); \ 4747 } \ 4748 } \ 4749 static const TypeInfo MACHINE_VER_SYM(info, spapr, __VA_ARGS__) = \ 4750 { \ 4751 .name = MACHINE_VER_TYPE_NAME("pseries", __VA_ARGS__), \ 4752 .parent = TYPE_SPAPR_MACHINE, \ 4753 .class_init = MACHINE_VER_SYM(class_init, spapr, __VA_ARGS__), \ 4754 }; \ 4755 static void MACHINE_VER_SYM(register, spapr, __VA_ARGS__)(void) \ 4756 { \ 4757 MACHINE_VER_DELETION(__VA_ARGS__); \ 4758 type_register_static(&MACHINE_VER_SYM(info, spapr, __VA_ARGS__)); \ 4759 } \ 4760 type_init(MACHINE_VER_SYM(register, spapr, __VA_ARGS__)) 4761 4762 #define DEFINE_SPAPR_MACHINE_AS_LATEST(major, minor) \ 4763 DEFINE_SPAPR_MACHINE_IMPL(true, major, minor) 4764 #define DEFINE_SPAPR_MACHINE(major, minor) \ 4765 DEFINE_SPAPR_MACHINE_IMPL(false, major, minor) 4766 4767 /* 4768 * pseries-10.1 4769 */ 4770 static void spapr_machine_10_1_class_options(MachineClass *mc) 4771 { 4772 /* Defaults for the latest behaviour inherited from the base class */ 4773 } 4774 4775 DEFINE_SPAPR_MACHINE_AS_LATEST(10, 1); 4776 4777 /* 4778 * pseries-10.0 4779 */ 4780 static void spapr_machine_10_0_class_options(MachineClass *mc) 4781 { 4782 spapr_machine_10_1_class_options(mc); 4783 compat_props_add(mc->compat_props, hw_compat_10_0, hw_compat_10_0_len); 4784 } 4785 4786 DEFINE_SPAPR_MACHINE(10, 0); 4787 4788 /* 4789 * pseries-9.2 4790 */ 4791 static void spapr_machine_9_2_class_options(MachineClass *mc) 4792 { 4793 spapr_machine_10_0_class_options(mc); 4794 compat_props_add(mc->compat_props, hw_compat_9_2, hw_compat_9_2_len); 4795 } 4796 4797 
DEFINE_SPAPR_MACHINE(9, 2); 4798 4799 /* 4800 * pseries-9.1 4801 */ 4802 static void spapr_machine_9_1_class_options(MachineClass *mc) 4803 { 4804 spapr_machine_9_2_class_options(mc); 4805 compat_props_add(mc->compat_props, hw_compat_9_1, hw_compat_9_1_len); 4806 } 4807 4808 DEFINE_SPAPR_MACHINE(9, 1); 4809 4810 /* 4811 * pseries-9.0 4812 */ 4813 static void spapr_machine_9_0_class_options(MachineClass *mc) 4814 { 4815 spapr_machine_9_1_class_options(mc); 4816 compat_props_add(mc->compat_props, hw_compat_9_0, hw_compat_9_0_len); 4817 } 4818 4819 DEFINE_SPAPR_MACHINE(9, 0); 4820 4821 /* 4822 * pseries-8.2 4823 */ 4824 static void spapr_machine_8_2_class_options(MachineClass *mc) 4825 { 4826 spapr_machine_9_0_class_options(mc); 4827 compat_props_add(mc->compat_props, hw_compat_8_2, hw_compat_8_2_len); 4828 mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.2"); 4829 } 4830 4831 DEFINE_SPAPR_MACHINE(8, 2); 4832 4833 /* 4834 * pseries-8.1 4835 */ 4836 static void spapr_machine_8_1_class_options(MachineClass *mc) 4837 { 4838 spapr_machine_8_2_class_options(mc); 4839 compat_props_add(mc->compat_props, hw_compat_8_1, hw_compat_8_1_len); 4840 } 4841 4842 DEFINE_SPAPR_MACHINE(8, 1); 4843 4844 /* 4845 * pseries-8.0 4846 */ 4847 static void spapr_machine_8_0_class_options(MachineClass *mc) 4848 { 4849 spapr_machine_8_1_class_options(mc); 4850 compat_props_add(mc->compat_props, hw_compat_8_0, hw_compat_8_0_len); 4851 } 4852 4853 DEFINE_SPAPR_MACHINE(8, 0); 4854 4855 /* 4856 * pseries-7.2 4857 */ 4858 static void spapr_machine_7_2_class_options(MachineClass *mc) 4859 { 4860 spapr_machine_8_0_class_options(mc); 4861 compat_props_add(mc->compat_props, hw_compat_7_2, hw_compat_7_2_len); 4862 } 4863 4864 DEFINE_SPAPR_MACHINE(7, 2); 4865 4866 /* 4867 * pseries-7.1 4868 */ 4869 static void spapr_machine_7_1_class_options(MachineClass *mc) 4870 { 4871 spapr_machine_7_2_class_options(mc); 4872 compat_props_add(mc->compat_props, hw_compat_7_1, hw_compat_7_1_len); 4873 } 4874 4875 DEFINE_SPAPR_MACHINE(7, 1); 4876 4877 /* 4878 * pseries-7.0 4879 */ 4880 static void spapr_machine_7_0_class_options(MachineClass *mc) 4881 { 4882 spapr_machine_7_1_class_options(mc); 4883 compat_props_add(mc->compat_props, hw_compat_7_0, hw_compat_7_0_len); 4884 } 4885 4886 DEFINE_SPAPR_MACHINE(7, 0); 4887 4888 /* 4889 * pseries-6.2 4890 */ 4891 static void spapr_machine_6_2_class_options(MachineClass *mc) 4892 { 4893 spapr_machine_7_0_class_options(mc); 4894 compat_props_add(mc->compat_props, hw_compat_6_2, hw_compat_6_2_len); 4895 } 4896 4897 DEFINE_SPAPR_MACHINE(6, 2); 4898 4899 /* 4900 * pseries-6.1 4901 */ 4902 static void spapr_machine_6_1_class_options(MachineClass *mc) 4903 { 4904 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 4905 4906 spapr_machine_6_2_class_options(mc); 4907 compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len); 4908 smc->pre_6_2_numa_affinity = true; 4909 mc->smp_props.prefer_sockets = true; 4910 } 4911 4912 DEFINE_SPAPR_MACHINE(6, 1); 4913 4914 /* 4915 * pseries-6.0 4916 */ 4917 static void spapr_machine_6_0_class_options(MachineClass *mc) 4918 { 4919 spapr_machine_6_1_class_options(mc); 4920 compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len); 4921 } 4922 4923 DEFINE_SPAPR_MACHINE(6, 0); 4924 4925 /* 4926 * pseries-5.2 4927 */ 4928 static void spapr_machine_5_2_class_options(MachineClass *mc) 4929 { 4930 spapr_machine_6_0_class_options(mc); 4931 compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len); 4932 } 4933 4934 DEFINE_SPAPR_MACHINE(5, 2); 4935 
4936 /* 4937 * pseries-5.1 4938 */ 4939 static void spapr_machine_5_1_class_options(MachineClass *mc) 4940 { 4941 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 4942 4943 spapr_machine_5_2_class_options(mc); 4944 compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len); 4945 smc->pre_5_2_numa_associativity = true; 4946 } 4947 4948 DEFINE_SPAPR_MACHINE(5, 1); 4949 4950 /* 4951 * pseries-5.0 4952 */ 4953 static void spapr_machine_5_0_class_options(MachineClass *mc) 4954 { 4955 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 4956 static GlobalProperty compat[] = { 4957 { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" }, 4958 }; 4959 4960 spapr_machine_5_1_class_options(mc); 4961 compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len); 4962 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); 4963 mc->numa_mem_supported = true; 4964 smc->pre_5_1_assoc_refpoints = true; 4965 } 4966 4967 DEFINE_SPAPR_MACHINE(5, 0); 4968 4969 /* 4970 * pseries-4.2 4971 */ 4972 static void spapr_machine_4_2_class_options(MachineClass *mc) 4973 { 4974 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 4975 4976 spapr_machine_5_0_class_options(mc); 4977 compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len); 4978 smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF; 4979 smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_OFF; 4980 smc->rma_limit = 16 * GiB; 4981 mc->nvdimm_supported = false; 4982 } 4983 4984 DEFINE_SPAPR_MACHINE(4, 2); 4985 4986 /* 4987 * pseries-4.1 4988 */ 4989 static void spapr_machine_4_1_class_options(MachineClass *mc) 4990 { 4991 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 4992 static GlobalProperty compat[] = { 4993 /* Only allow 4kiB and 64kiB IOMMU pagesizes */ 4994 { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" }, 4995 }; 4996 4997 spapr_machine_4_2_class_options(mc); 4998 smc->linux_pci_probe = false; 4999 smc->smp_threads_vsmt = false; 5000 compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len); 5001 compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); 5002 } 5003 5004 DEFINE_SPAPR_MACHINE(4, 1); 5005 5006 /* 5007 * pseries-4.0 5008 */ 5009 static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index, 5010 uint64_t *buid, hwaddr *pio, 5011 hwaddr *mmio32, hwaddr *mmio64, 5012 unsigned n_dma, uint32_t *liobns, Error **errp) 5013 { 5014 if (!spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma, 5015 liobns, errp)) { 5016 return false; 5017 } 5018 return true; 5019 } 5020 static void spapr_machine_4_0_class_options(MachineClass *mc) 5021 { 5022 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 5023 5024 spapr_machine_4_1_class_options(mc); 5025 compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); 5026 smc->phb_placement = phb_placement_4_0; 5027 smc->irq = &spapr_irq_xics; 5028 smc->pre_4_1_migration = true; 5029 } 5030 5031 DEFINE_SPAPR_MACHINE(4, 0); 5032 5033 /* 5034 * pseries-3.1 5035 */ 5036 static void spapr_machine_3_1_class_options(MachineClass *mc) 5037 { 5038 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 5039 5040 spapr_machine_4_0_class_options(mc); 5041 compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len); 5042 5043 mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0"); 5044 smc->update_dt_enabled = false; 5045 smc->dr_phb_enabled = false; 5046 smc->broken_host_serial_model = true; 5047 smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN; 5048 smc->default_caps.caps[SPAPR_CAP_SBBC] = 
SPAPR_CAP_BROKEN; 5049 smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN; 5050 smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF; 5051 } 5052 5053 DEFINE_SPAPR_MACHINE(3, 1); 5054 5055 /* 5056 * pseries-3.0 5057 */ 5058 5059 static void spapr_machine_3_0_class_options(MachineClass *mc) 5060 { 5061 SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); 5062 5063 spapr_machine_3_1_class_options(mc); 5064 compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len); 5065 5066 smc->legacy_irq_allocation = true; 5067 smc->nr_xirqs = 0x400; 5068 smc->irq = &spapr_irq_xics_legacy; 5069 } 5070 5071 DEFINE_SPAPR_MACHINE(3, 0); 5072 5073 static void spapr_machine_register_types(void) 5074 { 5075 type_register_static(&spapr_machine_info); 5076 } 5077 5078 type_init(spapr_machine_register_types) 5079