xref: /openbmc/qemu/hw/ppc/spapr.c (revision 88f62c2b)
1 /*
2  * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
3  *
4  * Copyright (c) 2004-2007 Fabrice Bellard
5  * Copyright (c) 2007 Jocelyn Mayer
6  * Copyright (c) 2010 David Gibson, IBM Corporation.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  *
26  */
27 #include "sysemu/sysemu.h"
28 #include "hw/hw.h"
29 #include "elf.h"
30 #include "net/net.h"
31 #include "sysemu/blockdev.h"
32 #include "sysemu/cpus.h"
33 #include "sysemu/kvm.h"
34 #include "kvm_ppc.h"
35 
36 #include "hw/boards.h"
37 #include "hw/ppc/ppc.h"
38 #include "hw/loader.h"
39 
40 #include "hw/ppc/spapr.h"
41 #include "hw/ppc/spapr_vio.h"
42 #include "hw/pci-host/spapr.h"
43 #include "hw/ppc/xics.h"
44 #include "hw/pci/msi.h"
45 
46 #include "hw/pci/pci.h"
47 
48 #include "exec/address-spaces.h"
49 #include "hw/usb.h"
50 #include "qemu/config-file.h"
51 
52 #include <libfdt.h>
53 
/*
 * Guest memory layout as used by SLOF:
 *
 * The raw SLOF image is loaded at address 0.  SLOF copies its romfs to
 * right below the flattened device tree and then positions itself 31M
 * below that.
 *
 * FW_OVERHEAD is therefore set to 40MB, which should account for all of
 * that and more.
 *
 * The kernel is loaded at 4M (KERNEL_LOAD_ADDR == FW_MAX_SIZE), leaving
 * space below it for the initial SLOF image.
 */

/* Firmware image: file name, maximum size, and space reserved for it */
#define FW_FILE_NAME            "slof.bin"
#define FW_MAX_SIZE             0x400000
#define FW_OVERHEAD             0x2800000
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

/* Upper bounds for the flattened device tree and the RTAS blob */
#define FDT_MAX_SIZE            0x10000
#define RTAS_MAX_SIZE           0x10000

/* Smallest RMA SLOF can work with, in megabytes */
#define MIN_RMA_SLOF            128UL

/* Timebase frequency in Hz (512 MHz) */
#define TIMEBASE_FREQ           512000000ULL

/* Machine limits */
#define MAX_CPUS                256
#define XICS_IRQS               1024

/* phandle given to the interrupt-controller node */
#define PHANDLE_XICP            0x00001111

/* Size in bytes of the hash page table described by (spapr)->htab_shift */
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
81 
/* Machine-wide sPAPR state; allocated (zero-filled) in ppc_spapr_init() */
sPAPREnvironment *spapr;
83 
84 int spapr_allocate_irq(int hint, bool lsi)
85 {
86     int irq;
87 
88     if (hint) {
89         irq = hint;
90         /* FIXME: we should probably check for collisions somehow */
91     } else {
92         irq = spapr->next_irq++;
93     }
94 
95     /* Configure irq type */
96     if (!xics_get_qirq(spapr->icp, irq)) {
97         return 0;
98     }
99 
100     xics_set_irq_type(spapr->icp, irq, lsi);
101 
102     return irq;
103 }
104 
/*
 * Allocate a block of @num consecutive IRQs of the given type.
 *
 * Returns the number of the first IRQ in the block, or -1 if any single
 * allocation fails (also -1 when @num is not positive, since nothing is
 * allocated in that case).
 */
int spapr_allocate_irq_block(int num, bool lsi)
{
    int first = -1;
    int n = 0;

    while (n < num) {
        int irq = spapr_allocate_irq(0, lsi);

        if (!irq) {
            return -1;
        }

        if (n == 0) {
            first = irq;
        }

        /* Un-hinted allocations are expected to hand out consecutive
         * numbers; anything else is an internal bug */
        assert(irq == (first + n));
        n++;
    }

    return first;
}
130 
131 static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
132 {
133     int ret = 0, offset;
134     CPUPPCState *env;
135     CPUState *cpu;
136     char cpu_model[32];
137     int smt = kvmppc_smt_threads();
138     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
139 
140     assert(spapr->cpu_model);
141 
142     for (env = first_cpu; env != NULL; env = env->next_cpu) {
143         cpu = CPU(ppc_env_get_cpu(env));
144         uint32_t associativity[] = {cpu_to_be32(0x5),
145                                     cpu_to_be32(0x0),
146                                     cpu_to_be32(0x0),
147                                     cpu_to_be32(0x0),
148                                     cpu_to_be32(cpu->numa_node),
149                                     cpu_to_be32(cpu->cpu_index)};
150 
151         if ((cpu->cpu_index % smt) != 0) {
152             continue;
153         }
154 
155         snprintf(cpu_model, 32, "/cpus/%s@%x", spapr->cpu_model,
156                  cpu->cpu_index);
157 
158         offset = fdt_path_offset(fdt, cpu_model);
159         if (offset < 0) {
160             return offset;
161         }
162 
163         if (nb_numa_nodes > 1) {
164             ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
165                               sizeof(associativity));
166             if (ret < 0) {
167                 return ret;
168             }
169         }
170 
171         ret = fdt_setprop(fdt, offset, "ibm,pft-size",
172                           pft_size_prop, sizeof(pft_size_prop));
173         if (ret < 0) {
174             return ret;
175         }
176     }
177     return ret;
178 }
179 
180 
181 static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
182                                      size_t maxsize)
183 {
184     size_t maxcells = maxsize / sizeof(uint32_t);
185     int i, j, count;
186     uint32_t *p = prop;
187 
188     for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
189         struct ppc_one_seg_page_size *sps = &env->sps.sps[i];
190 
191         if (!sps->page_shift) {
192             break;
193         }
194         for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
195             if (sps->enc[count].page_shift == 0) {
196                 break;
197             }
198         }
199         if ((p - prop) >= (maxcells - 3 - count * 2)) {
200             break;
201         }
202         *(p++) = cpu_to_be32(sps->page_shift);
203         *(p++) = cpu_to_be32(sps->slb_enc);
204         *(p++) = cpu_to_be32(count);
205         for (j = 0; j < count; j++) {
206             *(p++) = cpu_to_be32(sps->enc[j].page_shift);
207             *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
208         }
209     }
210 
211     return (p - prop) * sizeof(uint32_t);
212 }
213 
/*
 * Evaluate a libfdt call and terminate QEMU if it reports an error
 * (negative return value), printing the failing expression and the
 * libfdt error string.
 *
 * The temporary has a deliberately unusual name so that it cannot shadow
 * a variable used inside @exp (several callers have a local named "ret").
 */
#define _FDT(exp) \
    do { \
        int fdt_macro_rc_ = (exp);                                 \
        if (fdt_macro_rc_ < 0) {                                   \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(fdt_macro_rc_));            \
            exit(1);                                               \
        }                                                          \
    } while (0)
223 
224 
225 static void *spapr_create_fdt_skel(const char *cpu_model,
226                                    hwaddr initrd_base,
227                                    hwaddr initrd_size,
228                                    hwaddr kernel_size,
229                                    const char *boot_device,
230                                    const char *kernel_cmdline,
231                                    uint32_t epow_irq)
232 {
233     void *fdt;
234     CPUPPCState *env;
235     uint32_t start_prop = cpu_to_be32(initrd_base);
236     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
237     char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
238         "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
239     char qemu_hypertas_prop[] = "hcall-memop1";
240     uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
241     uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
242     char *modelname;
243     int i, smt = kvmppc_smt_threads();
244     unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
245 
246     fdt = g_malloc0(FDT_MAX_SIZE);
247     _FDT((fdt_create(fdt, FDT_MAX_SIZE)));
248 
249     if (kernel_size) {
250         _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
251     }
252     if (initrd_size) {
253         _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
254     }
255     _FDT((fdt_finish_reservemap(fdt)));
256 
257     /* Root node */
258     _FDT((fdt_begin_node(fdt, "")));
259     _FDT((fdt_property_string(fdt, "device_type", "chrp")));
260     _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
261     _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));
262 
263     _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
264     _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));
265 
266     /* /chosen */
267     _FDT((fdt_begin_node(fdt, "chosen")));
268 
269     /* Set Form1_affinity */
270     _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));
271 
272     _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
273     _FDT((fdt_property(fdt, "linux,initrd-start",
274                        &start_prop, sizeof(start_prop))));
275     _FDT((fdt_property(fdt, "linux,initrd-end",
276                        &end_prop, sizeof(end_prop))));
277     if (kernel_size) {
278         uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
279                               cpu_to_be64(kernel_size) };
280 
281         _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
282     }
283     if (boot_device) {
284         _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
285     }
286     _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
287     _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
288     _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));
289 
290     _FDT((fdt_end_node(fdt)));
291 
292     /* cpus */
293     _FDT((fdt_begin_node(fdt, "cpus")));
294 
295     _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
296     _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
297 
298     modelname = g_strdup(cpu_model);
299 
300     for (i = 0; i < strlen(modelname); i++) {
301         modelname[i] = toupper(modelname[i]);
302     }
303 
304     /* This is needed during FDT finalization */
305     spapr->cpu_model = g_strdup(modelname);
306 
307     for (env = first_cpu; env != NULL; env = env->next_cpu) {
308         CPUState *cpu = CPU(ppc_env_get_cpu(env));
309         PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
310         int index = cpu->cpu_index;
311         uint32_t servers_prop[smp_threads];
312         uint32_t gservers_prop[smp_threads * 2];
313         char *nodename;
314         uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
315                            0xffffffff, 0xffffffff};
316         uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
317         uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
318         uint32_t page_sizes_prop[64];
319         size_t page_sizes_prop_size;
320 
321         if ((index % smt) != 0) {
322             continue;
323         }
324 
325         nodename = g_strdup_printf("%s@%x", modelname, index);
326 
327         _FDT((fdt_begin_node(fdt, nodename)));
328 
329         g_free(nodename);
330 
331         _FDT((fdt_property_cell(fdt, "reg", index)));
332         _FDT((fdt_property_string(fdt, "device_type", "cpu")));
333 
334         _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR])));
335         _FDT((fdt_property_cell(fdt, "d-cache-block-size",
336                                 env->dcache_line_size)));
337         _FDT((fdt_property_cell(fdt, "d-cache-line-size",
338                                 env->dcache_line_size)));
339         _FDT((fdt_property_cell(fdt, "i-cache-block-size",
340                                 env->icache_line_size)));
341         _FDT((fdt_property_cell(fdt, "i-cache-line-size",
342                                 env->icache_line_size)));
343 
344         if (pcc->l1_dcache_size) {
345             _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size)));
346         } else {
347             fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
348         }
349         if (pcc->l1_icache_size) {
350             _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size)));
351         } else {
352             fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
353         }
354 
355         _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
356         _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
357         _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
358         _FDT((fdt_property_string(fdt, "status", "okay")));
359         _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
360 
361         /* Build interrupt servers and gservers properties */
362         for (i = 0; i < smp_threads; i++) {
363             servers_prop[i] = cpu_to_be32(index + i);
364             /* Hack, direct the group queues back to cpu 0 */
365             gservers_prop[i*2] = cpu_to_be32(index + i);
366             gservers_prop[i*2 + 1] = 0;
367         }
368         _FDT((fdt_property(fdt, "ibm,ppc-interrupt-server#s",
369                            servers_prop, sizeof(servers_prop))));
370         _FDT((fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
371                            gservers_prop, sizeof(gservers_prop))));
372 
373         if (env->mmu_model & POWERPC_MMU_1TSEG) {
374             _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
375                                segs, sizeof(segs))));
376         }
377 
378         /* Advertise VMX/VSX (vector extensions) if available
379          *   0 / no property == no vector extensions
380          *   1               == VMX / Altivec available
381          *   2               == VSX available */
382         if (env->insns_flags & PPC_ALTIVEC) {
383             uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;
384 
385             _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx)));
386         }
387 
388         /* Advertise DFP (Decimal Floating Point) if available
389          *   0 / no property == no DFP
390          *   1               == DFP available */
391         if (env->insns_flags2 & PPC2_DFP) {
392             _FDT((fdt_property_cell(fdt, "ibm,dfp", 1)));
393         }
394 
395         page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
396                                                       sizeof(page_sizes_prop));
397         if (page_sizes_prop_size) {
398             _FDT((fdt_property(fdt, "ibm,segment-page-sizes",
399                                page_sizes_prop, page_sizes_prop_size)));
400         }
401 
402         _FDT((fdt_end_node(fdt)));
403     }
404 
405     g_free(modelname);
406 
407     _FDT((fdt_end_node(fdt)));
408 
409     /* RTAS */
410     _FDT((fdt_begin_node(fdt, "rtas")));
411 
412     _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
413                        sizeof(hypertas_prop))));
414     _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas_prop,
415                        sizeof(qemu_hypertas_prop))));
416 
417     _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
418         refpoints, sizeof(refpoints))));
419 
420     _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));
421 
422     _FDT((fdt_end_node(fdt)));
423 
424     /* interrupt controller */
425     _FDT((fdt_begin_node(fdt, "interrupt-controller")));
426 
427     _FDT((fdt_property_string(fdt, "device_type",
428                               "PowerPC-External-Interrupt-Presentation")));
429     _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
430     _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
431     _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
432                        interrupt_server_ranges_prop,
433                        sizeof(interrupt_server_ranges_prop))));
434     _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
435     _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
436     _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));
437 
438     _FDT((fdt_end_node(fdt)));
439 
440     /* vdevice */
441     _FDT((fdt_begin_node(fdt, "vdevice")));
442 
443     _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
444     _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
445     _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
446     _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
447     _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
448     _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
449 
450     _FDT((fdt_end_node(fdt)));
451 
452     /* event-sources */
453     spapr_events_fdt_skel(fdt, epow_irq);
454 
455     _FDT((fdt_end_node(fdt))); /* close root node */
456     _FDT((fdt_finish(fdt)));
457 
458     return fdt;
459 }
460 
461 static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
462 {
463     uint32_t associativity[] = {cpu_to_be32(0x4), cpu_to_be32(0x0),
464                                 cpu_to_be32(0x0), cpu_to_be32(0x0),
465                                 cpu_to_be32(0x0)};
466     char mem_name[32];
467     hwaddr node0_size, mem_start;
468     uint64_t mem_reg_property[2];
469     int i, off;
470 
471     /* memory node(s) */
472     node0_size = (nb_numa_nodes > 1) ? node_mem[0] : ram_size;
473     if (spapr->rma_size > node0_size) {
474         spapr->rma_size = node0_size;
475     }
476 
477     /* RMA */
478     mem_reg_property[0] = 0;
479     mem_reg_property[1] = cpu_to_be64(spapr->rma_size);
480     off = fdt_add_subnode(fdt, 0, "memory@0");
481     _FDT(off);
482     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
483     _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
484                       sizeof(mem_reg_property))));
485     _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
486                       sizeof(associativity))));
487 
488     /* RAM: Node 0 */
489     if (node0_size > spapr->rma_size) {
490         mem_reg_property[0] = cpu_to_be64(spapr->rma_size);
491         mem_reg_property[1] = cpu_to_be64(node0_size - spapr->rma_size);
492 
493         sprintf(mem_name, "memory@" TARGET_FMT_lx, spapr->rma_size);
494         off = fdt_add_subnode(fdt, 0, mem_name);
495         _FDT(off);
496         _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
497         _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
498                           sizeof(mem_reg_property))));
499         _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
500                           sizeof(associativity))));
501     }
502 
503     /* RAM: Node 1 and beyond */
504     mem_start = node0_size;
505     for (i = 1; i < nb_numa_nodes; i++) {
506         mem_reg_property[0] = cpu_to_be64(mem_start);
507         mem_reg_property[1] = cpu_to_be64(node_mem[i]);
508         associativity[3] = associativity[4] = cpu_to_be32(i);
509         sprintf(mem_name, "memory@" TARGET_FMT_lx, mem_start);
510         off = fdt_add_subnode(fdt, 0, mem_name);
511         _FDT(off);
512         _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
513         _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
514                           sizeof(mem_reg_property))));
515         _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
516                           sizeof(associativity))));
517         mem_start += node_mem[i];
518     }
519 
520     return 0;
521 }
522 
523 static void spapr_finalize_fdt(sPAPREnvironment *spapr,
524                                hwaddr fdt_addr,
525                                hwaddr rtas_addr,
526                                hwaddr rtas_size)
527 {
528     int ret;
529     void *fdt;
530     sPAPRPHBState *phb;
531 
532     fdt = g_malloc(FDT_MAX_SIZE);
533 
534     /* open out the base tree into a temp buffer for the final tweaks */
535     _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));
536 
537     ret = spapr_populate_memory(spapr, fdt);
538     if (ret < 0) {
539         fprintf(stderr, "couldn't setup memory nodes in fdt\n");
540         exit(1);
541     }
542 
543     ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
544     if (ret < 0) {
545         fprintf(stderr, "couldn't setup vio devices in fdt\n");
546         exit(1);
547     }
548 
549     QLIST_FOREACH(phb, &spapr->phbs, list) {
550         ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
551     }
552 
553     if (ret < 0) {
554         fprintf(stderr, "couldn't setup PCI devices in fdt\n");
555         exit(1);
556     }
557 
558     /* RTAS */
559     ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
560     if (ret < 0) {
561         fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
562     }
563 
564     /* Advertise NUMA via ibm,associativity */
565     ret = spapr_fixup_cpu_dt(fdt, spapr);
566     if (ret < 0) {
567         fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
568     }
569 
570     if (!spapr->has_graphics) {
571         spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
572     }
573 
574     _FDT((fdt_pack(fdt)));
575 
576     if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
577         hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
578                  fdt_totalsize(fdt), FDT_MAX_SIZE);
579         exit(1);
580     }
581 
582     cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
583 
584     g_free(fdt);
585 }
586 
587 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
588 {
589     return (addr & 0x0fffffff) + KERNEL_LOAD_ADDR;
590 }
591 
592 static void emulate_spapr_hypercall(PowerPCCPU *cpu)
593 {
594     CPUPPCState *env = &cpu->env;
595 
596     if (msr_pr) {
597         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
598         env->gpr[3] = H_PRIVILEGE;
599     } else {
600         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
601     }
602 }
603 
604 static void spapr_reset_htab(sPAPREnvironment *spapr)
605 {
606     long shift;
607 
608     /* allocate hash page table.  For now we always make this 16mb,
609      * later we should probably make it scale to the size of guest
610      * RAM */
611 
612     shift = kvmppc_reset_htab(spapr->htab_shift);
613 
614     if (shift > 0) {
615         /* Kernel handles htab, we don't need to allocate one */
616         spapr->htab_shift = shift;
617     } else {
618         if (!spapr->htab) {
619             /* Allocate an htab if we don't yet have one */
620             spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
621         }
622 
623         /* And clear it */
624         memset(spapr->htab, 0, HTAB_SIZE(spapr));
625     }
626 
627     /* Update the RMA size if necessary */
628     if (spapr->vrma_adjust) {
629         spapr->rma_size = kvmppc_rma_size(ram_size, spapr->htab_shift);
630     }
631 }
632 
633 static void ppc_spapr_reset(void)
634 {
635     CPUState *first_cpu_cpu;
636 
637     /* Reset the hash table & recalc the RMA */
638     spapr_reset_htab(spapr);
639 
640     qemu_devices_reset();
641 
642     /* Load the fdt */
643     spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
644                        spapr->rtas_size);
645 
646     /* Set up the entry state */
647     first_cpu_cpu = ENV_GET_CPU(first_cpu);
648     first_cpu->gpr[3] = spapr->fdt_addr;
649     first_cpu->gpr[5] = 0;
650     first_cpu_cpu->halted = 0;
651     first_cpu->nip = spapr->entry_point;
652 
653 }
654 
655 static void spapr_cpu_reset(void *opaque)
656 {
657     PowerPCCPU *cpu = opaque;
658     CPUState *cs = CPU(cpu);
659     CPUPPCState *env = &cpu->env;
660 
661     cpu_reset(cs);
662 
663     /* All CPUs start halted.  CPU0 is unhalted from the machine level
664      * reset code and the rest are explicitly started up by the guest
665      * using an RTAS call */
666     cs->halted = 1;
667 
668     env->spr[SPR_HIOR] = 0;
669 
670     env->external_htab = spapr->htab;
671     env->htab_base = -1;
672     env->htab_mask = HTAB_SIZE(spapr) - 1;
673     env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
674         (spapr->htab_shift - 18);
675 }
676 
677 static void spapr_create_nvram(sPAPREnvironment *spapr)
678 {
679     QemuOpts *machine_opts;
680     DeviceState *dev;
681 
682     dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
683 
684     machine_opts = qemu_opts_find(qemu_find_opts("machine"), 0);
685     if (machine_opts) {
686         const char *drivename;
687 
688         drivename = qemu_opt_get(machine_opts, "nvram");
689         if (drivename) {
690             BlockDriverState *bs;
691 
692             bs = bdrv_find(drivename);
693             if (!bs) {
694                 fprintf(stderr, "No such block device \"%s\" for nvram\n",
695                         drivename);
696                 exit(1);
697             }
698             qdev_prop_set_drive_nofail(dev, "drive", bs);
699         }
700     }
701 
702     qdev_init_nofail(dev);
703 
704     spapr->nvram = (struct sPAPRNVRAM *)dev;
705 }
706 
707 /* Returns whether we want to use VGA or not */
708 static int spapr_vga_init(PCIBus *pci_bus)
709 {
710     switch (vga_interface_type) {
711     case VGA_NONE:
712     case VGA_STD:
713         return pci_vga_init(pci_bus) != NULL;
714     default:
715         fprintf(stderr, "This vga model is not supported,"
716                 "currently it only supports -vga std\n");
717         exit(0);
718         break;
719     }
720 }
721 
722 /* pSeries LPAR / sPAPR hardware init */
723 static void ppc_spapr_init(QEMUMachineInitArgs *args)
724 {
725     ram_addr_t ram_size = args->ram_size;
726     const char *cpu_model = args->cpu_model;
727     const char *kernel_filename = args->kernel_filename;
728     const char *kernel_cmdline = args->kernel_cmdline;
729     const char *initrd_filename = args->initrd_filename;
730     const char *boot_device = args->boot_device;
731     PowerPCCPU *cpu;
732     CPUPPCState *env;
733     PCIHostState *phb;
734     int i;
735     MemoryRegion *sysmem = get_system_memory();
736     MemoryRegion *ram = g_new(MemoryRegion, 1);
737     hwaddr rma_alloc_size;
738     uint32_t initrd_base = 0;
739     long kernel_size = 0, initrd_size = 0;
740     long load_limit, rtas_limit, fw_size;
741     char *filename;
742 
743     msi_supported = true;
744 
745     spapr = g_malloc0(sizeof(*spapr));
746     QLIST_INIT(&spapr->phbs);
747 
748     cpu_ppc_hypercall = emulate_spapr_hypercall;
749 
750     /* Allocate RMA if necessary */
751     rma_alloc_size = kvmppc_alloc_rma("ppc_spapr.rma", sysmem);
752 
753     if (rma_alloc_size == -1) {
754         hw_error("qemu: Unable to create RMA\n");
755         exit(1);
756     }
757 
758     if (rma_alloc_size && (rma_alloc_size < ram_size)) {
759         spapr->rma_size = rma_alloc_size;
760     } else {
761         spapr->rma_size = ram_size;
762 
763         /* With KVM, we don't actually know whether KVM supports an
764          * unbounded RMA (PR KVM) or is limited by the hash table size
765          * (HV KVM using VRMA), so we always assume the latter
766          *
767          * In that case, we also limit the initial allocations for RTAS
768          * etc... to 256M since we have no way to know what the VRMA size
769          * is going to be as it depends on the size of the hash table
770          * isn't determined yet.
771          */
772         if (kvm_enabled()) {
773             spapr->vrma_adjust = 1;
774             spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
775         }
776     }
777 
778     /* We place the device tree and RTAS just below either the top of the RMA,
779      * or just below 2GB, whichever is lowere, so that it can be
780      * processed with 32-bit real mode code if necessary */
781     rtas_limit = MIN(spapr->rma_size, 0x80000000);
782     spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
783     spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
784     load_limit = spapr->fdt_addr - FW_OVERHEAD;
785 
786     /* We aim for a hash table of size 1/128 the size of RAM.  The
787      * normal rule of thumb is 1/64 the size of RAM, but that's much
788      * more than needed for the Linux guests we support. */
789     spapr->htab_shift = 18; /* Minimum architected size */
790     while (spapr->htab_shift <= 46) {
791         if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
792             break;
793         }
794         spapr->htab_shift++;
795     }
796 
797     /* Set up Interrupt Controller before we create the VCPUs */
798     spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads,
799                                   XICS_IRQS);
800     spapr->next_irq = XICS_IRQ_BASE;
801 
802     /* init CPUs */
803     if (cpu_model == NULL) {
804         cpu_model = kvm_enabled() ? "host" : "POWER7";
805     }
806     for (i = 0; i < smp_cpus; i++) {
807         cpu = cpu_ppc_init(cpu_model);
808         if (cpu == NULL) {
809             fprintf(stderr, "Unable to find PowerPC CPU definition\n");
810             exit(1);
811         }
812         env = &cpu->env;
813 
814         xics_cpu_setup(spapr->icp, cpu);
815 
816         /* Set time-base frequency to 512 MHz */
817         cpu_ppc_tb_init(env, TIMEBASE_FREQ);
818 
819         /* PAPR always has exception vectors in RAM not ROM. To ensure this,
820          * MSR[IP] should never be set.
821          */
822         env->msr_mask &= ~(1 << 6);
823 
824         /* Tell KVM that we're in PAPR mode */
825         if (kvm_enabled()) {
826             kvmppc_set_papr(cpu);
827         }
828 
829         qemu_register_reset(spapr_cpu_reset, cpu);
830     }
831 
832     /* allocate RAM */
833     spapr->ram_limit = ram_size;
834     if (spapr->ram_limit > rma_alloc_size) {
835         ram_addr_t nonrma_base = rma_alloc_size;
836         ram_addr_t nonrma_size = spapr->ram_limit - rma_alloc_size;
837 
838         memory_region_init_ram(ram, "ppc_spapr.ram", nonrma_size);
839         vmstate_register_ram_global(ram);
840         memory_region_add_subregion(sysmem, nonrma_base, ram);
841     }
842 
843     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
844     spapr->rtas_size = load_image_targphys(filename, spapr->rtas_addr,
845                                            rtas_limit - spapr->rtas_addr);
846     if (spapr->rtas_size < 0) {
847         hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
848         exit(1);
849     }
850     if (spapr->rtas_size > RTAS_MAX_SIZE) {
851         hw_error("RTAS too big ! 0x%lx bytes (max is 0x%x)\n",
852                  spapr->rtas_size, RTAS_MAX_SIZE);
853         exit(1);
854     }
855     g_free(filename);
856 
857     /* Set up EPOW events infrastructure */
858     spapr_events_init(spapr);
859 
860     /* Set up IOMMU */
861     spapr_iommu_init();
862 
863     /* Set up VIO bus */
864     spapr->vio_bus = spapr_vio_bus_init();
865 
866     for (i = 0; i < MAX_SERIAL_PORTS; i++) {
867         if (serial_hds[i]) {
868             spapr_vty_create(spapr->vio_bus, serial_hds[i]);
869         }
870     }
871 
872     /* We always have at least the nvram device on VIO */
873     spapr_create_nvram(spapr);
874 
875     /* Set up PCI */
876     spapr_pci_rtas_init();
877 
878     phb = spapr_create_phb(spapr, 0);
879 
880     for (i = 0; i < nb_nics; i++) {
881         NICInfo *nd = &nd_table[i];
882 
883         if (!nd->model) {
884             nd->model = g_strdup("ibmveth");
885         }
886 
887         if (strcmp(nd->model, "ibmveth") == 0) {
888             spapr_vlan_create(spapr->vio_bus, nd);
889         } else {
890             pci_nic_init_nofail(&nd_table[i], nd->model, NULL);
891         }
892     }
893 
894     for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
895         spapr_vscsi_create(spapr->vio_bus);
896     }
897 
898     /* Graphics */
899     if (spapr_vga_init(phb->bus)) {
900         spapr->has_graphics = true;
901     }
902 
903     if (usb_enabled(spapr->has_graphics)) {
904         pci_create_simple(phb->bus, -1, "pci-ohci");
905         if (spapr->has_graphics) {
906             usbdevice_create("keyboard");
907             usbdevice_create("mouse");
908         }
909     }
910 
911     if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
912         fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
913                 "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
914         exit(1);
915     }
916 
917     if (kernel_filename) {
918         uint64_t lowaddr = 0;
919 
920         kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
921                                NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
922         if (kernel_size < 0) {
923             kernel_size = load_image_targphys(kernel_filename,
924                                               KERNEL_LOAD_ADDR,
925                                               load_limit - KERNEL_LOAD_ADDR);
926         }
927         if (kernel_size < 0) {
928             fprintf(stderr, "qemu: could not load kernel '%s'\n",
929                     kernel_filename);
930             exit(1);
931         }
932 
933         /* load initrd */
934         if (initrd_filename) {
935             /* Try to locate the initrd in the gap between the kernel
936              * and the firmware. Add a bit of space just in case
937              */
938             initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
939             initrd_size = load_image_targphys(initrd_filename, initrd_base,
940                                               load_limit - initrd_base);
941             if (initrd_size < 0) {
942                 fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
943                         initrd_filename);
944                 exit(1);
945             }
946         } else {
947             initrd_base = 0;
948             initrd_size = 0;
949         }
950     }
951 
952     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, FW_FILE_NAME);
953     fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
954     if (fw_size < 0) {
955         hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
956         exit(1);
957     }
958     g_free(filename);
959 
960     spapr->entry_point = 0x100;
961 
962     /* Prepare the device tree */
963     spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
964                                             initrd_base, initrd_size,
965                                             kernel_size,
966                                             boot_device, kernel_cmdline,
967                                             spapr->epow_irq);
968     assert(spapr->fdt_skel != NULL);
969 }
970 
971 static QEMUMachine spapr_machine = {
972     .name = "pseries",
973     .desc = "pSeries Logical Partition (PAPR compliant)",
974     .init = ppc_spapr_init,
975     .reset = ppc_spapr_reset,
976     .block_default_type = IF_SCSI,
977     .max_cpus = MAX_CPUS,
978     .no_parallel = 1,
979     .boot_order = NULL,
980 };
981 
982 static void spapr_machine_init(void)
983 {
984     qemu_register_machine(&spapr_machine);
985 }
986 
987 machine_init(spapr_machine_init);
988