xref: /openbmc/linux/arch/x86/xen/enlighten_pv.c (revision a36954f5)
1 /*
2  * Core of Xen paravirt_ops implementation.
3  *
4  * This file contains the xen_paravirt_ops structure itself, and the
5  * implementations for:
6  * - privileged instructions
7  * - interrupt flags
8  * - segment operations
9  * - booting and setup
10  *
11  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12  */
13 
14 #include <linux/cpu.h>
15 #include <linux/kernel.h>
16 #include <linux/init.h>
17 #include <linux/smp.h>
18 #include <linux/preempt.h>
19 #include <linux/hardirq.h>
20 #include <linux/percpu.h>
21 #include <linux/delay.h>
22 #include <linux/start_kernel.h>
23 #include <linux/sched.h>
24 #include <linux/kprobes.h>
25 #include <linux/bootmem.h>
26 #include <linux/export.h>
27 #include <linux/mm.h>
28 #include <linux/page-flags.h>
29 #include <linux/highmem.h>
30 #include <linux/console.h>
31 #include <linux/pci.h>
32 #include <linux/gfp.h>
33 #include <linux/memblock.h>
34 #include <linux/edd.h>
35 #include <linux/frame.h>
36 
37 #include <xen/xen.h>
38 #include <xen/events.h>
39 #include <xen/interface/xen.h>
40 #include <xen/interface/version.h>
41 #include <xen/interface/physdev.h>
42 #include <xen/interface/vcpu.h>
43 #include <xen/interface/memory.h>
44 #include <xen/interface/nmi.h>
45 #include <xen/interface/xen-mca.h>
46 #include <xen/features.h>
47 #include <xen/page.h>
48 #include <xen/hvc-console.h>
49 #include <xen/acpi.h>
50 
51 #include <asm/paravirt.h>
52 #include <asm/apic.h>
53 #include <asm/page.h>
54 #include <asm/xen/pci.h>
55 #include <asm/xen/hypercall.h>
56 #include <asm/xen/hypervisor.h>
57 #include <asm/xen/cpuid.h>
58 #include <asm/fixmap.h>
59 #include <asm/processor.h>
60 #include <asm/proto.h>
61 #include <asm/msr-index.h>
62 #include <asm/traps.h>
63 #include <asm/setup.h>
64 #include <asm/desc.h>
65 #include <asm/pgalloc.h>
66 #include <asm/pgtable.h>
67 #include <asm/tlbflush.h>
68 #include <asm/reboot.h>
69 #include <asm/stackprotector.h>
70 #include <asm/hypervisor.h>
71 #include <asm/mach_traps.h>
72 #include <asm/mwait.h>
73 #include <asm/pci_x86.h>
74 #include <asm/cpu.h>
75 
76 #ifdef CONFIG_ACPI
77 #include <linux/acpi.h>
78 #include <asm/acpi.h>
79 #include <acpi/pdc_intel.h>
80 #include <acpi/processor.h>
81 #include <xen/interface/platform.h>
82 #endif
83 
84 #include "xen-ops.h"
85 #include "mmu.h"
86 #include "smp.h"
87 #include "multicalls.h"
88 #include "pmu.h"
89 
90 void *xen_initial_gdt;
91 
92 RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
93 
94 static int xen_cpu_up_prepare_pv(unsigned int cpu);
95 static int xen_cpu_dead_pv(unsigned int cpu);
96 
97 struct tls_descs {
98 	struct desc_struct desc[3];
99 };
100 
101 /*
102  * Updating the 3 TLS descriptors in the GDT on every task switch is
103  * surprisingly expensive so we avoid updating them if they haven't
104  * changed.  Since Xen writes different descriptors than the ones
105  * passed in the update_descriptor hypercall, we keep shadow copies to
106  * compare against.
107  */
108 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
109 
110 /*
111  * On restore, set the vcpu placement up again.
112  * If it fails, then we're in a bad state, since
113  * we can't back out from using it...
114  */
115 void xen_vcpu_restore(void)
116 {
117 	int cpu;
118 
119 	for_each_possible_cpu(cpu) {
120 		bool other_cpu = (cpu != smp_processor_id());
121 		bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
122 						NULL);
123 
124 		if (other_cpu && is_up &&
125 		    HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
126 			BUG();
127 
128 		xen_setup_runstate_info(cpu);
129 
130 		if (xen_have_vcpu_info_placement)
131 			xen_vcpu_setup(cpu);
132 
133 		if (other_cpu && is_up &&
134 		    HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
135 			BUG();
136 	}
137 }
138 
139 static void __init xen_banner(void)
140 {
141 	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
142 	struct xen_extraversion extra;
143 	HYPERVISOR_xen_version(XENVER_extraversion, &extra);
144 
145 	pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
146 	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
147 	       version >> 16, version & 0xffff, extra.extraversion,
148 	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
149 }
150 /* Check if running on Xen version (major, minor) or later */
151 bool
152 xen_running_on_version_or_later(unsigned int major, unsigned int minor)
153 {
154 	unsigned int version;
155 
156 	if (!xen_domain())
157 		return false;
158 
159 	version = HYPERVISOR_xen_version(XENVER_version, NULL);
160 	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
161 		((version >> 16) > major))
162 		return true;
163 	return false;
164 }
165 
166 static __read_mostly unsigned int cpuid_leaf5_ecx_val;
167 static __read_mostly unsigned int cpuid_leaf5_edx_val;
168 
169 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
170 		      unsigned int *cx, unsigned int *dx)
171 {
172 	unsigned maskebx = ~0;
173 
174 	/*
175 	 * Mask out inconvenient features, to try and disable as many
176 	 * unsupported kernel subsystems as possible.
177 	 */
178 	switch (*ax) {
179 	case CPUID_MWAIT_LEAF:
180 		/* Synthesize the values.. */
181 		*ax = 0;
182 		*bx = 0;
183 		*cx = cpuid_leaf5_ecx_val;
184 		*dx = cpuid_leaf5_edx_val;
185 		return;
186 
187 	case 0xb:
188 		/* Suppress extended topology stuff */
189 		maskebx = 0;
190 		break;
191 	}
192 
193 	asm(XEN_EMULATE_PREFIX "cpuid"
194 		: "=a" (*ax),
195 		  "=b" (*bx),
196 		  "=c" (*cx),
197 		  "=d" (*dx)
198 		: "0" (*ax), "2" (*cx));
199 
200 	*bx &= maskebx;
201 }
202 STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
203 
204 static bool __init xen_check_mwait(void)
205 {
206 #ifdef CONFIG_ACPI
207 	struct xen_platform_op op = {
208 		.cmd			= XENPF_set_processor_pminfo,
209 		.u.set_pminfo.id	= -1,
210 		.u.set_pminfo.type	= XEN_PM_PDC,
211 	};
212 	uint32_t buf[3];
213 	unsigned int ax, bx, cx, dx;
214 	unsigned int mwait_mask;
215 
216 	/* We need to determine whether it is OK to expose the MWAIT
217 	 * capability to the kernel so that it can harvest C-states deeper
218 	 * than C3 from ACPI _CST using the processor_harvest_xen.c module.
219 	 * For this to work, we need to gather the MWAIT_LEAF values (which
220 	 * the cstate.c code checks against). The hypervisor won't expose the
221 	 * MWAIT flag because it would break backwards compatibility, so we
222 	 * find out directly from the hardware and via hypercall.
223 	 */
224 	if (!xen_initial_domain())
225 		return false;
226 
227 	/*
228 	 * When running on a platform earlier than Xen 4.2, do not expose
229 	 * MWAIT, to avoid the risk of loading the native ACPI PAD driver.
230 	 */
231 	if (!xen_running_on_version_or_later(4, 2))
232 		return false;
233 
234 	ax = 1;
235 	cx = 0;
236 
237 	native_cpuid(&ax, &bx, &cx, &dx);
238 
239 	mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
240 		     (1 << (X86_FEATURE_MWAIT % 32));
241 
242 	if ((cx & mwait_mask) != mwait_mask)
243 		return false;
244 
245 	/* We need to emulate the MWAIT_LEAF and for that we need both
246 	 * ecx and edx. The hypercall provides only partial information.
247 	 */
248 
249 	ax = CPUID_MWAIT_LEAF;
250 	bx = 0;
251 	cx = 0;
252 	dx = 0;
253 
254 	native_cpuid(&ax, &bx, &cx, &dx);
255 
256 	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
257 	 * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
258 	 */
259 	buf[0] = ACPI_PDC_REVISION_ID;
260 	buf[1] = 1;
261 	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
262 
263 	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
264 
265 	if ((HYPERVISOR_platform_op(&op) == 0) &&
266 	    (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
267 		cpuid_leaf5_ecx_val = cx;
268 		cpuid_leaf5_edx_val = dx;
269 	}
270 	return true;
271 #else
272 	return false;
273 #endif
274 }
275 
276 static bool __init xen_check_xsave(void)
277 {
278 	unsigned int cx, xsave_mask;
279 
280 	cx = cpuid_ecx(1);
281 
282 	xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) |
283 		     (1 << (X86_FEATURE_OSXSAVE % 32));
284 
285 	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
286 	return (cx & xsave_mask) == xsave_mask;
287 }
288 
289 static void __init xen_init_capabilities(void)
290 {
291 	setup_force_cpu_cap(X86_FEATURE_XENPV);
292 	setup_clear_cpu_cap(X86_FEATURE_DCA);
293 	setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
294 	setup_clear_cpu_cap(X86_FEATURE_MTRR);
295 	setup_clear_cpu_cap(X86_FEATURE_ACC);
296 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
297 
298 	if (!xen_initial_domain())
299 		setup_clear_cpu_cap(X86_FEATURE_ACPI);
300 
301 	if (xen_check_mwait())
302 		setup_force_cpu_cap(X86_FEATURE_MWAIT);
303 	else
304 		setup_clear_cpu_cap(X86_FEATURE_MWAIT);
305 
306 	if (!xen_check_xsave()) {
307 		setup_clear_cpu_cap(X86_FEATURE_XSAVE);
308 		setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
309 	}
310 }
311 
312 static void xen_set_debugreg(int reg, unsigned long val)
313 {
314 	HYPERVISOR_set_debugreg(reg, val);
315 }
316 
317 static unsigned long xen_get_debugreg(int reg)
318 {
319 	return HYPERVISOR_get_debugreg(reg);
320 }
321 
322 static void xen_end_context_switch(struct task_struct *next)
323 {
324 	xen_mc_flush();
325 	paravirt_end_context_switch(next);
326 }
327 
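/*
 * There is no meaningful task register value to report to a PV guest;
 * the TSS is presumably managed through hypercalls rather than via
 * ltr/str, so simply report 0.
 */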
328 static unsigned long xen_store_tr(void)
329 {
330 	return 0;
331 }
332 
333 /*
334  * Set the page permissions for a particular virtual address.  If the
335  * address is a vmalloc mapping (or other non-linear mapping), then
336  * find the linear mapping of the page and also set its protections to
337  * match.
338  */
339 static void set_aliased_prot(void *v, pgprot_t prot)
340 {
341 	int level;
342 	pte_t *ptep;
343 	pte_t pte;
344 	unsigned long pfn;
345 	struct page *page;
346 	unsigned char dummy;
347 
348 	ptep = lookup_address((unsigned long)v, &level);
349 	BUG_ON(ptep == NULL);
350 
351 	pfn = pte_pfn(*ptep);
352 	page = pfn_to_page(pfn);
353 
354 	pte = pfn_pte(pfn, prot);
355 
356 	/*
357 	 * Careful: update_va_mapping() will fail if the virtual address
358 	 * we're poking isn't populated in the page tables.  We don't
359 	 * need to worry about the direct map (that's always in the page
360 	 * tables), but we need to be careful about vmap space.  In
361 	 * particular, the top level page table can lazily propagate
362 	 * entries between processes, so if we've switched mms since we
363 	 * vmapped the target in the first place, we might not have the
364 	 * top-level page table entry populated.
365 	 *
366 	 * We disable preemption because we want the same mm active when
367 	 * we probe the target and when we issue the hypercall.  We'll
368 	 * have the same nominal mm, but if we're a kernel thread, lazy
369 	 * mm dropping could change our pgd.
370 	 *
371 	 * Out of an abundance of caution, this uses __get_user() to fault
372 	 * in the target address just in case there's some obscure case
373 	 * in which the target address isn't readable.
374 	 */
375 
376 	preempt_disable();
377 
378 	probe_kernel_read(&dummy, v, 1);
379 
380 	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
381 		BUG();
382 
383 	if (!PageHighMem(page)) {
384 		void *av = __va(PFN_PHYS(pfn));
385 
386 		if (av != v)
387 			if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
388 				BUG();
389 	} else
390 		kmap_flush_unused();
391 
392 	preempt_enable();
393 }
394 
395 static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
396 {
397 	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
398 	int i;
399 
400 	/*
401 	 * We need to mark all the aliases of the LDT pages RO.  We
402 	 * don't need to call vm_flush_aliases(), though, since that's
403 	 * only responsible for flushing aliases out of the TLBs, not the
404 	 * page tables, and Xen will flush the TLB for us if needed.
405 	 *
406 	 * To avoid confusing future readers: none of this is necessary
407 	 * to load the LDT.  The hypervisor only checks this when the
408 	 * LDT is faulted in due to subsequent descriptor access.
409 	 */
410 
411 	for (i = 0; i < entries; i += entries_per_page)
412 		set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
413 }
414 
415 static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
416 {
417 	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
418 	int i;
419 
420 	for (i = 0; i < entries; i += entries_per_page)
421 		set_aliased_prot(ldt + i, PAGE_KERNEL);
422 }
423 
424 static void xen_set_ldt(const void *addr, unsigned entries)
425 {
426 	struct mmuext_op *op;
427 	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
428 
429 	trace_xen_cpu_set_ldt(addr, entries);
430 
431 	op = mcs.args;
432 	op->cmd = MMUEXT_SET_LDT;
433 	op->arg1.linear_addr = (unsigned long)addr;
434 	op->arg2.nr_ents = entries;
435 
436 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
437 
438 	xen_mc_issue(PARAVIRT_LAZY_CPU);
439 }
440 
441 static void xen_load_gdt(const struct desc_ptr *dtr)
442 {
443 	unsigned long va = dtr->address;
444 	unsigned int size = dtr->size + 1;
445 	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
446 	unsigned long frames[pages];
447 	int f;
448 
449 	/*
450 	 * A GDT can be up to 64k in size, which corresponds to 8192
451 	 * 8-byte entries, or 16 4k pages..
452 	 */
453 
454 	BUG_ON(size > 65536);
455 	BUG_ON(va & ~PAGE_MASK);
456 
457 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
458 		int level;
459 		pte_t *ptep;
460 		unsigned long pfn, mfn;
461 		void *virt;
462 
463 		/*
464 		 * The GDT is per-cpu and is in the percpu data area.
465 		 * That can be virtually mapped, so we need to do a
466 		 * page-walk to get the underlying MFN for the
467 		 * hypercall.  The page can also be in the kernel's
468 		 * linear range, so we need to make that mapping RO too.
469 		 */
470 		ptep = lookup_address(va, &level);
471 		BUG_ON(ptep == NULL);
472 
473 		pfn = pte_pfn(*ptep);
474 		mfn = pfn_to_mfn(pfn);
475 		virt = __va(PFN_PHYS(pfn));
476 
477 		frames[f] = mfn;
478 
479 		make_lowmem_page_readonly((void *)va);
480 		make_lowmem_page_readonly(virt);
481 	}
482 
483 	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
484 		BUG();
485 }
486 
487 /*
488  * load_gdt for early boot, when the gdt is only mapped once
489  */
490 static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
491 {
492 	unsigned long va = dtr->address;
493 	unsigned int size = dtr->size + 1;
494 	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
495 	unsigned long frames[pages];
496 	int f;
497 
498 	/*
499 	 * A GDT can be up to 64k in size, which corresponds to 8192
500 	 * 8-byte entries, or 16 4k pages..
501 	 */
502 
503 	BUG_ON(size > 65536);
504 	BUG_ON(va & ~PAGE_MASK);
505 
506 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
507 		pte_t pte;
508 		unsigned long pfn, mfn;
509 
510 		pfn = virt_to_pfn(va);
511 		mfn = pfn_to_mfn(pfn);
512 
513 		pte = pfn_pte(pfn, PAGE_KERNEL_RO);
514 
515 		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
516 			BUG();
517 
518 		frames[f] = mfn;
519 	}
520 
521 	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
522 		BUG();
523 }
524 
525 static inline bool desc_equal(const struct desc_struct *d1,
526 			      const struct desc_struct *d2)
527 {
528 	return d1->a == d2->a && d1->b == d2->b;
529 }
530 
531 static void load_TLS_descriptor(struct thread_struct *t,
532 				unsigned int cpu, unsigned int i)
533 {
534 	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
535 	struct desc_struct *gdt;
536 	xmaddr_t maddr;
537 	struct multicall_space mc;
538 
539 	if (desc_equal(shadow, &t->tls_array[i]))
540 		return;
541 
542 	*shadow = t->tls_array[i];
543 
544 	gdt = get_cpu_gdt_rw(cpu);
545 	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
546 	mc = __xen_mc_entry(0);
547 
548 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
549 }
550 
551 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
552 {
553 	/*
554 	 * XXX sleazy hack: If we're being called in a lazy-cpu zone
555 	 * and lazy gs handling is enabled, it means we're in a
556 	 * context switch, and %gs has just been saved.  This means we
557 	 * can zero it out to prevent faults on exit from the
558 	 * hypervisor if the next process has no %gs.  Either way, it
559 	 * has been saved, and the new value will get loaded properly.
560 	 * This will go away as soon as Xen has been modified to not
561 	 * save/restore %gs for normal hypercalls.
562 	 *
563 	 * On x86_64, this hack is not used for %gs, because gs points
564 	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
565 	 * must not zero %gs on x86_64
566 	 *
567 	 * For x86_64, we need to zero %fs, otherwise we may get an
568 	 * exception between the new %fs descriptor being loaded and
569 	 * %fs being effectively cleared at __switch_to().
570 	 */
571 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
572 #ifdef CONFIG_X86_32
573 		lazy_load_gs(0);
574 #else
575 		loadsegment(fs, 0);
576 #endif
577 	}
578 
579 	xen_mc_batch();
580 
581 	load_TLS_descriptor(t, cpu, 0);
582 	load_TLS_descriptor(t, cpu, 1);
583 	load_TLS_descriptor(t, cpu, 2);
584 
585 	xen_mc_issue(PARAVIRT_LAZY_CPU);
586 }
587 
588 #ifdef CONFIG_X86_64
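/*
 * Under Xen the user %gs selector is switched via a hypercall rather
 * than the native swapgs/mov sequence.
 */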
589 static void xen_load_gs_index(unsigned int idx)
590 {
591 	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
592 		BUG();
593 }
594 #endif
595 
596 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
597 				const void *ptr)
598 {
599 	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
600 	u64 entry = *(u64 *)ptr;
601 
602 	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
603 
604 	preempt_disable();
605 
606 	xen_mc_flush();
607 	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
608 		BUG();
609 
610 	preempt_enable();
611 }
612 
613 static int cvt_gate_to_trap(int vector, const gate_desc *val,
614 			    struct trap_info *info)
615 {
616 	unsigned long addr;
617 
618 	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
619 		return 0;
620 
621 	info->vector = vector;
622 
623 	addr = gate_offset(*val);
624 #ifdef CONFIG_X86_64
625 	/*
626 	 * Look for known traps using IST, and substitute them
627 	 * appropriately.  The debugger ones are the only ones we care
628 	 * about.  Xen will handle faults like double_fault,
629 	 * so we should never see them.  Warn if
630 	 * there's an unexpected IST-using fault handler.
631 	 */
632 	if (addr == (unsigned long)debug)
633 		addr = (unsigned long)xen_debug;
634 	else if (addr == (unsigned long)int3)
635 		addr = (unsigned long)xen_int3;
636 	else if (addr == (unsigned long)stack_segment)
637 		addr = (unsigned long)xen_stack_segment;
638 	else if (addr == (unsigned long)double_fault) {
639 		/* Don't need to handle these */
640 		return 0;
641 #ifdef CONFIG_X86_MCE
642 	} else if (addr == (unsigned long)machine_check) {
643 		/*
644 		 * When the Xen hypervisor injects a vMCE into the guest,
645 		 * use the native MCE handler to handle it.
646 		 */
647 		;
648 #endif
649 	} else if (addr == (unsigned long)nmi)
650 		/*
651 		 * Use the native version as well.
652 		 */
653 		;
654 	else {
655 		/* Some other trap using IST? */
656 		if (WARN_ON(val->ist != 0))
657 			return 0;
658 	}
659 #endif	/* CONFIG_X86_64 */
660 	info->address = addr;
661 
662 	info->cs = gate_segment(*val);
663 	info->flags = val->dpl;
664 	/* interrupt gates clear IF */
665 	if (val->type == GATE_INTERRUPT)
666 		info->flags |= 1 << 2;
667 
668 	return 1;
669 }
670 
671 /* Locations of each CPU's IDT */
672 static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
673 
674 /* Set an IDT entry.  If the entry is part of the current IDT, then
675    also update Xen. */
676 static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
677 {
678 	unsigned long p = (unsigned long)&dt[entrynum];
679 	unsigned long start, end;
680 
681 	trace_xen_cpu_write_idt_entry(dt, entrynum, g);
682 
683 	preempt_disable();
684 
685 	start = __this_cpu_read(idt_desc.address);
686 	end = start + __this_cpu_read(idt_desc.size) + 1;
687 
688 	xen_mc_flush();
689 
690 	native_write_idt_entry(dt, entrynum, g);
691 
692 	if (p >= start && (p + 8) <= end) {
693 		struct trap_info info[2];
694 
695 		info[1].address = 0;
696 
697 		if (cvt_gate_to_trap(entrynum, g, &info[0]))
698 			if (HYPERVISOR_set_trap_table(info))
699 				BUG();
700 	}
701 
702 	preempt_enable();
703 }
704 
705 static void xen_convert_trap_info(const struct desc_ptr *desc,
706 				  struct trap_info *traps)
707 {
708 	unsigned in, out, count;
709 
710 	count = (desc->size+1) / sizeof(gate_desc);
711 	BUG_ON(count > 256);
712 
713 	for (in = out = 0; in < count; in++) {
714 		gate_desc *entry = (gate_desc *)(desc->address) + in;
715 
716 		if (cvt_gate_to_trap(in, entry, &traps[out]))
717 			out++;
718 	}
719 	traps[out].address = 0;
720 }
721 
722 void xen_copy_trap_info(struct trap_info *traps)
723 {
724 	const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
725 
726 	xen_convert_trap_info(desc, traps);
727 }
728 
729 /* Load a new IDT into Xen.  In principle this can be per-CPU, so we
730    hold a spinlock to protect the static traps[] array (static because
731    it avoids allocation, and saves stack space). */
732 static void xen_load_idt(const struct desc_ptr *desc)
733 {
734 	static DEFINE_SPINLOCK(lock);
735 	static struct trap_info traps[257];
736 
737 	trace_xen_cpu_load_idt(desc);
738 
739 	spin_lock(&lock);
740 
741 	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
742 
743 	xen_convert_trap_info(desc, traps);
744 
745 	xen_mc_flush();
746 	if (HYPERVISOR_set_trap_table(traps))
747 		BUG();
748 
749 	spin_unlock(&lock);
750 }
751 
752 /* Write a GDT descriptor entry.  Ignore LDT descriptors, since
753    they're handled differently. */
754 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
755 				const void *desc, int type)
756 {
757 	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
758 
759 	preempt_disable();
760 
761 	switch (type) {
762 	case DESC_LDT:
763 	case DESC_TSS:
764 		/* ignore */
765 		break;
766 
767 	default: {
768 		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
769 
770 		xen_mc_flush();
771 		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
772 			BUG();
773 	}
774 
775 	}
776 
777 	preempt_enable();
778 }
779 
780 /*
781  * Version of write_gdt_entry for use at early boot time, when an
782  * entry needs to be updated as simply as possible.
783  */
784 static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
785 					    const void *desc, int type)
786 {
787 	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
788 
789 	switch (type) {
790 	case DESC_LDT:
791 	case DESC_TSS:
792 		/* ignore */
793 		break;
794 
795 	default: {
796 		xmaddr_t maddr = virt_to_machine(&dt[entry]);
797 
798 		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
799 			dt[entry] = *(struct desc_struct *)desc;
800 	}
801 
802 	}
803 }
804 
805 static void xen_load_sp0(struct tss_struct *tss,
806 			 struct thread_struct *thread)
807 {
808 	struct multicall_space mcs;
809 
810 	mcs = xen_mc_entry(0);
811 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
812 	xen_mc_issue(PARAVIRT_LAZY_CPU);
813 	tss->x86_tss.sp0 = thread->sp0;
814 }
815 
816 void xen_set_iopl_mask(unsigned mask)
817 {
818 	struct physdev_set_iopl set_iopl;
819 
820 	/* Force the change at ring 0. */
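	/*
	 * The IOPL field occupies EFLAGS bits 12-13, hence the shift and
	 * mask.  A mask of 0 is mapped to IOPL 1, presumably because the
	 * PV guest kernel itself never runs in ring 0.
	 */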
821 	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
822 	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
823 }
824 
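/*
 * The traditional port-0x80 I/O delay serves no purpose under Xen, so
 * this is deliberately a no-op.
 */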
825 static void xen_io_delay(void)
826 {
827 }
828 
829 static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
830 
831 static unsigned long xen_read_cr0(void)
832 {
833 	unsigned long cr0 = this_cpu_read(xen_cr0_value);
834 
835 	if (unlikely(cr0 == 0)) {
836 		cr0 = native_read_cr0();
837 		this_cpu_write(xen_cr0_value, cr0);
838 	}
839 
840 	return cr0;
841 }
842 
843 static void xen_write_cr0(unsigned long cr0)
844 {
845 	struct multicall_space mcs;
846 
847 	this_cpu_write(xen_cr0_value, cr0);
848 
849 	/* Only pay attention to cr0.TS; everything else is
850 	   ignored. */
851 	mcs = xen_mc_entry(0);
852 
853 	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
854 
855 	xen_mc_issue(PARAVIRT_LAZY_CPU);
856 }
857 
858 static void xen_write_cr4(unsigned long cr4)
859 {
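	/*
	 * Mask out CR4 bits a PV guest is not allowed to control; global
	 * pages, superpages and user-mode RDPMC are presumably managed by
	 * the hypervisor and the PMU code instead.
	 */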
860 	cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
861 
862 	native_write_cr4(cr4);
863 }
864 #ifdef CONFIG_X86_64
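/*
 * CR8 mirrors the local APIC task priority.  A PV guest has no direct
 * APIC access, so reads report 0 and a non-zero write is treated as a
 * bug.
 */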
865 static inline unsigned long xen_read_cr8(void)
866 {
867 	return 0;
868 }
869 static inline void xen_write_cr8(unsigned long val)
870 {
871 	BUG_ON(val);
872 }
873 #endif
874 
875 static u64 xen_read_msr_safe(unsigned int msr, int *err)
876 {
877 	u64 val;
878 
879 	if (pmu_msr_read(msr, &val, err))
880 		return val;
881 
882 	val = native_read_msr_safe(msr, err);
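	/*
	 * If CPUID does not advertise x2APIC support, make sure the
	 * reported APIC base does not claim that x2APIC is enabled.
	 */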
883 	switch (msr) {
884 	case MSR_IA32_APICBASE:
885 #ifdef CONFIG_X86_X2APIC
886 		if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
887 #endif
888 			val &= ~X2APIC_ENABLE;
889 		break;
890 	}
891 	return val;
892 }
893 
894 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
895 {
896 	int ret;
897 
898 	ret = 0;
899 
900 	switch (msr) {
901 #ifdef CONFIG_X86_64
902 		unsigned which;
903 		u64 base;
904 
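	/*
	 * Segment base updates are routed through the set_segment_base
	 * hypercall rather than a WRMSR.
	 */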
905 	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
906 	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
907 	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;
908 
909 	set:
910 		base = ((u64)high << 32) | low;
911 		if (HYPERVISOR_set_segment_base(which, base) != 0)
912 			ret = -EIO;
913 		break;
914 #endif
915 
916 	case MSR_STAR:
917 	case MSR_CSTAR:
918 	case MSR_LSTAR:
919 	case MSR_SYSCALL_MASK:
920 	case MSR_IA32_SYSENTER_CS:
921 	case MSR_IA32_SYSENTER_ESP:
922 	case MSR_IA32_SYSENTER_EIP:
923 		/* Fast syscall setup is all done in hypercalls, so
924 		   these are all ignored.  Stub them out here to stop
925 		   Xen console noise. */
926 		break;
927 
928 	default:
929 		if (!pmu_msr_write(msr, low, high, &ret))
930 			ret = native_write_msr_safe(msr, low, high);
931 	}
932 
933 	return ret;
934 }
935 
936 static u64 xen_read_msr(unsigned int msr)
937 {
938 	/*
939 	 * This will silently swallow a #GP from RDMSR.  It may be worth
940 	 * changing that.
941 	 */
942 	int err;
943 
944 	return xen_read_msr_safe(msr, &err);
945 }
946 
947 static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
948 {
949 	/*
950 	 * This will silently swallow a #GP from WRMSR.  It may be worth
951 	 * changing that.
952 	 */
953 	xen_write_msr_safe(msr, low, high);
954 }
955 
956 void xen_setup_shared_info(void)
957 {
958 	set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
959 
960 	HYPERVISOR_shared_info =
961 		(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
962 
963 #ifndef CONFIG_SMP
964 	/* In UP this is as good a place as any to set up shared info */
965 	xen_setup_vcpu_info_placement();
966 #endif
967 
968 	xen_setup_mfn_list_list();
969 
970 	/*
971 	 * Now that shared info is set up, we can start using routines that
972 	 * point to the pvclock area.
973 	 */
974 	if (system_state == SYSTEM_BOOTING)
975 		xen_init_time_ops();
976 }
977 
978 /* This is called once we have the cpu_possible_mask */
979 void xen_setup_vcpu_info_placement(void)
980 {
981 	int cpu;
982 
983 	for_each_possible_cpu(cpu) {
984 		/* Set up direct vCPU id mapping for PV guests. */
985 		per_cpu(xen_vcpu_id, cpu) = cpu;
986 		xen_vcpu_setup(cpu);
987 	}
988 
989 	/*
990 	 * xen_vcpu_setup managed to place the vcpu_info within the
991 	 * percpu area for all cpus, so make use of it.
992 	 */
993 	if (xen_have_vcpu_info_placement) {
994 		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
995 		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
996 		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
997 		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
998 		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
999 	}
1000 }
1001 
1002 static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1003 			  unsigned long addr, unsigned len)
1004 {
1005 	char *start, *end, *reloc;
1006 	unsigned ret;
1007 
1008 	start = end = reloc = NULL;
1009 
1010 #define SITE(op, x)							\
1011 	case PARAVIRT_PATCH(op.x):					\
1012 	if (xen_have_vcpu_info_placement) {				\
1013 		start = (char *)xen_##x##_direct;			\
1014 		end = xen_##x##_direct_end;				\
1015 		reloc = xen_##x##_direct_reloc;				\
1016 	}								\
1017 	goto patch_site
1018 
1019 	switch (type) {
1020 		SITE(pv_irq_ops, irq_enable);
1021 		SITE(pv_irq_ops, irq_disable);
1022 		SITE(pv_irq_ops, save_fl);
1023 		SITE(pv_irq_ops, restore_fl);
1024 #undef SITE
1025 
1026 	patch_site:
1027 		if (start == NULL || (end-start) > len)
1028 			goto default_patch;
1029 
1030 		ret = paravirt_patch_insns(insnbuf, len, start, end);
1031 
1032 		/* Note: because reloc is assigned from something that
1033 		   appears to be an array, gcc assumes it's non-null,
1034 		   but doesn't know its relationship with start and
1035 		   end. */
1036 		if (reloc > start && reloc < end) {
1037 			int reloc_off = reloc - start;
1038 			long *relocp = (long *)(insnbuf + reloc_off);
1039 			long delta = start - (char *)addr;
1040 
1041 			*relocp += delta;
1042 		}
1043 		break;
1044 
1045 	default_patch:
1046 	default:
1047 		ret = paravirt_patch_default(type, clobbers, insnbuf,
1048 					     addr, len);
1049 		break;
1050 	}
1051 
1052 	return ret;
1053 }
1054 
1055 static const struct pv_info xen_info __initconst = {
1056 	.shared_kernel_pmd = 0,
1057 
1058 #ifdef CONFIG_X86_64
1059 	.extra_user_64bit_cs = FLAT_USER_CS64,
1060 #endif
1061 	.name = "Xen",
1062 };
1063 
1064 static const struct pv_init_ops xen_init_ops __initconst = {
1065 	.patch = xen_patch,
1066 };
1067 
1068 static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1069 	.cpuid = xen_cpuid,
1070 
1071 	.set_debugreg = xen_set_debugreg,
1072 	.get_debugreg = xen_get_debugreg,
1073 
1074 	.read_cr0 = xen_read_cr0,
1075 	.write_cr0 = xen_write_cr0,
1076 
1077 	.read_cr4 = native_read_cr4,
1078 	.write_cr4 = xen_write_cr4,
1079 
1080 #ifdef CONFIG_X86_64
1081 	.read_cr8 = xen_read_cr8,
1082 	.write_cr8 = xen_write_cr8,
1083 #endif
1084 
1085 	.wbinvd = native_wbinvd,
1086 
1087 	.read_msr = xen_read_msr,
1088 	.write_msr = xen_write_msr,
1089 
1090 	.read_msr_safe = xen_read_msr_safe,
1091 	.write_msr_safe = xen_write_msr_safe,
1092 
1093 	.read_pmc = xen_read_pmc,
1094 
1095 	.iret = xen_iret,
1096 #ifdef CONFIG_X86_64
1097 	.usergs_sysret64 = xen_sysret64,
1098 #endif
1099 
1100 	.load_tr_desc = paravirt_nop,
1101 	.set_ldt = xen_set_ldt,
1102 	.load_gdt = xen_load_gdt,
1103 	.load_idt = xen_load_idt,
1104 	.load_tls = xen_load_tls,
1105 #ifdef CONFIG_X86_64
1106 	.load_gs_index = xen_load_gs_index,
1107 #endif
1108 
1109 	.alloc_ldt = xen_alloc_ldt,
1110 	.free_ldt = xen_free_ldt,
1111 
1112 	.store_idt = native_store_idt,
1113 	.store_tr = xen_store_tr,
1114 
1115 	.write_ldt_entry = xen_write_ldt_entry,
1116 	.write_gdt_entry = xen_write_gdt_entry,
1117 	.write_idt_entry = xen_write_idt_entry,
1118 	.load_sp0 = xen_load_sp0,
1119 
1120 	.set_iopl_mask = xen_set_iopl_mask,
1121 	.io_delay = xen_io_delay,
1122 
1123 	/* Xen takes care of %gs when switching to usermode for us */
1124 	.swapgs = paravirt_nop,
1125 
1126 	.start_context_switch = paravirt_start_context_switch,
1127 	.end_context_switch = xen_end_context_switch,
1128 };
1129 
1130 static void xen_restart(char *msg)
1131 {
1132 	xen_reboot(SHUTDOWN_reboot);
1133 }
1134 
1135 static void xen_machine_halt(void)
1136 {
1137 	xen_reboot(SHUTDOWN_poweroff);
1138 }
1139 
1140 static void xen_machine_power_off(void)
1141 {
1142 	if (pm_power_off)
1143 		pm_power_off();
1144 	xen_reboot(SHUTDOWN_poweroff);
1145 }
1146 
1147 static void xen_crash_shutdown(struct pt_regs *regs)
1148 {
1149 	xen_reboot(SHUTDOWN_crash);
1150 }
1151 
1152 static const struct machine_ops xen_machine_ops __initconst = {
1153 	.restart = xen_restart,
1154 	.halt = xen_machine_halt,
1155 	.power_off = xen_machine_power_off,
1156 	.shutdown = xen_machine_halt,
1157 	.crash_shutdown = xen_crash_shutdown,
1158 	.emergency_restart = xen_emergency_restart,
1159 };
1160 
1161 static unsigned char xen_get_nmi_reason(void)
1162 {
1163 	unsigned char reason = 0;
1164 
1165 	/* Construct a value which looks like it came from port 0x61. */
1166 	if (test_bit(_XEN_NMIREASON_io_error,
1167 		     &HYPERVISOR_shared_info->arch.nmi_reason))
1168 		reason |= NMI_REASON_IOCHK;
1169 	if (test_bit(_XEN_NMIREASON_pci_serr,
1170 		     &HYPERVISOR_shared_info->arch.nmi_reason))
1171 		reason |= NMI_REASON_SERR;
1172 
1173 	return reason;
1174 }
1175 
1176 static void __init xen_boot_params_init_edd(void)
1177 {
1178 #if IS_ENABLED(CONFIG_EDD)
1179 	struct xen_platform_op op;
1180 	struct edd_info *edd_info;
1181 	u32 *mbr_signature;
1182 	unsigned nr;
1183 	int ret;
1184 
1185 	edd_info = boot_params.eddbuf;
1186 	mbr_signature = boot_params.edd_mbr_sig_buffer;
1187 
1188 	op.cmd = XENPF_firmware_info;
1189 
1190 	op.u.firmware_info.type = XEN_FW_DISK_INFO;
1191 	for (nr = 0; nr < EDDMAXNR; nr++) {
1192 		struct edd_info *info = edd_info + nr;
1193 
1194 		op.u.firmware_info.index = nr;
1195 		info->params.length = sizeof(info->params);
1196 		set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
1197 				     &info->params);
1198 		ret = HYPERVISOR_platform_op(&op);
1199 		if (ret)
1200 			break;
1201 
1202 #define C(x) info->x = op.u.firmware_info.u.disk_info.x
1203 		C(device);
1204 		C(version);
1205 		C(interface_support);
1206 		C(legacy_max_cylinder);
1207 		C(legacy_max_head);
1208 		C(legacy_sectors_per_track);
1209 #undef C
1210 	}
1211 	boot_params.eddbuf_entries = nr;
1212 
1213 	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
1214 	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
1215 		op.u.firmware_info.index = nr;
1216 		ret = HYPERVISOR_platform_op(&op);
1217 		if (ret)
1218 			break;
1219 		mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
1220 	}
1221 	boot_params.edd_mbr_sig_buf_entries = nr;
1222 #endif
1223 }
1224 
1225 /*
1226  * Set up the GDT and segment registers for -fstack-protector.  Until
1227  * we do this, we have to be careful not to call any stack-protected
1228  * function, which is most of the kernel.
1229  */
1230 static void xen_setup_gdt(int cpu)
1231 {
1232 	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
1233 	pv_cpu_ops.load_gdt = xen_load_gdt_boot;
1234 
1235 	setup_stack_canary_segment(0);
1236 	switch_to_new_gdt(0);
1237 
1238 	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
1239 	pv_cpu_ops.load_gdt = xen_load_gdt;
1240 }
1241 
1242 static void __init xen_dom0_set_legacy_features(void)
1243 {
1244 	x86_platform.legacy.rtc = 1;
1245 }
1246 
1247 /* First C function to be called on Xen boot */
1248 asmlinkage __visible void __init xen_start_kernel(void)
1249 {
1250 	struct physdev_set_iopl set_iopl;
1251 	unsigned long initrd_start = 0;
1252 	int rc;
1253 
1254 	if (!xen_start_info)
1255 		return;
1256 
1257 	xen_domain_type = XEN_PV_DOMAIN;
1258 
1259 	xen_setup_features();
1260 
1261 	xen_setup_machphys_mapping();
1262 
1263 	/* Install Xen paravirt ops */
1264 	pv_info = xen_info;
1265 	pv_init_ops = xen_init_ops;
1266 	pv_cpu_ops = xen_cpu_ops;
1267 
1268 	x86_platform.get_nmi_reason = xen_get_nmi_reason;
1269 
1270 	x86_init.resources.memory_setup = xen_memory_setup;
1271 	x86_init.oem.arch_setup = xen_arch_setup;
1272 	x86_init.oem.banner = xen_banner;
1273 
1274 	/*
1275 	 * Set up some pagetable state before starting to set any ptes.
1276 	 */
1277 
1278 	xen_init_mmu_ops();
1279 
1280 	/* Prevent unwanted bits from being set in PTEs. */
1281 	__supported_pte_mask &= ~_PAGE_GLOBAL;
1282 
1283 	/*
1284 	 * Prevent page tables from being allocated in highmem, even
1285 	 * if CONFIG_HIGHPTE is enabled.
1286 	 */
1287 	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
1288 
1289 	/* Work out if we support NX */
1290 	x86_configure_nx();
1291 
1292 	/* Get mfn list */
1293 	xen_build_dynamic_phys_to_machine();
1294 
1295 	/*
1296 	 * Set up kernel GDT and segment registers, mainly so that
1297 	 * -fstack-protector code can be executed.
1298 	 */
1299 	xen_setup_gdt(0);
1300 
1301 	xen_init_irq_ops();
1302 	xen_init_capabilities();
1303 
1304 #ifdef CONFIG_X86_LOCAL_APIC
1305 	/*
1306 	 * set up the basic apic ops.
1307 	 */
1308 	xen_init_apic();
1309 #endif
1310 
1311 	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1312 		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1313 		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
1314 	}
1315 
1316 	machine_ops = xen_machine_ops;
1317 
1318 	/*
1319 	 * The only reliable way to retain the initial address of the
1320 	 * percpu gdt_page is to remember it here, so we can go and
1321 	 * mark it RW later, when the initial percpu area is freed.
1322 	 */
1323 	xen_initial_gdt = &per_cpu(gdt_page, 0);
1324 
1325 	xen_smp_init();
1326 
1327 #ifdef CONFIG_ACPI_NUMA
1328 	/*
1329 	 * The pages we get from Xen are not related to machine pages, so
1330 	 * any NUMA information the kernel tries to get from ACPI will
1331 	 * be meaningless.  Prevent it from trying.
1332 	 */
1333 	acpi_numa = -1;
1334 #endif
1335 	/* Don't do the full vcpu_info placement stuff until we have a
1336 	   possible map and a non-dummy shared_info. */
1337 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1338 
1339 	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
1340 
1341 	local_irq_disable();
1342 	early_boot_irqs_disabled = true;
1343 
1344 	xen_raw_console_write("mapping kernel into physical memory\n");
1345 	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
1346 				   xen_start_info->nr_pages);
1347 	xen_reserve_special_pages();
1348 
1349 	/* keep using Xen gdt for now; no urgent need to change it */
1350 
1351 #ifdef CONFIG_X86_32
1352 	pv_info.kernel_rpl = 1;
1353 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
1354 		pv_info.kernel_rpl = 0;
1355 #else
1356 	pv_info.kernel_rpl = 0;
1357 #endif
1358 	/* set the limit of our address space */
1359 	xen_reserve_top();
1360 
1361 	/*
1362 	 * We used to do this in xen_arch_setup, but that is too late
1363 	 * on AMD, where early_cpu_init (run before ->arch_setup()) calls
1364 	 * early_amd_init, which pokes the 0xcf8 port.
1365 	 */
1366 	set_iopl.iopl = 1;
1367 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1368 	if (rc != 0)
1369 		xen_raw_printk("physdev_op failed %d\n", rc);
1370 
1371 #ifdef CONFIG_X86_32
1372 	/* set up basic CPUID stuff */
1373 	cpu_detect(&new_cpu_data);
1374 	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
1375 	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
1376 #endif
1377 
1378 	if (xen_start_info->mod_start) {
1379 	    if (xen_start_info->flags & SIF_MOD_START_PFN)
1380 		initrd_start = PFN_PHYS(xen_start_info->mod_start);
1381 	    else
1382 		initrd_start = __pa(xen_start_info->mod_start);
1383 	}
1384 
1385 	/* Poke various useful things into boot_params */
1386 	boot_params.hdr.type_of_loader = (9 << 4) | 0;
1387 	boot_params.hdr.ramdisk_image = initrd_start;
1388 	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1389 	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1390 	boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
1391 
1392 	if (!xen_initial_domain()) {
1393 		add_preferred_console("xenboot", 0, NULL);
1394 		add_preferred_console("tty", 0, NULL);
1395 		add_preferred_console("hvc", 0, NULL);
1396 		if (pci_xen)
1397 			x86_init.pci.arch_init = pci_xen_init;
1398 	} else {
1399 		const struct dom0_vga_console_info *info =
1400 			(void *)((char *)xen_start_info +
1401 				 xen_start_info->console.dom0.info_off);
1402 		struct xen_platform_op op = {
1403 			.cmd = XENPF_firmware_info,
1404 			.interface_version = XENPF_INTERFACE_VERSION,
1405 			.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
1406 		};
1407 
1408 		x86_platform.set_legacy_features =
1409 				xen_dom0_set_legacy_features;
1410 		xen_init_vga(info, xen_start_info->console.dom0.info_size);
1411 		xen_start_info->console.domU.mfn = 0;
1412 		xen_start_info->console.domU.evtchn = 0;
1413 
1414 		if (HYPERVISOR_platform_op(&op) == 0)
1415 			boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
1416 
1417 		/* Make sure ACS will be enabled */
1418 		pci_request_acs();
1419 
1420 		xen_acpi_sleep_register();
1421 
1422 		/* Avoid searching for BIOS MP tables */
1423 		x86_init.mpparse.find_smp_config = x86_init_noop;
1424 		x86_init.mpparse.get_smp_config = x86_init_uint_noop;
1425 
1426 		xen_boot_params_init_edd();
1427 	}
1428 #ifdef CONFIG_PCI
1429 	/* PCI BIOS service won't work from a PV guest. */
1430 	pci_probe &= ~PCI_PROBE_BIOS;
1431 #endif
1432 	xen_raw_console_write("about to get started...\n");
1433 
1434 	/* Let's presume PV guests always boot on vCPU with id 0. */
1435 	per_cpu(xen_vcpu_id, 0) = 0;
1436 
1437 	xen_setup_runstate_info(0);
1438 
1439 	xen_efi_init();
1440 
1441 	/* Start the world */
1442 #ifdef CONFIG_X86_32
1443 	i386_start_kernel();
1444 #else
1445 	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
1446 	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1447 #endif
1448 }
1449 
1450 static int xen_cpu_up_prepare_pv(unsigned int cpu)
1451 {
1452 	int rc;
1453 
1454 	xen_setup_timer(cpu);
1455 
1456 	rc = xen_smp_intr_init(cpu);
1457 	if (rc) {
1458 		WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
1459 		     cpu, rc);
1460 		return rc;
1461 	}
1462 
1463 	rc = xen_smp_intr_init_pv(cpu);
1464 	if (rc) {
1465 		WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
1466 		     cpu, rc);
1467 		return rc;
1468 	}
1469 
1470 	return 0;
1471 }
1472 
1473 static int xen_cpu_dead_pv(unsigned int cpu)
1474 {
1475 	xen_smp_intr_free(cpu);
1476 	xen_smp_intr_free_pv(cpu);
1477 
1478 	xen_teardown_timer(cpu);
1479 
1480 	return 0;
1481 }
1482 
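/*
 * Hypervisor detection hook: report the Xen CPUID leaf base only when
 * we are already running as a PV guest, so this entry never matches
 * for HVM guests.
 */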
1483 static uint32_t __init xen_platform_pv(void)
1484 {
1485 	if (xen_pv_domain())
1486 		return xen_cpuid_base();
1487 
1488 	return 0;
1489 }
1490 
1491 const struct hypervisor_x86 x86_hyper_xen_pv = {
1492 	.name                   = "Xen PV",
1493 	.detect                 = xen_platform_pv,
1494 	.pin_vcpu               = xen_pin_vcpu,
1495 };
1496 EXPORT_SYMBOL(x86_hyper_xen_pv);
1497