/*
 * Core of Xen paravirt_ops implementation.
 *
 * This file contains the xen_paravirt_ops structure itself, and the
 * implementations for:
 * - privileged instructions
 * - interrupt flags
 * - segment operations
 * - booting and setup
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/edd.h>
#include <linux/frame.h>

#include <linux/kexec.h>

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/nmi.h>
#include <xen/interface/xen-mca.h>
#include <xen/interface/hvm/start_info.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
#include <xen/hvc-console.h>
#include <xen/acpi.h>

#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/cpuid.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>

#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
#include <acpi/pdc_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif

#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
#include "pmu.h"

EXPORT_SYMBOL_GPL(hypercall_page);

/*
 * Pointer to the xen_vcpu_info structure or
 * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
 * and xen_vcpu_setup for details. By default it points to shared_info->vcpu_info
 * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
 * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
 * acknowledge pending events.
 * Also more subtly it is used by the patched version of irq enable/disable
 * e.g. xen_irq_enable_direct and xen_iret in PV mode.
 *
 * The desire to be able to do those mask/unmask operations as a single
 * instruction by using the per-cpu offset held in %gs is the real reason
 * vcpu info is in a per-cpu pointer and the original reason for this
 * hypercall.
 *
 */
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);

/*
 * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
 * hypercall. This can be used both in PV and PVHVM mode. The structure
 * overrides the default per_cpu(xen_vcpu, cpu) value.
 */
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

/* Linux <-> Xen vCPU id mapping */
DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
EXPORT_PER_CPU_SYMBOL(xen_vcpu_id);

enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);

unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned long machine_to_phys_nr;
EXPORT_SYMBOL(machine_to_phys_nr);

struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

struct shared_info xen_dummy_shared_info;

void *xen_initial_gdt;

RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);

static int xen_cpu_up_prepare(unsigned int cpu);
static int xen_cpu_up_online(unsigned int cpu);
static int xen_cpu_dead(unsigned int cpu);

/*
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs. We assume it is to start with, and then set it to zero on
 * the first failure. This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (ie buffer can't cross a page boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;

struct tls_descs {
	struct desc_struct desc[3];
};

/*
 * Updating the 3 TLS descriptors in the GDT on every task switch is
 * surprisingly expensive so we avoid updating them if they haven't
 * changed. Since Xen writes different descriptors than the one
 * passed in the update_descriptor hypercall we keep shadow copies to
 * compare against.
 */
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);

#ifdef CONFIG_XEN_PVH
/*
 * PVH variables.
 *
 * xen_pvh and pvh_bootparams need to live in data segment since they
 * are used after startup_{32|64}, which clear .bss, are invoked.
 */
bool xen_pvh __attribute__((section(".data"))) = 0;
struct boot_params pvh_bootparams __attribute__((section(".data")));

struct hvm_start_info pvh_start_info;
unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
#endif

static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
	if (setup_max_cpus > MAX_VIRT_CPUS)
		setup_max_cpus = MAX_VIRT_CPUS;
#endif
}

void xen_vcpu_setup(int cpu)
{
	struct vcpu_register_vcpu_info info;
	int err;
	struct vcpu_info *vcpup;

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	/*
	 * This path is called twice on PVHVM - first during bootup via
	 * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
	 * hotplugged: cpu_up -> xen_hvm_cpu_notify.
	 * As we can only do the VCPUOP_register_vcpu_info once, let's
	 * not overwrite its result.
	 *
	 * For PV it is called during restore (xen_vcpu_restore) and bootup
	 * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
	 * use this function.
	 */
	if (xen_hvm_domain()) {
		if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
			return;
	}
	if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
		per_cpu(xen_vcpu, cpu) =
			&HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];

	if (!have_vcpu_info_placement) {
		if (cpu >= MAX_VIRT_CPUS)
			clamp_max_cpus();
		return;
	}

	vcpup = &per_cpu(xen_vcpu_info, cpu);
	info.mfn = arbitrary_virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	/* Check to see if the hypervisor will put the vcpu_info
	   structure where we want it, which allows direct access via
	   a percpu-variable.
	   N.B. This hypercall can _only_ be called once per CPU. Subsequent
	   calls will error out with -EINVAL. This is due to the fact that
	   the hypervisor has no unregister variant and this hypercall does
	   not allow info.mfn and info.offset to be overwritten.
	 */
	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu),
				 &info);

	if (err) {
		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
		have_vcpu_info_placement = 0;
		clamp_max_cpus();
	} else {
		/* This cpu is using the registered vcpu info, even if
		   later ones fail to. */
		per_cpu(xen_vcpu, cpu) = vcpup;
	}
}

/*
 * On restore, set the vcpu placement up again.
 * If it fails, then we're in a bad state, since
 * we can't back out from using it...
 */
void xen_vcpu_restore(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		bool other_cpu = (cpu != smp_processor_id());
		bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
						NULL);

		if (other_cpu && is_up &&
		    HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
			BUG();

		xen_setup_runstate_info(cpu);

		if (have_vcpu_info_placement)
			xen_vcpu_setup(cpu);

		if (other_cpu && is_up &&
		    HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
			BUG();
	}
}

static void __init xen_banner(void)
{
	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
	struct xen_extraversion extra;
	HYPERVISOR_xen_version(XENVER_extraversion, &extra);

	pr_info("Booting paravirtualized kernel %son %s\n",
		xen_feature(XENFEAT_auto_translated_physmap) ?
		"with PVH extensions " : "", pv_info.name);
	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
	       version >> 16, version & 0xffff, extra.extraversion,
	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
/* Check if running on Xen version (major, minor) or later */
bool
xen_running_on_version_or_later(unsigned int major, unsigned int minor)
{
	unsigned int version;

	if (!xen_domain())
		return false;

	version = HYPERVISOR_xen_version(XENVER_version, NULL);
	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
	    ((version >> 16) > major))
		return true;
	return false;
}
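
/*
 * For example: XENVER_version packs the release as (major << 16) | minor,
 * which is what xen_banner() and xen_running_on_version_or_later() unpack
 * above. On Xen 4.2 the hypercall returns 0x00040002, so:
 *
 *	version >> 16				-> 4
 *	version & 0xffff			-> 2
 *	xen_running_on_version_or_later(4, 2)	-> true on 4.2, 4.3, 5.x, ...
 */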
" (preserve-AD)" : ""); 302 } 303 /* Check if running on Xen version (major, minor) or later */ 304 bool 305 xen_running_on_version_or_later(unsigned int major, unsigned int minor) 306 { 307 unsigned int version; 308 309 if (!xen_domain()) 310 return false; 311 312 version = HYPERVISOR_xen_version(XENVER_version, NULL); 313 if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || 314 ((version >> 16) > major)) 315 return true; 316 return false; 317 } 318 319 #define CPUID_THERM_POWER_LEAF 6 320 #define APERFMPERF_PRESENT 0 321 322 static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; 323 static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; 324 325 static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; 326 static __read_mostly unsigned int cpuid_leaf5_ecx_val; 327 static __read_mostly unsigned int cpuid_leaf5_edx_val; 328 329 static void xen_cpuid(unsigned int *ax, unsigned int *bx, 330 unsigned int *cx, unsigned int *dx) 331 { 332 unsigned maskebx = ~0; 333 unsigned maskecx = ~0; 334 unsigned maskedx = ~0; 335 unsigned setecx = 0; 336 /* 337 * Mask out inconvenient features, to try and disable as many 338 * unsupported kernel subsystems as possible. 339 */ 340 switch (*ax) { 341 case 1: 342 maskecx = cpuid_leaf1_ecx_mask; 343 setecx = cpuid_leaf1_ecx_set_mask; 344 maskedx = cpuid_leaf1_edx_mask; 345 break; 346 347 case CPUID_MWAIT_LEAF: 348 /* Synthesize the values.. */ 349 *ax = 0; 350 *bx = 0; 351 *cx = cpuid_leaf5_ecx_val; 352 *dx = cpuid_leaf5_edx_val; 353 return; 354 355 case CPUID_THERM_POWER_LEAF: 356 /* Disabling APERFMPERF for kernel usage */ 357 maskecx = ~(1 << APERFMPERF_PRESENT); 358 break; 359 360 case 0xb: 361 /* Suppress extended topology stuff */ 362 maskebx = 0; 363 break; 364 } 365 366 asm(XEN_EMULATE_PREFIX "cpuid" 367 : "=a" (*ax), 368 "=b" (*bx), 369 "=c" (*cx), 370 "=d" (*dx) 371 : "0" (*ax), "2" (*cx)); 372 373 *bx &= maskebx; 374 *cx &= maskecx; 375 *cx |= setecx; 376 *dx &= maskedx; 377 } 378 STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ 379 380 static bool __init xen_check_mwait(void) 381 { 382 #ifdef CONFIG_ACPI 383 struct xen_platform_op op = { 384 .cmd = XENPF_set_processor_pminfo, 385 .u.set_pminfo.id = -1, 386 .u.set_pminfo.type = XEN_PM_PDC, 387 }; 388 uint32_t buf[3]; 389 unsigned int ax, bx, cx, dx; 390 unsigned int mwait_mask; 391 392 /* We need to determine whether it is OK to expose the MWAIT 393 * capability to the kernel to harvest deeper than C3 states from ACPI 394 * _CST using the processor_harvest_xen.c module. For this to work, we 395 * need to gather the MWAIT_LEAF values (which the cstate.c code 396 * checks against). The hypervisor won't expose the MWAIT flag because 397 * it would break backwards compatibility; so we will find out directly 398 * from the hardware and hypercall. 399 */ 400 if (!xen_initial_domain()) 401 return false; 402 403 /* 404 * When running under platform earlier than Xen4.2, do not expose 405 * mwait, to avoid the risk of loading native acpi pad driver 406 */ 407 if (!xen_running_on_version_or_later(4, 2)) 408 return false; 409 410 ax = 1; 411 cx = 0; 412 413 native_cpuid(&ax, &bx, &cx, &dx); 414 415 mwait_mask = (1 << (X86_FEATURE_EST % 32)) | 416 (1 << (X86_FEATURE_MWAIT % 32)); 417 418 if ((cx & mwait_mask) != mwait_mask) 419 return false; 420 421 /* We need to emulate the MWAIT_LEAF and for that we need both 422 * ecx and edx. The hypercall provides only partial information. 
	 */

	ax = CPUID_MWAIT_LEAF;
	bx = 0;
	cx = 0;
	dx = 0;

	native_cpuid(&ax, &bx, &cx, &dx);

	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
	 * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
	 */
	buf[0] = ACPI_PDC_REVISION_ID;
	buf[1] = 1;
	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);

	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);

	if ((HYPERVISOR_platform_op(&op) == 0) &&
	    (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
		cpuid_leaf5_ecx_val = cx;
		cpuid_leaf5_edx_val = dx;
	}
	return true;
#else
	return false;
#endif
}
static void __init xen_init_cpuid_mask(void)
{
	unsigned int ax, bx, cx, dx;
	unsigned int xsave_mask;

	cpuid_leaf1_edx_mask =
		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */

	if (!xen_initial_domain())
		cpuid_leaf1_edx_mask &=
			~((1 << X86_FEATURE_ACPI));  /* disable ACPI */

	cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));

	ax = 1;
	cx = 0;
	cpuid(1, &ax, &bx, &cx, &dx);

	xsave_mask =
		(1 << (X86_FEATURE_XSAVE % 32)) |
		(1 << (X86_FEATURE_OSXSAVE % 32));

	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
	if ((cx & xsave_mask) != xsave_mask)
		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
	if (xen_check_mwait())
		cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
}

static void xen_set_debugreg(int reg, unsigned long val)
{
	HYPERVISOR_set_debugreg(reg, val);
}

static unsigned long xen_get_debugreg(int reg)
{
	return HYPERVISOR_get_debugreg(reg);
}

static void xen_end_context_switch(struct task_struct *next)
{
	xen_mc_flush();
	paravirt_end_context_switch(next);
}

static unsigned long xen_store_tr(void)
{
	return 0;
}

/*
 * Set the page permissions for a particular virtual address. If the
 * address is a vmalloc mapping (or other non-linear mapping), then
 * find the linear mapping of the page and also set its protections to
 * match.
 */
static void set_aliased_prot(void *v, pgprot_t prot)
{
	int level;
	pte_t *ptep;
	pte_t pte;
	unsigned long pfn;
	struct page *page;
	unsigned char dummy;

	ptep = lookup_address((unsigned long)v, &level);
	BUG_ON(ptep == NULL);

	pfn = pte_pfn(*ptep);
	page = pfn_to_page(pfn);

	pte = pfn_pte(pfn, prot);

	/*
	 * Careful: update_va_mapping() will fail if the virtual address
	 * we're poking isn't populated in the page tables. We don't
	 * need to worry about the direct map (that's always in the page
	 * tables), but we need to be careful about vmap space. In
	 * particular, the top level page table can lazily propagate
	 * entries between processes, so if we've switched mms since we
	 * vmapped the target in the first place, we might not have the
	 * top-level page table entry populated.
	 *
	 * We disable preemption because we want the same mm active when
	 * we probe the target and when we issue the hypercall. We'll
	 * have the same nominal mm, but if we're a kernel thread, lazy
	 * mm dropping could change our pgd.
	 *
	 * Out of an abundance of caution, this uses probe_kernel_read() to
	 * fault in the target address just in case there's some obscure case
	 * in which the target address isn't readable.
	 */

	preempt_disable();

	probe_kernel_read(&dummy, v, 1);

	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
		BUG();

	if (!PageHighMem(page)) {
		void *av = __va(PFN_PHYS(pfn));

		if (av != v)
			if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
				BUG();
	} else
		kmap_flush_unused();

	preempt_enable();
}

static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	/*
	 * We need to mark all the aliases of the LDT pages RO. We
	 * don't need to call vm_flush_aliases(), though, since that's
	 * only responsible for flushing aliases out of the TLBs, not the
	 * page tables, and Xen will flush the TLB for us if needed.
	 *
	 * To avoid confusing future readers: none of this is necessary
	 * to load the LDT. The hypervisor only checks this when the
	 * LDT is faulted in due to subsequent descriptor access.
	 */

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}

static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL);
}

static void xen_set_ldt(const void *addr, unsigned entries)
{
	struct mmuext_op *op;
	struct multicall_space mcs = xen_mc_entry(sizeof(*op));

	trace_xen_cpu_set_ldt(addr, entries);

	op = mcs.args;
	op->cmd = MMUEXT_SET_LDT;
	op->arg1.linear_addr = (unsigned long)addr;
	op->arg2.nr_ents = entries;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_load_gdt(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages..
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		int level;
		pte_t *ptep;
		unsigned long pfn, mfn;
		void *virt;

		/*
		 * The GDT is per-cpu and is in the percpu data area.
		 * That can be virtually mapped, so we need to do a
		 * page-walk to get the underlying MFN for the
		 * hypercall. The page can also be in the kernel's
		 * linear range, so we need to RO that mapping too.
		 */
		ptep = lookup_address(va, &level);
		BUG_ON(ptep == NULL);

		pfn = pte_pfn(*ptep);
		mfn = pfn_to_mfn(pfn);
		virt = __va(PFN_PHYS(pfn));

		frames[f] = mfn;

		make_lowmem_page_readonly((void *)va);
		make_lowmem_page_readonly(virt);
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

/*
 * load_gdt for early boot, when the gdt is only mapped once
 */
static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages..
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		pte_t pte;
		unsigned long pfn, mfn;

		pfn = virt_to_pfn(va);
		mfn = pfn_to_mfn(pfn);

		pte = pfn_pte(pfn, PAGE_KERNEL_RO);

		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
			BUG();

		frames[f] = mfn;
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

static inline bool desc_equal(const struct desc_struct *d1,
			      const struct desc_struct *d2)
{
	return d1->a == d2->a && d1->b == d2->b;
}

static void load_TLS_descriptor(struct thread_struct *t,
				unsigned int cpu, unsigned int i)
{
	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
	struct desc_struct *gdt;
	xmaddr_t maddr;
	struct multicall_space mc;

	if (desc_equal(shadow, &t->tls_array[i]))
		return;

	*shadow = t->tls_array[i];

	gdt = get_cpu_gdt_table(cpu);
	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}

static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
	/*
	 * XXX sleazy hack: If we're being called in a lazy-cpu zone
	 * and lazy gs handling is enabled, it means we're in a
	 * context switch, and %gs has just been saved. This means we
	 * can zero it out to prevent faults on exit from the
	 * hypervisor if the next process has no %gs. Either way, it
	 * has been saved, and the new value will get loaded properly.
	 * This will go away as soon as Xen has been modified to not
	 * save/restore %gs for normal hypercalls.
	 *
	 * On x86_64, this hack is not used for %gs, because gs points
	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
	 * must not zero %gs on x86_64.
	 *
	 * For x86_64, we need to zero %fs, otherwise we may get an
	 * exception between the new %fs descriptor being loaded and
	 * %fs being effectively cleared at __switch_to().
	 */
	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
		lazy_load_gs(0);
#else
		loadsegment(fs, 0);
#endif
	}

	xen_mc_batch();

	load_TLS_descriptor(t, cpu, 0);
	load_TLS_descriptor(t, cpu, 1);
	load_TLS_descriptor(t, cpu, 2);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
		BUG();
}
#endif

static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				const void *ptr)
{
	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
	u64 entry = *(u64 *)ptr;

	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);

	preempt_disable();

	xen_mc_flush();
	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
		BUG();

	preempt_enable();
}

static int cvt_gate_to_trap(int vector, const gate_desc *val,
			    struct trap_info *info)
{
	unsigned long addr;

	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
		return 0;

	info->vector = vector;

	addr = gate_offset(*val);
#ifdef CONFIG_X86_64
	/*
	 * Look for known traps using IST, and substitute them
	 * appropriately. The debugger ones are the only ones we care
	 * about. Xen will handle faults like double_fault,
	 * so we should never see them. Warn if
	 * there's an unexpected IST-using fault handler.
	 */
	if (addr == (unsigned long)debug)
		addr = (unsigned long)xen_debug;
	else if (addr == (unsigned long)int3)
		addr = (unsigned long)xen_int3;
	else if (addr == (unsigned long)stack_segment)
		addr = (unsigned long)xen_stack_segment;
	else if (addr == (unsigned long)double_fault) {
		/* Don't need to handle these */
		return 0;
#ifdef CONFIG_X86_MCE
	} else if (addr == (unsigned long)machine_check) {
		/*
		 * When the Xen hypervisor injects a vMCE into the guest,
		 * use the native MCE handler to handle it.
		 */
		;
#endif
	} else if (addr == (unsigned long)nmi)
		/*
		 * Use the native version as well.
		 */
		;
	else {
		/* Some other trap using IST? */
		if (WARN_ON(val->ist != 0))
			return 0;
	}
#endif	/* CONFIG_X86_64 */
	info->address = addr;

	info->cs = gate_segment(*val);
	info->flags = val->dpl;
	/* interrupt gates clear IF */
	if (val->type == GATE_INTERRUPT)
		info->flags |= 1 << 2;

	return 1;
}

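/*
 * Worked example of the trap_info flags encoding built above: bits 0-1
 * carry the gate's descriptor privilege level and bit 2 tells Xen to
 * disable event delivery on entry, the paravirt analogue of an interrupt
 * gate clearing IF. So, roughly:
 *
 *	trap gate with DPL 3:		flags = 3
 *	interrupt gate with DPL 0:	flags = 0 | (1 << 2) = 4
 */
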
/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
	unsigned long p = (unsigned long)&dt[entrynum];
	unsigned long start, end;

	trace_xen_cpu_write_idt_entry(dt, entrynum, g);

	preempt_disable();

	start = __this_cpu_read(idt_desc.address);
	end = start + __this_cpu_read(idt_desc.size) + 1;

	xen_mc_flush();

	native_write_idt_entry(dt, entrynum, g);

	if (p >= start && (p + 8) <= end) {
		struct trap_info info[2];

		info[1].address = 0;

		if (cvt_gate_to_trap(entrynum, g, &info[0]))
			if (HYPERVISOR_set_trap_table(info))
				BUG();
	}

	preempt_enable();
}

static void xen_convert_trap_info(const struct desc_ptr *desc,
				  struct trap_info *traps)
{
	unsigned in, out, count;

	count = (desc->size+1) / sizeof(gate_desc);
	BUG_ON(count > 256);

	for (in = out = 0; in < count; in++) {
		gate_desc *entry = (gate_desc *)(desc->address) + in;

		if (cvt_gate_to_trap(in, entry, &traps[out]))
			out++;
	}
	traps[out].address = 0;
}

void xen_copy_trap_info(struct trap_info *traps)
{
	const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);

	xen_convert_trap_info(desc, traps);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	trace_xen_cpu_load_idt(desc);

	spin_lock(&lock);

	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
		BUG();

	spin_unlock(&lock);
}

/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	preempt_disable();

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);

		xen_mc_flush();
		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			BUG();
	}

	}

	preempt_enable();
}

/*
 * Version of write_gdt_entry for use at early boot-time needed to
 * update an entry as simply as possible.
 */
static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
					    const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = virt_to_machine(&dt[entry]);

		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			dt[entry] = *(struct desc_struct *)desc;
	}

	}
}

static void xen_load_sp0(struct tss_struct *tss,
			 struct thread_struct *thread)
{
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);
	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
	xen_mc_issue(PARAVIRT_LAZY_CPU);
	tss->x86_tss.sp0 = thread->sp0;
}

void xen_set_iopl_mask(unsigned mask)
{
	struct physdev_set_iopl set_iopl;

	/* Force the change at ring 0. */
	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
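
/*
 * For illustration: the mask passed to xen_set_iopl_mask() is a raw EFLAGS
 * value, so bits 12-13 (X86_EFLAGS_IOPL) hold the requested IOPL, and a
 * zero mask is mapped to IOPL 1 so the kernel keeps I/O port access:
 *
 *	mask == 0		-> set_iopl.iopl = 1
 *	mask == X86_EFLAGS_IOPL	-> set_iopl.iopl = (0x3000 >> 12) & 3 = 3
 */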

static void xen_io_delay(void)
{
}

static DEFINE_PER_CPU(unsigned long, xen_cr0_value);

static unsigned long xen_read_cr0(void)
{
	unsigned long cr0 = this_cpu_read(xen_cr0_value);

	if (unlikely(cr0 == 0)) {
		cr0 = native_read_cr0();
		this_cpu_write(xen_cr0_value, cr0);
	}

	return cr0;
}

static void xen_write_cr0(unsigned long cr0)
{
	struct multicall_space mcs;

	this_cpu_write(xen_cr0_value, cr0);

	/* Only pay attention to cr0.TS; everything else is
	   ignored. */
	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_write_cr4(unsigned long cr4)
{
	cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);

	native_write_cr4(cr4);
}
#ifdef CONFIG_X86_64
static inline unsigned long xen_read_cr8(void)
{
	return 0;
}
static inline void xen_write_cr8(unsigned long val)
{
	BUG_ON(val);
}
#endif

static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
	u64 val;

	if (pmu_msr_read(msr, &val, err))
		return val;

	val = native_read_msr_safe(msr, err);
	switch (msr) {
	case MSR_IA32_APICBASE:
#ifdef CONFIG_X86_X2APIC
		if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
#endif
			val &= ~X2APIC_ENABLE;
		break;
	}
	return val;
}

static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
	int ret;

	ret = 0;

	switch (msr) {
#ifdef CONFIG_X86_64
		unsigned which;
		u64 base;

	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

	set:
		base = ((u64)high << 32) | low;
		if (HYPERVISOR_set_segment_base(which, base) != 0)
			ret = -EIO;
		break;
#endif

	case MSR_STAR:
	case MSR_CSTAR:
	case MSR_LSTAR:
	case MSR_SYSCALL_MASK:
	case MSR_IA32_SYSENTER_CS:
	case MSR_IA32_SYSENTER_ESP:
	case MSR_IA32_SYSENTER_EIP:
		/* Fast syscall setup is all done in hypercalls, so
		   these are all ignored.  Stub them out here to stop
		   Xen console noise. */
		break;

	default:
		if (!pmu_msr_write(msr, low, high, &ret))
			ret = native_write_msr_safe(msr, low, high);
	}

	return ret;
}

static u64 xen_read_msr(unsigned int msr)
{
	/*
	 * This will silently swallow a #GP from RDMSR.  It may be worth
	 * changing that.
	 */
	int err;

	return xen_read_msr_safe(msr, &err);
}

static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
{
	/*
	 * This will silently swallow a #GP from WRMSR.  It may be worth
	 * changing that.
	 */
	xen_write_msr_safe(msr, low, high);
}

void xen_setup_shared_info(void)
{
	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		set_fixmap(FIX_PARAVIRT_BOOTMAP,
			   xen_start_info->shared_info);

		HYPERVISOR_shared_info =
			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
	} else
		HYPERVISOR_shared_info =
			(struct shared_info *)__va(xen_start_info->shared_info);

#ifndef CONFIG_SMP
	/* In UP this is as good a place as any to set up shared info */
	xen_setup_vcpu_info_placement();
#endif

	xen_setup_mfn_list_list();
}

/* This is called once we have the cpu_possible_mask */
void xen_setup_vcpu_info_placement(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		/* Set up direct vCPU id mapping for PV guests. */
		per_cpu(xen_vcpu_id, cpu) = cpu;
		xen_vcpu_setup(cpu);
	}

	/*
	 * xen_vcpu_setup managed to place the vcpu_info within the
	 * percpu area for all cpus, so make use of it.
	 */
	if (have_vcpu_info_placement) {
		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
	}
}

static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

#define SITE(op, x)						\
	case PARAVIRT_PATCH(op.x):				\
	if (have_vcpu_info_placement) {				\
		start = (char *)xen_##x##_direct;		\
		end = xen_##x##_direct_end;			\
		reloc = xen_##x##_direct_reloc;			\
	}							\
	goto patch_site

	switch (type) {
		SITE(pv_irq_ops, irq_enable);
		SITE(pv_irq_ops, irq_disable);
		SITE(pv_irq_ops, save_fl);
		SITE(pv_irq_ops, restore_fl);
#undef SITE

	patch_site:
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			long delta = start - (char *)addr;

			*relocp += delta;
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}

static const struct pv_info xen_info __initconst = {
	.shared_kernel_pmd = 0,

#ifdef CONFIG_X86_64
	.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
	.name = "Xen",
};

static const struct pv_init_ops xen_init_ops __initconst = {
	.patch = xen_patch,
};

static const struct pv_cpu_ops xen_cpu_ops __initconst = {
	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.read_cr0 = xen_read_cr0,
	.write_cr0 = xen_write_cr0,

	.read_cr4 = native_read_cr4,
	.write_cr4 = xen_write_cr4,

#ifdef CONFIG_X86_64
	.read_cr8 = xen_read_cr8,
	.write_cr8 = xen_write_cr8,
#endif

	.wbinvd = native_wbinvd,

	.read_msr = xen_read_msr,
	.write_msr = xen_write_msr,

	.read_msr_safe = xen_read_msr_safe,
	.write_msr_safe = xen_write_msr_safe,

	.read_pmc = xen_read_pmc,

	.iret = xen_iret,
#ifdef CONFIG_X86_64
	.usergs_sysret64 = xen_sysret64,
#endif

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
	.load_gs_index = xen_load_gs_index,
#endif

	.alloc_ldt = xen_alloc_ldt,
	.free_ldt = xen_free_ldt,

	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_sp0 = xen_load_sp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

	/* Xen takes care of %gs when switching to usermode for us */
	.swapgs = paravirt_nop,

	.start_context_switch = paravirt_start_context_switch,
	.end_context_switch = xen_end_context_switch,
};

static void xen_reboot(int reason)
{
	struct sched_shutdown r = { .reason = reason };
	int cpu;

	for_each_online_cpu(cpu)
		xen_pmu_finish(cpu);

	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
		BUG();
}

static void xen_restart(char *msg)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_emergency_restart(void)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_machine_halt(void)
{
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_machine_power_off(void)
{
	if (pm_power_off)
		pm_power_off();
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_crash_shutdown(struct pt_regs *regs)
{
	xen_reboot(SHUTDOWN_crash);
}

static int
xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	if (!kexec_crash_loaded())
		xen_reboot(SHUTDOWN_crash);
	return NOTIFY_DONE;
}

static struct notifier_block xen_panic_block = {
	.notifier_call = xen_panic_event,
	.priority = INT_MIN
};

int xen_panic_handler_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list,
				       &xen_panic_block);
	return 0;
}

static const struct machine_ops xen_machine_ops __initconst = {
	.restart = xen_restart,
	.halt = xen_machine_halt,
	.power_off = xen_machine_power_off,
	.shutdown = xen_machine_halt,
	.crash_shutdown = xen_crash_shutdown,
	.emergency_restart = xen_emergency_restart,
};

static unsigned char xen_get_nmi_reason(void)
{
	unsigned char reason = 0;

	/* Construct a value which looks like it came from port 0x61. */
	if (test_bit(_XEN_NMIREASON_io_error,
		     &HYPERVISOR_shared_info->arch.nmi_reason))
		reason |= NMI_REASON_IOCHK;
	if (test_bit(_XEN_NMIREASON_pci_serr,
		     &HYPERVISOR_shared_info->arch.nmi_reason))
		reason |= NMI_REASON_SERR;

	return reason;
}

static void __init xen_boot_params_init_edd(void)
{
#if IS_ENABLED(CONFIG_EDD)
	struct xen_platform_op op;
	struct edd_info *edd_info;
	u32 *mbr_signature;
	unsigned nr;
	int ret;

	edd_info = boot_params.eddbuf;
	mbr_signature = boot_params.edd_mbr_sig_buffer;

	op.cmd = XENPF_firmware_info;

	op.u.firmware_info.type = XEN_FW_DISK_INFO;
	for (nr = 0; nr < EDDMAXNR; nr++) {
		struct edd_info *info = edd_info + nr;

		op.u.firmware_info.index = nr;
		info->params.length = sizeof(info->params);
		set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
				     &info->params);
		ret = HYPERVISOR_platform_op(&op);
		if (ret)
			break;

#define C(x) info->x = op.u.firmware_info.u.disk_info.x
		C(device);
		C(version);
		C(interface_support);
		C(legacy_max_cylinder);
		C(legacy_max_head);
		C(legacy_sectors_per_track);
#undef C
	}
	boot_params.eddbuf_entries = nr;

	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
		op.u.firmware_info.index = nr;
		ret = HYPERVISOR_platform_op(&op);
		if (ret)
			break;
		mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
	}
	boot_params.edd_mbr_sig_buf_entries = nr;
#endif
}

/*
 * Set up the GDT and segment registers for -fstack-protector.  Until
 * we do this, we have to be careful not to call any stack-protected
 * function, which is most of the kernel.
 */
static void xen_setup_gdt(int cpu)
{
	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
	pv_cpu_ops.load_gdt = xen_load_gdt_boot;

	setup_stack_canary_segment(0);
	switch_to_new_gdt(0);

	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
	pv_cpu_ops.load_gdt = xen_load_gdt;
}

static void __init xen_dom0_set_legacy_features(void)
{
	x86_platform.legacy.rtc = 1;
}

static int xen_cpuhp_setup(void)
{
	int rc;

	rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
				       "x86/xen/hvm_guest:prepare",
				       xen_cpu_up_prepare, xen_cpu_dead);
	if (rc >= 0) {
		rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					       "x86/xen/hvm_guest:online",
					       xen_cpu_up_online, NULL);
		if (rc < 0)
			cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);
	}

	return rc >= 0 ? 0 : rc;
}

/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
	struct physdev_set_iopl set_iopl;
	unsigned long initrd_start = 0;
	int rc;

	if (!xen_start_info)
		return;

	xen_domain_type = XEN_PV_DOMAIN;

	xen_setup_features();

	xen_setup_machphys_mapping();

	/* Install Xen paravirt ops */
	pv_info = xen_info;
	pv_init_ops = xen_init_ops;
	pv_cpu_ops = xen_cpu_ops;

	x86_platform.get_nmi_reason = xen_get_nmi_reason;

	x86_init.resources.memory_setup = xen_memory_setup;
	x86_init.oem.arch_setup = xen_arch_setup;
	x86_init.oem.banner = xen_banner;

	xen_init_time_ops();

	/*
	 * Set up some pagetable state before starting to set any ptes.
	 */

	xen_init_mmu_ops();

	/* Prevent unwanted bits from being set in PTEs. */
	__supported_pte_mask &= ~_PAGE_GLOBAL;

	/*
	 * Prevent page tables from being allocated in highmem, even
	 * if CONFIG_HIGHPTE is enabled.
	 */
	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;

	/* Work out if we support NX */
	x86_configure_nx();

	/* Get mfn list */
	xen_build_dynamic_phys_to_machine();

	/*
	 * Set up kernel GDT and segment registers, mainly so that
	 * -fstack-protector code can be executed.
	 */
	xen_setup_gdt(0);

	xen_init_irq_ops();
	xen_init_cpuid_mask();

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * set up the basic apic ops.
	 */
	xen_init_apic();
#endif

	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
	}

	machine_ops = xen_machine_ops;

	/*
	 * The only reliable way to retain the initial address of the
	 * percpu gdt_page is to remember it here, so we can go and
	 * mark it RW later, when the initial percpu area is freed.
	 */
	xen_initial_gdt = &per_cpu(gdt_page, 0);

	xen_smp_init();

#ifdef CONFIG_ACPI_NUMA
	/*
	 * The pages we get from Xen are not related to machine pages, so
	 * any NUMA information the kernel tries to get from ACPI will
	 * be meaningless. Prevent it from trying.
	 */
	acpi_numa = -1;
#endif
	/* Don't do the full vcpu_info placement stuff until we have a
	   possible map and a non-dummy shared_info. */
	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];

	WARN_ON(xen_cpuhp_setup());

	local_irq_disable();
	early_boot_irqs_disabled = true;

	xen_raw_console_write("mapping kernel into physical memory\n");
	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
				   xen_start_info->nr_pages);
	xen_reserve_special_pages();

	/* keep using Xen gdt for now; no urgent need to change it */

#ifdef CONFIG_X86_32
	pv_info.kernel_rpl = 1;
	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		pv_info.kernel_rpl = 0;
#else
	pv_info.kernel_rpl = 0;
#endif
	/* set the limit of our address space */
	xen_reserve_top();

	/*
	 * We used to do this in xen_arch_setup, but that is too late
	 * on AMD, where early_cpu_init (run before ->arch_setup()) calls
	 * early_amd_init which pokes 0xcf8 port.
	 */
	set_iopl.iopl = 1;
	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	if (rc != 0)
		xen_raw_printk("physdev_op failed %d\n", rc);

#ifdef CONFIG_X86_32
	/* set up basic CPUID stuff */
	cpu_detect(&new_cpu_data);
	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
	new_cpu_data.wp_works_ok = 1;
	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
#endif

	if (xen_start_info->mod_start) {
		if (xen_start_info->flags & SIF_MOD_START_PFN)
			initrd_start = PFN_PHYS(xen_start_info->mod_start);
		else
			initrd_start = __pa(xen_start_info->mod_start);
	}

	/* Poke various useful things into boot_params */
	boot_params.hdr.type_of_loader = (9 << 4) | 0;
	boot_params.hdr.ramdisk_image = initrd_start;
	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
	boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;

	if (!xen_initial_domain()) {
		add_preferred_console("xenboot", 0, NULL);
		add_preferred_console("tty", 0, NULL);
		add_preferred_console("hvc", 0, NULL);
		if (pci_xen)
			x86_init.pci.arch_init = pci_xen_init;
	} else {
		const struct dom0_vga_console_info *info =
			(void *)((char *)xen_start_info +
				 xen_start_info->console.dom0.info_off);
		struct xen_platform_op op = {
			.cmd = XENPF_firmware_info,
			.interface_version = XENPF_INTERFACE_VERSION,
			.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
		};

		x86_platform.set_legacy_features =
				xen_dom0_set_legacy_features;
		xen_init_vga(info, xen_start_info->console.dom0.info_size);
		xen_start_info->console.domU.mfn = 0;
		xen_start_info->console.domU.evtchn = 0;

		if (HYPERVISOR_platform_op(&op) == 0)
			boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;

		/* Make sure ACS will be enabled */
		pci_request_acs();

		xen_acpi_sleep_register();

		/* Avoid searching for BIOS MP tables */
		x86_init.mpparse.find_smp_config = x86_init_noop;
		x86_init.mpparse.get_smp_config = x86_init_uint_noop;

		xen_boot_params_init_edd();
	}
#ifdef CONFIG_PCI
	/* PCI BIOS service won't work from a PV guest. */
	pci_probe &= ~PCI_PROBE_BIOS;
#endif
	xen_raw_console_write("about to get started...\n");

	/* Let's presume PV guests always boot on vCPU with id 0. */
	per_cpu(xen_vcpu_id, 0) = 0;

	xen_setup_runstate_info(0);

	xen_efi_init();

	/* Start the world */
#ifdef CONFIG_X86_32
	i386_start_kernel();
#else
	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}

#ifdef CONFIG_XEN_PVH

static void xen_pvh_arch_setup(void)
{
#ifdef CONFIG_ACPI
	/* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */
	if (nr_ioapics == 0)
		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
#endif
}

static void __init init_pvh_bootparams(void)
{
	struct xen_memory_map memmap;
	unsigned int i;
	int rc;

	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));

	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map);
	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map);
	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc) {
		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
		BUG();
	}

	if (memmap.nr_entries < E820MAX - 1) {
		pvh_bootparams.e820_map[memmap.nr_entries].addr =
			ISA_START_ADDRESS;
		pvh_bootparams.e820_map[memmap.nr_entries].size =
			ISA_END_ADDRESS - ISA_START_ADDRESS;
		pvh_bootparams.e820_map[memmap.nr_entries].type =
			E820_RESERVED;
		memmap.nr_entries++;
	} else
		xen_raw_printk("Warning: Can't fit ISA range into e820\n");

	sanitize_e820_map(pvh_bootparams.e820_map,
			  ARRAY_SIZE(pvh_bootparams.e820_map),
			  &memmap.nr_entries);

	pvh_bootparams.e820_entries = memmap.nr_entries;
	for (i = 0; i < pvh_bootparams.e820_entries; i++)
		e820_add_region(pvh_bootparams.e820_map[i].addr,
				pvh_bootparams.e820_map[i].size,
				pvh_bootparams.e820_map[i].type);

	pvh_bootparams.hdr.cmd_line_ptr =
		pvh_start_info.cmdline_paddr;

	/* The first module is always ramdisk. */
	if (pvh_start_info.nr_modules) {
		struct hvm_modlist_entry *modaddr =
			__va(pvh_start_info.modlist_paddr);
		pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
		pvh_bootparams.hdr.ramdisk_size = modaddr->size;
	}

	/*
	 * See Documentation/x86/boot.txt.
	 *
	 * Version 2.12 supports Xen entry point but we will use default x86/PC
	 * environment (i.e. hardware_subarch 0).
	 */
	pvh_bootparams.hdr.version = 0x212;
	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
}

/*
 * This routine (and those that it might call) should not use
 * anything that lives in .bss since that segment will be cleared later.
 */
void __init xen_prepare_pvh(void)
{
	u32 msr;
	u64 pfn;

	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
				pvh_start_info.magic);
		BUG();
	}

	xen_pvh = 1;

	msr = cpuid_ebx(xen_cpuid_base() + 2);
	pfn = __pa(hypercall_page);
	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));

	init_pvh_bootparams();

	x86_init.oem.arch_setup = xen_pvh_arch_setup;
}
#endif

void __ref xen_hvm_init_shared_info(void)
{
	int cpu;
	struct xen_add_to_physmap xatp;
	static struct shared_info *shared_info_page = 0;

	if (!shared_info_page)
		shared_info_page = (struct shared_info *)
			extend_brk(PAGE_SIZE, PAGE_SIZE);
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		BUG();

	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;

	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
	 * page; we use it in the event channel upcall and in some pvclock
	 * related functions. We don't need the vcpu_info placement
	 * optimizations because we don't use any pv_mmu or pv_irq op on
	 * HVM.
	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
	 * online but xen_hvm_init_shared_info is run at resume time too and
	 * in that case multiple vcpus might be online. */
	for_each_online_cpu(cpu) {
		/* Leave it to be NULL. */
		if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS)
			continue;
		per_cpu(xen_vcpu, cpu) =
			&HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
	}
}

#ifdef CONFIG_XEN_PVHVM
static void __init init_hvm_pv_info(void)
{
	int major, minor;
	uint32_t eax, ebx, ecx, edx, base;

	base = xen_cpuid_base();
	eax = cpuid_eax(base + 1);

	major = eax >> 16;
	minor = eax & 0xffff;
	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);

	xen_domain_type = XEN_HVM_DOMAIN;

	/* PVH set up hypercall page in xen_prepare_pvh(). */
	if (xen_pvh_domain())
		pv_info.name = "Xen PVH";
	else {
		u64 pfn;
		uint32_t msr;

		pv_info.name = "Xen HVM";
		msr = cpuid_ebx(base + 2);
		pfn = __pa(hypercall_page);
		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
	}

	xen_setup_features();

	cpuid(base + 4, &eax, &ebx, &ecx, &edx);
	if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT)
		this_cpu_write(xen_vcpu_id, ebx);
	else
		this_cpu_write(xen_vcpu_id, smp_processor_id());
}
#endif

static int xen_cpu_up_prepare(unsigned int cpu)
{
	int rc;

	if (xen_hvm_domain()) {
		/*
		 * This can happen if CPU was offlined earlier and
		 * offlining timed out in common_cpu_die().
		 */
		if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
			xen_smp_intr_free(cpu);
			xen_uninit_lock_cpu(cpu);
		}

		if (cpu_acpi_id(cpu) != U32_MAX)
			per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
		else
			per_cpu(xen_vcpu_id, cpu) = cpu;
		xen_vcpu_setup(cpu);
	}

	if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
		xen_setup_timer(cpu);

	rc = xen_smp_intr_init(cpu);
	if (rc) {
		WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
		     cpu, rc);
		return rc;
	}
	return 0;
}

static int xen_cpu_dead(unsigned int cpu)
{
	xen_smp_intr_free(cpu);

	if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
		xen_teardown_timer(cpu);

	return 0;
}

static int xen_cpu_up_online(unsigned int cpu)
{
	xen_init_lock_cpu(cpu);
	return 0;
}

#ifdef CONFIG_XEN_PVHVM
#ifdef CONFIG_KEXEC_CORE
static void xen_hvm_shutdown(void)
{
	native_machine_shutdown();
	if (kexec_in_progress)
		xen_reboot(SHUTDOWN_soft_reset);
}

static void xen_hvm_crash_shutdown(struct pt_regs *regs)
{
	native_machine_crash_shutdown(regs);
	xen_reboot(SHUTDOWN_soft_reset);
}
#endif

static void __init xen_hvm_guest_init(void)
{
	if (xen_pv_domain())
		return;

	init_hvm_pv_info();

	xen_hvm_init_shared_info();

	xen_panic_handler_init();

	BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));

	xen_hvm_smp_init();
	WARN_ON(xen_cpuhp_setup());
	xen_unplug_emulated_devices();
	x86_init.irqs.intr_init = xen_init_IRQ;
	xen_hvm_init_time_ops();
	xen_hvm_init_mmu_ops();

	if (xen_pvh_domain())
		machine_ops.emergency_restart = xen_emergency_restart;
#ifdef CONFIG_KEXEC_CORE
	machine_ops.shutdown = xen_hvm_shutdown;
	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
#endif
}
#endif

static bool xen_nopv = false;
static __init int xen_parse_nopv(char *arg)
{
	xen_nopv = true;
	return 0;
}
early_param("xen_nopv", xen_parse_nopv);

static uint32_t __init xen_platform(void)
{
	if (xen_nopv)
		return 0;

	return xen_cpuid_base();
}

bool xen_hvm_need_lapic(void)
{
	if (xen_nopv)
		return false;
	if (xen_pv_domain())
		return false;
	if (!xen_hvm_domain())
		return false;
	if (xen_feature(XENFEAT_hvm_pirqs))
		return false;
	return true;
}
EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);

static void xen_set_cpu_features(struct cpuinfo_x86 *c)
{
	if (xen_pv_domain()) {
		clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
		set_cpu_cap(c, X86_FEATURE_XENPV);
	}
}

static void xen_pin_vcpu(int cpu)
{
	static bool disable_pinning;
	struct sched_pin_override pin_override;
	int ret;

	if (disable_pinning)
		return;

	pin_override.pcpu = cpu;
	ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override);

	/* Ignore errors when removing override. */
	if (cpu < 0)
		return;

	switch (ret) {
	case -ENOSYS:
		pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n",
			cpu);
		disable_pinning = true;
		break;
	case -EPERM:
		WARN(1, "Trying to pin vcpu without having privilege to do so\n");
		disable_pinning = true;
		break;
	case -EINVAL:
	case -EBUSY:
		pr_warn("Physical cpu %d not available for pinning. Check Xen cpu configuration.\n",
			cpu);
		break;
	case 0:
		break;
	default:
		WARN(1, "rc %d while trying to pin vcpu\n", ret);
		disable_pinning = true;
	}
}

const struct hypervisor_x86 x86_hyper_xen = {
	.name = "Xen",
	.detect = xen_platform,
#ifdef CONFIG_XEN_PVHVM
	.init_platform = xen_hvm_guest_init,
#endif
	.x2apic_available = xen_x2apic_para_available,
	.set_cpu_features = xen_set_cpu_features,
	.pin_vcpu = xen_pin_vcpu,
};
EXPORT_SYMBOL(x86_hyper_xen);

#ifdef CONFIG_HOTPLUG_CPU
void xen_arch_register_cpu(int num)
{
	arch_register_cpu(num);
}
EXPORT_SYMBOL(xen_arch_register_cpu);

void xen_arch_unregister_cpu(int num)
{
	arch_unregister_cpu(num);
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif