/*
 * Core of Xen paravirt_ops implementation.
 *
 * This file contains the xen_paravirt_ops structure itself, and the
 * implementations for:
 *   - privileged instructions
 *   - interrupt flags
 *   - segment operations
 *   - booting and setup
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>

#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/sched.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvc-console.h>

#include <asm/paravirt.h>
#include <asm/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/msr-index.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>

#include "xen-ops.h"
#include "mmu.h"
#include "multicalls.h"

EXPORT_SYMBOL_GPL(hypercall_page);

DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB.
 */
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;

#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);		/* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	/* actual vcpu cr3 */

struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

struct shared_info xen_dummy_shared_info;

/*
 * Point at some empty memory to start with.  We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs.  We assume it is to start with, and then set it to zero on
 * the first failure.  This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (i.e. the buffer can't cross a page boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;

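/*
 * Point this cpu's xen_vcpu pointer at its vcpu_info.  By default that
 * is the slot in the shared_info page; when available, we also ask the
 * hypervisor to place the vcpu_info in our percpu area instead, so it
 * can be reached with a single percpu access.
 */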
static void xen_vcpu_setup(int cpu)
{
	struct vcpu_register_vcpu_info info;
	int err;
	struct vcpu_info *vcpup;

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];

	if (!have_vcpu_info_placement)
		return;		/* already tested, not available */

	vcpup = &per_cpu(xen_vcpu_info, cpu);

	info.mfn = virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
	       cpu, vcpup, info.mfn, info.offset);

	/* Check to see if the hypervisor will put the vcpu_info
	   structure where we want it, which allows direct access via
	   a percpu-variable. */
	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);

	if (err) {
		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
		have_vcpu_info_placement = 0;
	} else {
		/* This cpu is using the registered vcpu info, even if
		   later ones fail to. */
		per_cpu(xen_vcpu, cpu) = vcpup;

		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
		       cpu, vcpup);
	}
}

/*
 * On restore, set the vcpu placement up again.
 * If it fails, then we're in a bad state, since
 * we can't back out from using it...
 */
void xen_vcpu_restore(void)
{
	if (have_vcpu_info_placement) {
		int cpu;

		for_each_online_cpu(cpu) {
			bool other_cpu = (cpu != smp_processor_id());

			if (other_cpu &&
			    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
				BUG();

			xen_vcpu_setup(cpu);

			if (other_cpu &&
			    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
				BUG();
		}

		BUG_ON(!have_vcpu_info_placement);
	}
}

static void __init xen_banner(void)
{
	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
	struct xen_extraversion extra;
	HYPERVISOR_xen_version(XENVER_extraversion, &extra);

	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
	       pv_info.name);
	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
	       version >> 16, version & 0xffff, extra.extraversion,
	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}

static void xen_cpuid(unsigned int *ax, unsigned int *bx,
		      unsigned int *cx, unsigned int *dx)
{
	unsigned maskedx = ~0;

	/*
	 * Mask out inconvenient features, to try and disable as many
	 * unsupported kernel subsystems as possible.
	 */
	if (*ax == 1)
		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
			    (1 << X86_FEATURE_MCE)  |  /* disable MCE */
			    (1 << X86_FEATURE_MCA)  |  /* disable MCA */
			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */

	asm(XEN_EMULATE_PREFIX "cpuid"
		: "=a" (*ax),
		  "=b" (*bx),
		  "=c" (*cx),
		  "=d" (*dx)
		: "0" (*ax), "2" (*cx));
	*dx &= maskedx;
}

static void xen_set_debugreg(int reg, unsigned long val)
{
	HYPERVISOR_set_debugreg(reg, val);
}

static unsigned long xen_get_debugreg(int reg)
{
	return HYPERVISOR_get_debugreg(reg);
}

static unsigned long xen_save_fl(void)
{
	struct vcpu_info *vcpu;
	unsigned long flags;

	vcpu = x86_read_percpu(xen_vcpu);

	/* flag has opposite sense of mask */
	flags = !vcpu->evtchn_upcall_mask;

	/* convert to IF type flag
	   -0 -> 0x00000000
	   -1 -> 0xffffffff
	*/
	return (-flags) & X86_EFLAGS_IF;
}

static void xen_restore_fl(unsigned long flags)
{
	struct vcpu_info *vcpu;

	/* convert from IF type flag */
	flags = !(flags & X86_EFLAGS_IF);

	/* There's a one instruction preempt window here.  We need to
	   make sure we don't switch CPUs between getting the vcpu
	   pointer and updating the mask. */
	preempt_disable();
	vcpu = x86_read_percpu(xen_vcpu);
	vcpu->evtchn_upcall_mask = flags;
	preempt_enable_no_resched();

	/* Doesn't matter if we get preempted here, because any
	   pending event will get dealt with anyway. */

	if (flags == 0) {
		preempt_check_resched();
		barrier(); /* unmask then check (avoid races) */
		if (unlikely(vcpu->evtchn_upcall_pending))
			force_evtchn_callback();
	}
}

static void xen_irq_disable(void)
{
	/* There's a one instruction preempt window here.  We need to
	   make sure we don't switch CPUs between getting the vcpu
	   pointer and updating the mask. */
	preempt_disable();
	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
	preempt_enable_no_resched();
}

static void xen_irq_enable(void)
{
	struct vcpu_info *vcpu;

	/* We don't need to worry about being preempted here, since
	   either a) interrupts are disabled, so no preemption, or b)
	   the caller is confused and is trying to re-enable interrupts
	   on an indeterminate processor. */

	vcpu = x86_read_percpu(xen_vcpu);
	vcpu->evtchn_upcall_mask = 0;

	/* Doesn't matter if we get preempted here, because any
	   pending event will get dealt with anyway. */

	barrier(); /* unmask then check (avoid races) */
	if (unlikely(vcpu->evtchn_upcall_pending))
		force_evtchn_callback();
}

static void xen_safe_halt(void)
{
	/* Blocking includes an implicit local_irq_enable(). */
	if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
		BUG();
}

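/*
 * A vcpu that blocks with events masked would never be woken, so treat
 * halt-with-interrupts-disabled as taking this vcpu offline; otherwise
 * just block until the next event arrives.
 */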
static void xen_halt(void)
{
	if (irqs_disabled())
		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
	else
		xen_safe_halt();
}

static void xen_leave_lazy(void)
{
	paravirt_leave_lazy(paravirt_get_lazy_mode());
	xen_mc_flush();
}

static unsigned long xen_store_tr(void)
{
	return 0;
}

static void xen_set_ldt(const void *addr, unsigned entries)
{
	struct mmuext_op *op;
	struct multicall_space mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_SET_LDT;
	op->arg1.linear_addr = (unsigned long)addr;
	op->arg2.nr_ents = entries;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_load_gdt(const struct desc_ptr *dtr)
{
	unsigned long *frames;
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	int f;
	struct multicall_space mcs;

	/* A GDT can be up to 64k in size, which corresponds to 8192
	   8-byte entries, or 16 4k pages. */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	mcs = xen_mc_entry(sizeof(*frames) * pages);
	frames = mcs.args;

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		frames[f] = virt_to_mfn(va);
		make_lowmem_page_readonly((void *)va);
	}

	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void load_TLS_descriptor(struct thread_struct *t,
				unsigned int cpu, unsigned int i)
{
	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	struct multicall_space mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}

static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
	/*
	 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
	 * it means we're in a context switch, and %gs has just been
	 * saved.  This means we can zero it out to prevent faults on
	 * exit from the hypervisor if the next process has no %gs.
	 * Either way, it has been saved, and the new value will get
	 * loaded properly.  This will go away as soon as Xen has been
	 * modified to not save/restore %gs for normal hypercalls.
	 *
	 * On x86_64, this hack is not used for %gs, because gs points
	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
	 * must not zero %gs on x86_64.
	 *
	 * For x86_64, we need to zero %fs, otherwise we may get an
	 * exception between the new %fs descriptor being loaded and
	 * %fs being effectively cleared at __switch_to().
	 */
	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
		loadsegment(gs, 0);
#else
		loadsegment(fs, 0);
#endif
	}

	xen_mc_batch();

	load_TLS_descriptor(t, cpu, 0);
	load_TLS_descriptor(t, cpu, 1);
	load_TLS_descriptor(t, cpu, 2);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
		BUG();
}
#endif

static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				const void *ptr)
{
	unsigned long lp = (unsigned long)&dt[entrynum];
	xmaddr_t mach_lp = virt_to_machine(lp);
	u64 entry = *(u64 *)ptr;

	preempt_disable();

	xen_mc_flush();
	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
		BUG();

	preempt_enable();
}

static int cvt_gate_to_trap(int vector, const gate_desc *val,
			    struct trap_info *info)
{
	if (val->type != 0xf && val->type != 0xe)
		return 0;

	info->vector = vector;
	info->address = gate_offset(*val);
	info->cs = gate_segment(*val);
	info->flags = val->dpl;
	/* interrupt gates clear IF */
	if (val->type == 0xe)
		info->flags |= 4;

	return 1;
}

/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
	unsigned long p = (unsigned long)&dt[entrynum];
	unsigned long start, end;

	preempt_disable();

	start = __get_cpu_var(idt_desc).address;
	end = start + __get_cpu_var(idt_desc).size + 1;

	xen_mc_flush();

	native_write_idt_entry(dt, entrynum, g);

	if (p >= start && (p + 8) <= end) {
		struct trap_info info[2];

		info[1].address = 0;

		if (cvt_gate_to_trap(entrynum, g, &info[0]))
			if (HYPERVISOR_set_trap_table(info))
				BUG();
	}

	preempt_enable();
}

static void xen_convert_trap_info(const struct desc_ptr *desc,
				  struct trap_info *traps)
{
	unsigned in, out, count;

	count = (desc->size+1) / sizeof(gate_desc);
	BUG_ON(count > 256);

	for (in = out = 0; in < count; in++) {
		gate_desc *entry = (gate_desc*)(desc->address) + in;

		if (cvt_gate_to_trap(in, entry, &traps[out]))
			out++;
	}
	traps[out].address = 0;
}

void xen_copy_trap_info(struct trap_info *traps)
{
	const struct desc_ptr *desc = &__get_cpu_var(idt_desc);

	xen_convert_trap_info(desc, traps);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	spin_lock(&lock);

	__get_cpu_var(idt_desc) = *desc;

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
		BUG();

	spin_unlock(&lock);
}

/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				const void *desc, int type)
{
	preempt_disable();

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = virt_to_machine(&dt[entry]);

		xen_mc_flush();
		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			BUG();
	}

	}

	preempt_enable();
}

static void xen_load_sp0(struct tss_struct *tss,
			 struct thread_struct *thread)
{
	struct multicall_space mcs = xen_mc_entry(0);
	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_set_iopl_mask(unsigned mask)
{
	struct physdev_set_iopl set_iopl;

	/* Force the change at ring 0. */
	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

static void xen_io_delay(void)
{
}

#ifdef CONFIG_X86_LOCAL_APIC
static u32 xen_apic_read(unsigned long reg)
{
	return 0;
}

static void xen_apic_write(unsigned long reg, u32 val)
{
	/* Warn to see if there are any stray references */
	WARN_ON(1);
}
#endif

static void xen_flush_tlb(void)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_flush_tlb_single(unsigned long addr)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = MMUEXT_INVLPG_LOCAL;
	op->arg1.linear_addr = addr & PAGE_MASK;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va)
{
	struct {
		struct mmuext_op op;
		cpumask_t mask;
	} *args;
	cpumask_t cpumask = *cpus;
	struct multicall_space mcs;

	/*
	 * A couple of (to be removed) sanity checks:
	 *
	 * - current CPU must not be in mask
	 * - mask must exist :)
	 */
	BUG_ON(cpus_empty(cpumask));
	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
	BUG_ON(!mm);

	/* If a CPU which we ran on has gone down, OK. */
	cpus_and(cpumask, cpumask, cpu_online_map);
	if (cpus_empty(cpumask))
		return;

	mcs = xen_mc_entry(sizeof(*args));
	args = mcs.args;
	args->mask = cpumask;
	args->op.arg2.vcpumask = &args->mask;

	if (va == TLB_FLUSH_ALL) {
		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	} else {
		args->op.cmd = MMUEXT_INVLPG_MULTI;
		args->op.arg1.linear_addr = va;
	}

	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

static void xen_clts(void)
{
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_write_cr0(unsigned long cr0)
{
	struct multicall_space mcs;

	/* Only pay attention to cr0.TS; everything else is
	   ignored. */
	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_write_cr2(unsigned long cr2)
{
	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
	return x86_read_percpu(xen_vcpu)->arch.cr2;
}

static unsigned long xen_read_cr2_direct(void)
{
	return x86_read_percpu(xen_vcpu_info.arch.cr2);
}

static void xen_write_cr4(unsigned long cr4)
{
	cr4 &= ~X86_CR4_PGE;
	cr4 &= ~X86_CR4_PSE;

	native_write_cr4(cr4);
}

static unsigned long xen_read_cr3(void)
{
	return x86_read_percpu(xen_cr3);
}

static void set_current_cr3(void *v)
{
	x86_write_percpu(xen_current_cr3, (unsigned long)v);
}

static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct mmuext_op *op;
	struct multicall_space mcs;
	unsigned long mfn;

	if (cr3)
		mfn = pfn_to_mfn(PFN_DOWN(cr3));
	else
		mfn = 0;

	WARN_ON(mfn == 0 && kernel);

	mcs = __xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
	op->arg1.mfn = mfn;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	if (kernel) {
		x86_write_percpu(xen_cr3, cr3);

		/* Update xen_current_cr3 once the batch has actually
		   been submitted. */
		xen_mc_callback(set_current_cr3, (void *)cr3);
	}
}

static void xen_write_cr3(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();  /* disables interrupts */

	/* Update while interrupts are disabled, so it's atomic with
	   respect to IPIs */
	x86_write_percpu(xen_cr3, cr3);

	__xen_write_cr3(true, cr3);

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
		if (user_pgd)
			__xen_write_cr3(false, __pa(user_pgd));
		else
			__xen_write_cr3(false, 0);
	}
#endif

	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
}

static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
	int ret;

	ret = 0;

	switch (msr) {
#ifdef CONFIG_X86_64
		unsigned which;
		u64 base;

	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

	set:
		base = ((u64)high << 32) | low;
		if (HYPERVISOR_set_segment_base(which, base) != 0)
			ret = -EFAULT;
		break;
#endif
	default:
		ret = native_write_msr_safe(msr, low, high);
	}

	return ret;
}

/* Early in boot, while setting up the initial pagetable, assume
   everything is pinned. */
static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);	/* should only be used early */
#endif
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}

/* Early release_pte assumes that all pts are pinned, since there's
   only init_mm and anything attached to that is pinned. */
static void xen_release_pte_init(u32 pfn)
{
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

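/* Issue a single synchronous mmuext operation, used to pin or unpin
   one pagetable frame. */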
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct mmuext_op op;
	op.cmd = cmd;
	op.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}

/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (PagePinned(virt_to_page(mm->pgd))) {
		SetPagePinned(page);

		if (!PageHighMem(page)) {
			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
			if (level == PT_PTE)
				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
		} else
			/* make sure there are no stray mappings of
			   this page */
			kmap_flush_unused();
	}
}

static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PTE);
}

static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PMD);
}

static int xen_pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = mm->pgd;
	int ret = 0;

	BUG_ON(PagePinned(virt_to_page(pgd)));

#ifdef CONFIG_X86_64
	{
		struct page *page = virt_to_page(pgd);
		pgd_t *user_pgd;

		BUG_ON(page->private != 0);

		ret = -ENOMEM;

		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		page->private = (unsigned long)user_pgd;

		if (user_pgd != NULL) {
			user_pgd[pgd_index(VSYSCALL_START)] =
				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
			ret = 0;
		}

		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
	}
#endif

	return ret;
}

static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	if (user_pgd)
		free_page((unsigned long)user_pgd);
#endif
}

/* This should never happen until we're OK to use struct page */
static void xen_release_ptpage(u32 pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (PagePinned(page)) {
		if (!PageHighMem(page)) {
			if (level == PT_PTE)
				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
		}
		ClearPagePinned(page);
	}
}

static void xen_release_pte(u32 pfn)
{
	xen_release_ptpage(pfn, PT_PTE);
}

static void xen_release_pmd(u32 pfn)
{
	xen_release_ptpage(pfn, PT_PMD);
}

#if PAGETABLE_LEVELS == 4
static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PUD);
}

static void xen_release_pud(u32 pfn)
{
	xen_release_ptpage(pfn, PT_PUD);
}
#endif

#ifdef CONFIG_HIGHPTE
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
	pgprot_t prot = PAGE_KERNEL;

	if (PagePinned(page))
		prot = PAGE_KERNEL_RO;

	if (0 && PageHighMem(page))
		printk("mapping highpte %lx type %d prot %s\n",
		       page_to_pfn(page), type,
		       (unsigned long)pgprot_val(prot) & _PAGE_RW ?
		       "WRITE" : "READ");

	return kmap_atomic_prot(page, type, prot);
}
#endif

static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
			       pte_val_ma(pte));

	return pte;
}

/* Init-time set_pte while constructing initial pagetables, which
   doesn't allow RO pagetable pages to be remapped RW */
static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
{
	pte = mask_rw_pte(ptep, pte);

	xen_set_pte(ptep, pte);
}

static __init void xen_pagetable_setup_start(pgd_t *base)
{
}

void xen_setup_shared_info(void)
{
	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		set_fixmap(FIX_PARAVIRT_BOOTMAP,
			   xen_start_info->shared_info);

		HYPERVISOR_shared_info =
			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
	} else
		HYPERVISOR_shared_info =
			(struct shared_info *)__va(xen_start_info->shared_info);

#ifndef CONFIG_SMP
	/* In UP this is as good a place as any to set up shared info */
	xen_setup_vcpu_info_placement();
#endif

	xen_setup_mfn_list_list();
}

static __init void xen_pagetable_setup_done(pgd_t *base)
{
	xen_setup_shared_info();
}

static __init void xen_post_allocator_init(void)
{
	pv_mmu_ops.set_pte = xen_set_pte;
	pv_mmu_ops.set_pmd = xen_set_pmd;
	pv_mmu_ops.set_pud = xen_set_pud;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.set_pgd = xen_set_pgd;
#endif

	/* This will work as long as patching hasn't happened yet
	   (which it hasn't) */
	pv_mmu_ops.alloc_pte = xen_alloc_pte;
	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
	pv_mmu_ops.release_pte = xen_release_pte;
	pv_mmu_ops.release_pmd = xen_release_pmd;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.alloc_pud = xen_alloc_pud;
	pv_mmu_ops.release_pud = xen_release_pud;
#endif

#ifdef CONFIG_X86_64
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
	xen_mark_init_mm_pinned();
}

/* This is called once we have the cpu_possible_map */
void xen_setup_vcpu_info_placement(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		xen_vcpu_setup(cpu);

	/* If xen_vcpu_setup managed to place the vcpu_info within the
	   percpu area for all cpus, make use of it */
#ifdef CONFIG_X86_32
	if (have_vcpu_info_placement) {
		printk(KERN_INFO "Xen: using vcpu_info placement\n");

		pv_irq_ops.save_fl = xen_save_fl_direct;
		pv_irq_ops.restore_fl = xen_restore_fl_direct;
		pv_irq_ops.irq_disable = xen_irq_disable_direct;
		pv_irq_ops.irq_enable = xen_irq_enable_direct;
		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
	}
#endif
}

/*
 * Replace the listed irq-flag call sites with their direct-access
 * variants when vcpu_info placement is in use; everything else falls
 * back to the default paravirt patching.
 */
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

#define SITE(op, x)							\
	case PARAVIRT_PATCH(op.x):					\
	if (have_vcpu_info_placement) {					\
		start = (char *)xen_##x##_direct;			\
		end = xen_##x##_direct_end;				\
		reloc = xen_##x##_direct_reloc;				\
	}								\
	goto patch_site

	switch (type) {
#ifdef CONFIG_X86_32
		SITE(pv_irq_ops, irq_enable);
		SITE(pv_irq_ops, irq_disable);
		SITE(pv_irq_ops, save_fl);
		SITE(pv_irq_ops, restore_fl);
#endif /* CONFIG_X86_32 */
#undef SITE

	patch_site:
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			long delta = start - (char *)addr;

			*relocp += delta;
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}

static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
{
	pte_t pte;

	phys >>= PAGE_SHIFT;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
	case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
#endif
		pte = pfn_pte(phys, prot);
		break;

	default:
		pte = mfn_pte(phys, prot);
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_64
	/* Replicate changes to map the vsyscall page into the user
	   pagetable vsyscall mapping. */
	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
		unsigned long vaddr = __fix_to_virt(idx);
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
	}
#endif
}

static const struct pv_info xen_info __initdata = {
	.paravirt_enabled = 1,
	.shared_kernel_pmd = 0,

	.name = "Xen",
};

static const struct pv_init_ops xen_init_ops __initdata = {
	.patch = xen_patch,

	.banner = xen_banner,
	.memory_setup = xen_memory_setup,
	.arch_setup = xen_arch_setup,
	.post_allocator_init = xen_post_allocator_init,
};

static const struct pv_time_ops xen_time_ops __initdata = {
	.time_init = xen_time_init,

	.set_wallclock = xen_set_wallclock,
	.get_wallclock = xen_get_wallclock,
	.get_tsc_khz = xen_tsc_khz,
	.sched_clock = xen_sched_clock,
};

static const struct pv_cpu_ops xen_cpu_ops __initdata = {
	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.clts = xen_clts,

	.read_cr0 = native_read_cr0,
	.write_cr0 = xen_write_cr0,

	.read_cr4 = native_read_cr4,
	.read_cr4_safe = native_read_cr4_safe,
	.write_cr4 = xen_write_cr4,

	.wbinvd = native_wbinvd,

	.read_msr = native_read_msr_safe,
	.write_msr = xen_write_msr_safe,
	.read_tsc = native_read_tsc,
	.read_pmc = native_read_pmc,

	.iret = xen_iret,
	.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
	.usergs_sysret32 = xen_sysret32,
	.usergs_sysret64 = xen_sysret64,
#endif

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
	.load_gs_index = xen_load_gs_index,
#endif

	.store_gdt = native_store_gdt,
	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_sp0 = xen_load_sp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

	/* Xen takes care of %gs when switching to usermode for us */
	.swapgs = paravirt_nop,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_cpu,
		.leave = xen_leave_lazy,
	},
};

static void __init __xen_init_IRQ(void)
{
#ifdef CONFIG_X86_64
	int i;

	/* Create identity vector->irq map */
	for (i = 0; i < NR_VECTORS; i++) {
		int cpu;

		for_each_possible_cpu(cpu)
			per_cpu(vector_irq, cpu)[i] = i;
	}
#endif	/* CONFIG_X86_64 */

	xen_init_IRQ();
}

static const struct pv_irq_ops xen_irq_ops __initdata = {
	.init_IRQ = __xen_init_IRQ,
	.save_fl = xen_save_fl,
	.restore_fl = xen_restore_fl,
	.irq_disable = xen_irq_disable,
	.irq_enable = xen_irq_enable,
	.safe_halt = xen_safe_halt,
	.halt = xen_halt,
#ifdef CONFIG_X86_64
	.adjust_exception_frame = xen_adjust_exception_frame,
#endif
};

static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
	.apic_write = xen_apic_write,
	.apic_read = xen_apic_read,
	.setup_boot_clock = paravirt_nop,
	.setup_secondary_clock = paravirt_nop,
	.startup_ipi_hook = paravirt_nop,
#endif
};

static const struct pv_mmu_ops xen_mmu_ops __initdata = {
	.pagetable_setup_start = xen_pagetable_setup_start,
	.pagetable_setup_done = xen_pagetable_setup_done,

	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
	.write_cr3 = xen_write_cr3,

	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,

	.pgd_alloc = xen_pgd_alloc,
	.pgd_free = xen_pgd_free,

	.alloc_pte = xen_alloc_pte_init,
	.release_pte = xen_release_pte_init,
	.alloc_pmd = xen_alloc_pte_init,
	.alloc_pmd_clone = paravirt_nop,
	.release_pmd = xen_release_pte_init,

#ifdef CONFIG_HIGHPTE
	.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif

#ifdef CONFIG_X86_64
	.set_pte = xen_set_pte,
#else
	.set_pte = xen_set_pte_init,
#endif
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd_hyper,

	.ptep_modify_prot_start = __ptep_modify_prot_start,
	.ptep_modify_prot_commit = __ptep_modify_prot_commit,

	.pte_val = xen_pte_val,
	.pte_flags = native_pte_val,
	.pgd_val = xen_pgd_val,

	.make_pte = xen_make_pte,
	.make_pgd = xen_make_pgd,

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.set_pte_present = xen_set_pte_at,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,
#endif	/* CONFIG_X86_PAE */
	.set_pud = xen_set_pud_hyper,

	.make_pmd = xen_make_pmd,
	.pmd_val = xen_pmd_val,

#if PAGETABLE_LEVELS == 4
	.pud_val = xen_pud_val,
	.make_pud = xen_make_pud,
	.set_pgd = xen_set_pgd_hyper,

	.alloc_pud = xen_alloc_pte_init,
	.release_pud = xen_release_pte_init,
#endif	/* PAGETABLE_LEVELS == 4 */

	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_mmu,
		.leave = xen_leave_lazy,
	},

	.set_fixmap = xen_set_fixmap,
};

static void xen_reboot(int reason)
{
	struct sched_shutdown r = { .reason = reason };

#ifdef CONFIG_SMP
	smp_send_stop();
#endif

	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
		BUG();
}

static void xen_restart(char *msg)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_emergency_restart(void)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_machine_halt(void)
{
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_crash_shutdown(struct pt_regs *regs)
{
	xen_reboot(SHUTDOWN_crash);
}

static const struct machine_ops __initdata xen_machine_ops = {
	.restart = xen_restart,
	.halt = xen_machine_halt,
	.power_off = xen_machine_halt,
	.shutdown = xen_machine_halt,
	.crash_shutdown = xen_crash_shutdown,
	.emergency_restart = xen_emergency_restart,
};

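/*
 * Limit the kernel's use of virtual address space so that it stays
 * below the region reserved by the hypervisor (on 32-bit, Xen lives
 * at the top of the address space).
 */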
static void __init xen_reserve_top(void)
{
#ifdef CONFIG_X86_32
	unsigned long top = HYPERVISOR_VIRT_START;
	struct xen_platform_parameters pp;

	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
		top = pp.virt_start;

	reserve_top_address(-top + 2 * PAGE_SIZE);
#endif	/* CONFIG_X86_32 */
}

/*
 * Like __va(), but returns address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 */
static void *__ka(phys_addr_t paddr)
{
#ifdef CONFIG_X86_64
	return (void *)(paddr + __START_KERNEL_map);
#else
	return __va(paddr);
#endif
}

/* Convert a machine address to physical address */
static unsigned long m2p(phys_addr_t maddr)
{
	phys_addr_t paddr;

	maddr &= PTE_PFN_MASK;
	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;

	return paddr;
}

/* Convert a machine address to kernel virtual */
static void *m2v(phys_addr_t maddr)
{
	return __ka(m2p(maddr));
}

#ifdef CONFIG_X86_64
static void walk(pgd_t *pgd, unsigned long addr)
{
	unsigned l4idx = pgd_index(addr);
	unsigned l3idx = pud_index(addr);
	unsigned l2idx = pmd_index(addr);
	unsigned l1idx = pte_index(addr);
	pgd_t l4;
	pud_t l3;
	pmd_t l2;
	pte_t l1;

	xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
		       pgd, addr, l4idx, l3idx, l2idx, l1idx);

	l4 = pgd[l4idx];
	xen_raw_printk("  l4: %016lx\n", l4.pgd);
	xen_raw_printk("      %016lx\n", pgd_val(l4));

	l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
	xen_raw_printk("  l3: %016lx\n", l3.pud);
	xen_raw_printk("      %016lx\n", pud_val(l3));

	l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
	xen_raw_printk("  l2: %016lx\n", l2.pmd);
	xen_raw_printk("      %016lx\n", pmd_val(l2));

	l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
	xen_raw_printk("  l1: %016lx\n", l1.pte);
	xen_raw_printk("      %016lx\n", pte_val(l1));
}
#endif

static void set_page_prot(void *addr, pgprot_t prot)
{
	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
	pte_t pte = pfn_pte(pfn, prot);

	xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
		       addr, pfn, get_phys_to_machine(pfn),
		       pgprot_val(prot), pte.pte);

	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
		BUG();
}

static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned pmdidx, pteidx;
	unsigned ident_pte;
	unsigned long pfn;

	ident_pte = 0;
	pfn = 0;
	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
		pte_t *pte_page;

		/* Reuse or allocate a page of ptes */
		if (pmd_present(pmd[pmdidx]))
			pte_page = m2v(pmd[pmdidx].pmd);
		else {
			/* Check for free pte pages */
			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
				break;

			pte_page = &level1_ident_pgt[ident_pte];
			ident_pte += PTRS_PER_PTE;

			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
		}

		/* Install mappings */
		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
			pte_t pte;

			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;

			if (!pte_none(pte_page[pteidx]))
				continue;

			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
			pte_page[pteidx] = pte;
		}
	}

	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

	set_page_prot(pmd, PAGE_KERNEL_RO);
}

#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
	pte_t *pte = v;
	int i;

	/* All levels are converted the same way, so just treat them
	   as ptes. */
	for (i = 0; i < PTRS_PER_PTE; i++)
		pte[i] = xen_make_pte(pte[i].pte);
}

/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
 * means that only the kernel has a physical mapping to start with -
 * but that's enough to get __va working.  We need to fill in the rest
 * of the physical mapping once some sort of allocator has been set
 * up.
 */
static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
	pud_t *l3;
	pmd_t *l2;

	/* Zap identity mapping */
	init_level4_pgt[0] = __pgd(0);

	/* Pre-constructed entries are in pfn, so convert to mfn */
	convert_pfn_mfn(init_level4_pgt);
	convert_pfn_mfn(level3_ident_pgt);
	convert_pfn_mfn(level3_kernel_pgt);

	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	/* Set up identity map */
	xen_map_identity_early(level2_ident_pgt, max_pfn);

	/* Make pagetable pieces RO */
	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);

	/* Pin down new L4 */
	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
			  PFN_DOWN(__pa_symbol(init_level4_pgt)));

	/* Unpin Xen-provided one */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	/* Switch over */
	pgd = init_level4_pgt;

	/*
	 * At this stage there can be no user pgd, and no page
	 * structure to attach it to, so make sure we just set kernel
	 * pgd.
	 */
	xen_mc_batch();
	__xen_write_cr3(true, __pa(pgd));
	xen_mc_issue(PARAVIRT_LAZY_CPU);

	reserve_early(__pa(xen_start_info->pt_base),
		      __pa(xen_start_info->pt_base +
			   xen_start_info->nr_pt_frames * PAGE_SIZE),
		      "XEN PAGETABLES");

	return pgd;
}
#else	/* !CONFIG_X86_64 */
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;

static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
	pmd_t *kernel_pmd;

	init_pg_tables_start = __pa(pgd);
	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);

	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);

	xen_map_identity_early(level2_kernel_pgt, max_pfn);

	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
		__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));

	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	xen_write_cr3(__pa(swapper_pg_dir));

	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));

	return swapper_pg_dir;
}
#endif	/* CONFIG_X86_64 */

/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
	pgd_t *pgd;

	if (!xen_start_info)
		return;

	BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);

	xen_setup_features();

	/* Install Xen paravirt ops */
	pv_info = xen_info;
	pv_init_ops = xen_init_ops;
	pv_time_ops = xen_time_ops;
	pv_cpu_ops = xen_cpu_ops;
	pv_irq_ops = xen_irq_ops;
	pv_apic_ops = xen_apic_ops;
	pv_mmu_ops = xen_mmu_ops;

	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
	}

	machine_ops = xen_machine_ops;

#ifdef CONFIG_X86_64
	/* Disable until direct per-cpu data access. */
	have_vcpu_info_placement = 0;
	x86_64_init_pda();
#endif

	xen_smp_init();

	/* Get mfn list */
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_build_dynamic_phys_to_machine();

	pgd = (pgd_t *)xen_start_info->pt_base;

	/* Prevent unwanted bits from being set in PTEs. */
	__supported_pte_mask &= ~_PAGE_GLOBAL;
	if (!is_initial_xendomain())
		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);

	/* Don't do the full vcpu_info placement stuff until we have a
	   possible map and a non-dummy shared_info. */
	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];

	xen_raw_console_write("mapping kernel into physical memory\n");
	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);

	init_mm.pgd = pgd;

	/* keep using Xen gdt for now; no urgent need to change it */

	pv_info.kernel_rpl = 1;
	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		pv_info.kernel_rpl = 0;

	/* set the limit of our address space */
	xen_reserve_top();

#ifdef CONFIG_X86_32
	/* set up basic CPUID stuff */
	cpu_detect(&new_cpu_data);
	new_cpu_data.hard_math = 1;
	new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif

	/* Poke various useful things into boot_params */
	boot_params.hdr.type_of_loader = (9 << 4) | 0;
	boot_params.hdr.ramdisk_image = xen_start_info->mod_start
		? __pa(xen_start_info->mod_start) : 0;
	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);

	if (!is_initial_xendomain()) {
		add_preferred_console("xenboot", 0, NULL);
		add_preferred_console("tty", 0, NULL);
		add_preferred_console("hvc", 0, NULL);
	}

	xen_raw_console_write("about to get started...\n");

#if 0
	xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
		       &boot_params, __pa_symbol(&boot_params),
		       __va(__pa_symbol(&boot_params)));

	walk(pgd, &boot_params);
	walk(pgd, __va(__pa(&boot_params)));
#endif

	/* Start the world */
#ifdef CONFIG_X86_32
	i386_start_kernel();
#else
	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}