/*
 * Core of Xen paravirt_ops implementation.
 *
 * This file contains the xen_paravirt_ops structure itself, and the
 * implementations for:
 * - privileged instructions
 * - interrupt flags
 * - segment operations
 * - booting and setup
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>

#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/sched.h>
#include <xen/features.h>
#include <xen/page.h>

#include <asm/paravirt.h>
#include <asm/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>

#include "xen-ops.h"
#include "mmu.h"
#include "multicalls.h"

EXPORT_SYMBOL_GPL(hypercall_page);

DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);         /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */

struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

static /* __initdata */ struct shared_info dummy_shared_info;

/*
 * Point at some empty memory to start with.  We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs.  We assume it is to start with, and then set it to zero on
 * the first failure.  This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (ie buffer can't cross a page boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;

static void __init xen_vcpu_setup(int cpu)
{
        struct vcpu_register_vcpu_info info;
        int err;
        struct vcpu_info *vcpup;

        BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info);
        per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];

        if (!have_vcpu_info_placement)
                return;         /* already tested, not available */

        vcpup = &per_cpu(xen_vcpu_info, cpu);

        info.mfn = virt_to_mfn(vcpup);
        info.offset = offset_in_page(vcpup);

        printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
               cpu, vcpup, info.mfn, info.offset);

        /* Check to see if the hypervisor will put the vcpu_info
           structure where we want it, which allows direct access via
           a percpu-variable. */
        err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);

        if (err) {
                printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
                have_vcpu_info_placement = 0;
        } else {
                /* This cpu is using the registered vcpu info, even if
                   later ones fail to. */
                per_cpu(xen_vcpu, cpu) = vcpup;

                printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
                       cpu, vcpup);
        }
}

static void __init xen_banner(void)
{
        printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
               pv_info.name);
        printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
}

static void xen_cpuid(unsigned int *ax, unsigned int *bx,
                      unsigned int *cx, unsigned int *dx)
{
        unsigned maskedx = ~0;

        /*
         * Mask out inconvenient features, to try and disable as many
         * unsupported kernel subsystems as possible.
         */
        if (*ax == 1)
                maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
                            (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
                            (1 << X86_FEATURE_SEP)  |  /* disable SEP */
                            (1 << X86_FEATURE_ACC));   /* thermal monitoring */

        asm(XEN_EMULATE_PREFIX "cpuid"
                : "=a" (*ax),
                  "=b" (*bx),
                  "=c" (*cx),
                  "=d" (*dx)
                : "0" (*ax), "2" (*cx));
        *dx &= maskedx;
}

static void xen_set_debugreg(int reg, unsigned long val)
{
        HYPERVISOR_set_debugreg(reg, val);
}

static unsigned long xen_get_debugreg(int reg)
{
        return HYPERVISOR_get_debugreg(reg);
}
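
/*
 * Interrupt flag handling.  A paravirtualized guest never touches the
 * real EFLAGS.IF; "interrupts" are Xen event channel upcalls, masked
 * per-vcpu via evtchn_upcall_mask in the shared vcpu_info structure.
 * The helpers below translate between the kernel's X86_EFLAGS_IF
 * convention and that mask (note the inverted sense: a set mask means
 * events are disabled).
 */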

static unsigned long xen_save_fl(void)
{
        struct vcpu_info *vcpu;
        unsigned long flags;

        vcpu = x86_read_percpu(xen_vcpu);

        /* flag has opposite sense of mask */
        flags = !vcpu->evtchn_upcall_mask;

        /* convert to IF type flag
           -0 -> 0x00000000
           -1 -> 0xffffffff
        */
        return (-flags) & X86_EFLAGS_IF;
}

static void xen_restore_fl(unsigned long flags)
{
        struct vcpu_info *vcpu;

        /* convert from IF type flag */
        flags = !(flags & X86_EFLAGS_IF);

        /* There's a one instruction preempt window here.  We need to
           make sure we don't switch CPUs between getting the vcpu
           pointer and updating the mask. */
        preempt_disable();
        vcpu = x86_read_percpu(xen_vcpu);
        vcpu->evtchn_upcall_mask = flags;
        preempt_enable_no_resched();

        /* Doesn't matter if we get preempted here, because any
           pending event will get dealt with anyway. */

        if (flags == 0) {
                preempt_check_resched();
                barrier(); /* unmask then check (avoid races) */
                if (unlikely(vcpu->evtchn_upcall_pending))
                        force_evtchn_callback();
        }
}

static void xen_irq_disable(void)
{
        /* There's a one instruction preempt window here.  We need to
           make sure we don't switch CPUs between getting the vcpu
           pointer and updating the mask. */
        preempt_disable();
        x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
        preempt_enable_no_resched();
}

static void xen_irq_enable(void)
{
        struct vcpu_info *vcpu;

        /* There's a one instruction preempt window here.  We need to
           make sure we don't switch CPUs between getting the vcpu
           pointer and updating the mask. */
        preempt_disable();
        vcpu = x86_read_percpu(xen_vcpu);
        vcpu->evtchn_upcall_mask = 0;
        preempt_enable_no_resched();

        /* Doesn't matter if we get preempted here, because any
           pending event will get dealt with anyway. */

        barrier(); /* unmask then check (avoid races) */
        if (unlikely(vcpu->evtchn_upcall_pending))
                force_evtchn_callback();
}

static void xen_safe_halt(void)
{
        /* Blocking includes an implicit local_irq_enable(). */
        if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
                BUG();
}

static void xen_halt(void)
{
        if (irqs_disabled())
                HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
        else
                xen_safe_halt();
}

static void xen_leave_lazy(void)
{
        paravirt_leave_lazy(paravirt_get_lazy_mode());
        xen_mc_flush();
}

static unsigned long xen_store_tr(void)
{
        return 0;
}

static void xen_set_ldt(const void *addr, unsigned entries)
{
        struct mmuext_op *op;
        struct multicall_space mcs = xen_mc_entry(sizeof(*op));

        op = mcs.args;
        op->cmd = MMUEXT_SET_LDT;
        op->arg1.linear_addr = (unsigned long)addr;
        op->arg2.nr_ents = entries;

        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_CPU);
}
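
/*
 * Loading a new GDT under Xen: rather than executing lgdt itself, the
 * guest hands the hypervisor the list of machine frames backing the
 * table, and each backing page must first be made read-only so the
 * hypervisor can trust its contents.  xen_load_gdt below batches this
 * through the multicall interface.
 */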

static void xen_load_gdt(const struct desc_ptr *dtr)
{
        unsigned long *frames;
        unsigned long va = dtr->address;
        unsigned int size = dtr->size + 1;
        unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
        int f;
        struct multicall_space mcs;

        /* A GDT can be up to 64k in size, which corresponds to 8192
           8-byte entries, or 16 4k pages. */

        BUG_ON(size > 65536);
        BUG_ON(va & ~PAGE_MASK);

        mcs = xen_mc_entry(sizeof(*frames) * pages);
        frames = mcs.args;

        for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
                frames[f] = virt_to_mfn(va);
                make_lowmem_page_readonly((void *)va);
        }

        MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));

        xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void load_TLS_descriptor(struct thread_struct *t,
                                unsigned int cpu, unsigned int i)
{
        struct desc_struct *gdt = get_cpu_gdt_table(cpu);
        xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
        struct multicall_space mc = __xen_mc_entry(0);

        MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}

static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
        xen_mc_batch();

        load_TLS_descriptor(t, cpu, 0);
        load_TLS_descriptor(t, cpu, 1);
        load_TLS_descriptor(t, cpu, 2);

        xen_mc_issue(PARAVIRT_LAZY_CPU);

        /*
         * XXX sleazy hack: If we're being called in a lazy-cpu zone,
         * it means we're in a context switch, and %gs has just been
         * saved.  This means we can zero it out to prevent faults on
         * exit from the hypervisor if the next process has no %gs.
         * Either way, it has been saved, and the new value will get
         * loaded properly.  This will go away as soon as Xen has been
         * modified to not save/restore %gs for normal hypercalls.
         */
        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
                loadsegment(gs, 0);
}

static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
                                const void *ptr)
{
        unsigned long lp = (unsigned long)&dt[entrynum];
        xmaddr_t mach_lp = virt_to_machine(lp);
        u64 entry = *(u64 *)ptr;

        preempt_disable();

        xen_mc_flush();
        if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
                BUG();

        preempt_enable();
}

static int cvt_gate_to_trap(int vector, u32 low, u32 high,
                            struct trap_info *info)
{
        u8 type, dpl;

        type = (high >> 8) & 0x1f;
        dpl = (high >> 13) & 3;

        if (type != 0xf && type != 0xe)
                return 0;

        info->vector = vector;
        info->address = (high & 0xffff0000) | (low & 0x0000ffff);
        info->cs = low >> 16;
        info->flags = dpl;
        /* interrupt gates clear IF */
        if (type == 0xe)
                info->flags |= 4;

        return 1;
}

/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
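
/*
 * Xen does not give the guest a real IDT; exception and interrupt
 * vectors are registered with HYPERVISOR_set_trap_table as an array of
 * struct trap_info instead.  cvt_gate_to_trap above converts a native
 * gate descriptor (trap gate 0xf or interrupt gate 0xe) into that
 * format, and the per-cpu idt_desc records which IDT is currently
 * loaded so writes to it can be propagated to Xen.
 */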

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
        unsigned long p = (unsigned long)&dt[entrynum];
        unsigned long start, end;

        preempt_disable();

        start = __get_cpu_var(idt_desc).address;
        end = start + __get_cpu_var(idt_desc).size + 1;

        xen_mc_flush();

        native_write_idt_entry(dt, entrynum, g);

        if (p >= start && (p + 8) <= end) {
                struct trap_info info[2];
                u32 *desc = (u32 *)g;

                info[1].address = 0;

                if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
                        if (HYPERVISOR_set_trap_table(info))
                                BUG();
        }

        preempt_enable();
}

static void xen_convert_trap_info(const struct desc_ptr *desc,
                                  struct trap_info *traps)
{
        unsigned in, out, count;

        count = (desc->size+1) / 8;
        BUG_ON(count > 256);

        for (in = out = 0; in < count; in++) {
                const u32 *entry = (u32 *)(desc->address + in * 8);

                if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
                        out++;
        }
        traps[out].address = 0;
}

void xen_copy_trap_info(struct trap_info *traps)
{
        const struct desc_ptr *desc = &__get_cpu_var(idt_desc);

        xen_convert_trap_info(desc, traps);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
        static DEFINE_SPINLOCK(lock);
        static struct trap_info traps[257];

        spin_lock(&lock);

        __get_cpu_var(idt_desc) = *desc;

        xen_convert_trap_info(desc, traps);

        xen_mc_flush();
        if (HYPERVISOR_set_trap_table(traps))
                BUG();

        spin_unlock(&lock);
}

/* Write a GDT descriptor entry.  Ignore LDT and TSS descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
                                const void *desc, int type)
{
        preempt_disable();

        switch (type) {
        case DESC_LDT:
        case DESC_TSS:
                /* ignore */
                break;

        default: {
                xmaddr_t maddr = virt_to_machine(&dt[entry]);

                xen_mc_flush();
                if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
                        BUG();
        }

        }

        preempt_enable();
}

static void xen_load_sp0(struct tss_struct *tss,
                         struct thread_struct *thread)
{
        struct multicall_space mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
}
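
/*
 * The I/O privilege level lives in bits 12-13 of EFLAGS, which is what
 * (mask >> 12) & 3 below extracts; a PV guest cannot change EFLAGS.IOPL
 * directly, so the request is forwarded to the hypervisor with
 * PHYSDEVOP_set_iopl.
 */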

static void xen_set_iopl_mask(unsigned mask)
{
        struct physdev_set_iopl set_iopl;

        /* Force the change at ring 0. */
        set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
        HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

static void xen_io_delay(void)
{
}

#ifdef CONFIG_X86_LOCAL_APIC
static u32 xen_apic_read(unsigned long reg)
{
        return 0;
}

static void xen_apic_write(unsigned long reg, u32 val)
{
        /* Warn to see if there's any stray references */
        WARN_ON(1);
}
#endif

static void xen_flush_tlb(void)
{
        struct mmuext_op *op;
        struct multicall_space mcs = xen_mc_entry(sizeof(*op));

        op = mcs.args;
        op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_MMU);
}

static void xen_flush_tlb_single(unsigned long addr)
{
        struct mmuext_op *op;
        struct multicall_space mcs = xen_mc_entry(sizeof(*op));

        op = mcs.args;
        op->cmd = MMUEXT_INVLPG_LOCAL;
        op->arg1.linear_addr = addr & PAGE_MASK;
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_MMU);
}

static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
                                 unsigned long va)
{
        struct {
                struct mmuext_op op;
                cpumask_t mask;
        } *args;
        cpumask_t cpumask = *cpus;
        struct multicall_space mcs;

        /*
         * A couple of (to be removed) sanity checks:
         *
         * - current CPU must not be in mask
         * - mask must exist :)
         */
        BUG_ON(cpus_empty(cpumask));
        BUG_ON(cpu_isset(smp_processor_id(), cpumask));
        BUG_ON(!mm);

        /* If a CPU which we ran on has gone down, OK. */
        cpus_and(cpumask, cpumask, cpu_online_map);
        if (cpus_empty(cpumask))
                return;

        mcs = xen_mc_entry(sizeof(*args));
        args = mcs.args;
        args->mask = cpumask;
        args->op.arg2.vcpumask = &args->mask;

        if (va == TLB_FLUSH_ALL) {
                args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
        } else {
                args->op.cmd = MMUEXT_INVLPG_MULTI;
                args->op.arg1.linear_addr = va;
        }

        MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_MMU);
}

static void xen_write_cr2(unsigned long cr2)
{
        x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
        return x86_read_percpu(xen_vcpu)->arch.cr2;
}

static unsigned long xen_read_cr2_direct(void)
{
        return x86_read_percpu(xen_vcpu_info.arch.cr2);
}

static void xen_write_cr4(unsigned long cr4)
{
        /* Just ignore cr4 changes; Xen doesn't allow us to do
           anything anyway. */
}

static unsigned long xen_read_cr3(void)
{
        return x86_read_percpu(xen_cr3);
}

static void set_current_cr3(void *v)
{
        x86_write_percpu(xen_current_cr3, (unsigned long)v);
}

static void xen_write_cr3(unsigned long cr3)
{
        struct mmuext_op *op;
        struct multicall_space mcs;
        unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));

        BUG_ON(preemptible());

        mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */

        /* Update while interrupts are disabled, so it's atomic with
           respect to IPIs. */
        x86_write_percpu(xen_cr3, cr3);

        op = mcs.args;
        op->cmd = MMUEXT_NEW_BASEPTR;
        op->arg1.mfn = mfn;

        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

        /* Update xen_current_cr3 once the batch has actually
           been submitted. */
        xen_mc_callback(set_current_cr3, (void *)cr3);

        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
}
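
/*
 * Pagetable management: Xen requires that pages used as pagetables be
 * mapped read-only in the guest, and pagetables that are in active use
 * are "pinned" (validated) by the hypervisor.  The helpers below handle
 * the read-only remapping and issue MMUEXT pin/unpin operations as
 * pagetable pages are allocated and released.
 */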

/* Early in boot, while setting up the initial pagetable, assume
   everything is pinned. */
static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
{
        BUG_ON(mem_map);        /* should only be used early */
        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}

/* Early release_pt assumes that all pts are pinned, since there's
   only init_mm and anything attached to that is pinned. */
static void xen_release_pt_init(u32 pfn)
{
        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
        struct mmuext_op op;
        op.cmd = cmd;
        op.arg1.mfn = pfn_to_mfn(pfn);
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
                BUG();
}

/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
{
        struct page *page = pfn_to_page(pfn);

        if (PagePinned(virt_to_page(mm->pgd))) {
                SetPagePinned(page);

                if (!PageHighMem(page)) {
                        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
                        if (level == PT_PTE)
                                pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
                } else
                        /* make sure there are no stray mappings of
                           this page */
                        kmap_flush_unused();
        }
}

static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
{
        xen_alloc_ptpage(mm, pfn, PT_PTE);
}

static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
{
        xen_alloc_ptpage(mm, pfn, PT_PMD);
}

/* This should never happen until we're OK to use struct page */
static void xen_release_ptpage(u32 pfn, unsigned level)
{
        struct page *page = pfn_to_page(pfn);

        if (PagePinned(page)) {
                if (!PageHighMem(page)) {
                        if (level == PT_PTE)
                                pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
                        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
                }
                ClearPagePinned(page);
        }
}

static void xen_release_pt(u32 pfn)
{
        xen_release_ptpage(pfn, PT_PTE);
}

static void xen_release_pd(u32 pfn)
{
        xen_release_ptpage(pfn, PT_PMD);
}
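
/*
 * With CONFIG_HIGHPTE, pte pages can live in highmem and are reached
 * through kmap_atomic.  A pinned pte page must never be mapped
 * writable, even temporarily, so the kmap is done with a read-only
 * protection in that case.
 */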
"WRITE" : "READ"); 746 747 return kmap_atomic_prot(page, type, prot); 748 } 749 #endif 750 751 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 752 { 753 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 754 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 755 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 756 pte_val_ma(pte)); 757 758 return pte; 759 } 760 761 /* Init-time set_pte while constructing initial pagetables, which 762 doesn't allow RO pagetable pages to be remapped RW */ 763 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) 764 { 765 pte = mask_rw_pte(ptep, pte); 766 767 xen_set_pte(ptep, pte); 768 } 769 770 static __init void xen_pagetable_setup_start(pgd_t *base) 771 { 772 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 773 774 /* special set_pte for pagetable initialization */ 775 pv_mmu_ops.set_pte = xen_set_pte_init; 776 777 init_mm.pgd = base; 778 /* 779 * copy top-level of Xen-supplied pagetable into place. For 780 * !PAE we can use this as-is, but for PAE it is a stand-in 781 * while we copy the pmd pages. 782 */ 783 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); 784 785 if (PTRS_PER_PMD > 1) { 786 int i; 787 /* 788 * For PAE, need to allocate new pmds, rather than 789 * share Xen's, since Xen doesn't like pmd's being 790 * shared between address spaces. 791 */ 792 for (i = 0; i < PTRS_PER_PGD; i++) { 793 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { 794 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 795 796 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), 797 PAGE_SIZE); 798 799 make_lowmem_page_readonly(pmd); 800 801 set_pgd(&base[i], __pgd(1 + __pa(pmd))); 802 } else 803 pgd_clear(&base[i]); 804 } 805 } 806 807 /* make sure zero_page is mapped RO so we can use it in pagetables */ 808 make_lowmem_page_readonly(empty_zero_page); 809 make_lowmem_page_readonly(base); 810 /* 811 * Switch to new pagetable. This is done before 812 * pagetable_init has done anything so that the new pages 813 * added to the table can be prepared properly for Xen. 814 */ 815 xen_write_cr3(__pa(base)); 816 817 /* Unpin initial Xen pagetable */ 818 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, 819 PFN_DOWN(__pa(xen_start_info->pt_base))); 820 } 821 822 static __init void setup_shared_info(void) 823 { 824 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 825 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 826 827 /* 828 * Create a mapping for the shared info page. 829 * Should be set_fixmap(), but shared_info is a machine 830 * address with no corresponding pseudo-phys address. 831 */ 832 set_pte_mfn(addr, 833 PFN_DOWN(xen_start_info->shared_info), 834 PAGE_KERNEL); 835 836 HYPERVISOR_shared_info = (struct shared_info *)addr; 837 } else 838 HYPERVISOR_shared_info = 839 (struct shared_info *)__va(xen_start_info->shared_info); 840 841 #ifndef CONFIG_SMP 842 /* In UP this is as good a place as any to set up shared info */ 843 xen_setup_vcpu_info_placement(); 844 #endif 845 } 846 847 static __init void xen_pagetable_setup_done(pgd_t *base) 848 { 849 /* This will work as long as patching hasn't happened yet 850 (which it hasn't) */ 851 pv_mmu_ops.alloc_pt = xen_alloc_pt; 852 pv_mmu_ops.alloc_pd = xen_alloc_pd; 853 pv_mmu_ops.release_pt = xen_release_pt; 854 pv_mmu_ops.release_pd = xen_release_pd; 855 pv_mmu_ops.set_pte = xen_set_pte; 856 857 setup_shared_info(); 858 859 /* Actually pin the pagetable down, but we can't set PG_pinned 860 yet because the page structures don't exist yet. 

static __init void xen_pagetable_setup_done(pgd_t *base)
{
        /* This will work as long as patching hasn't happened yet
           (which it hasn't) */
        pv_mmu_ops.alloc_pt = xen_alloc_pt;
        pv_mmu_ops.alloc_pd = xen_alloc_pd;
        pv_mmu_ops.release_pt = xen_release_pt;
        pv_mmu_ops.release_pd = xen_release_pd;
        pv_mmu_ops.set_pte = xen_set_pte;

        setup_shared_info();

        /* Actually pin the pagetable down, but we can't set PG_pinned
           yet because the page structures don't exist yet. */
        {
                unsigned level;

#ifdef CONFIG_X86_PAE
                level = MMUEXT_PIN_L3_TABLE;
#else
                level = MMUEXT_PIN_L2_TABLE;
#endif

                pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
        }
}

/* This is called once we have the cpu_possible_map */
void __init xen_setup_vcpu_info_placement(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                xen_vcpu_setup(cpu);

        /* xen_vcpu_setup managed to place the vcpu_info within the
           percpu area for all cpus, so make use of it */
        if (have_vcpu_info_placement) {
                printk(KERN_INFO "Xen: using vcpu_info placement\n");

                pv_irq_ops.save_fl = xen_save_fl_direct;
                pv_irq_ops.restore_fl = xen_restore_fl_direct;
                pv_irq_ops.irq_disable = xen_irq_disable_direct;
                pv_irq_ops.irq_enable = xen_irq_enable_direct;
                pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
                pv_cpu_ops.iret = xen_iret_direct;
        }
}
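
/*
 * When vcpu_info placement is in use, the small xen_*_direct assembler
 * stubs (which poke the percpu vcpu_info directly) are short enough to
 * be patched inline at the paravirt call sites.  xen_patch copies the
 * stub into the instruction buffer and, if the stub contains a
 * relocation, adjusts it for its new location; everything else falls
 * back to the default patcher.
 */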

static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
                          unsigned long addr, unsigned len)
{
        char *start, *end, *reloc;
        unsigned ret;

        start = end = reloc = NULL;

#define SITE(op, x)                                                     \
        case PARAVIRT_PATCH(op.x):                                      \
        if (have_vcpu_info_placement) {                                 \
                start = (char *)xen_##x##_direct;                       \
                end = xen_##x##_direct_end;                             \
                reloc = xen_##x##_direct_reloc;                         \
        }                                                               \
        goto patch_site

        switch (type) {
                SITE(pv_irq_ops, irq_enable);
                SITE(pv_irq_ops, irq_disable);
                SITE(pv_irq_ops, save_fl);
                SITE(pv_irq_ops, restore_fl);
#undef SITE

        patch_site:
                if (start == NULL || (end-start) > len)
                        goto default_patch;

                ret = paravirt_patch_insns(insnbuf, len, start, end);

                /* Note: because reloc is assigned from something that
                   appears to be an array, gcc assumes it's non-null,
                   but doesn't know its relationship with start and
                   end. */
                if (reloc > start && reloc < end) {
                        int reloc_off = reloc - start;
                        long *relocp = (long *)(insnbuf + reloc_off);
                        long delta = start - (char *)addr;

                        *relocp += delta;
                }
                break;

        default_patch:
        default:
                ret = paravirt_patch_default(type, clobbers, insnbuf,
                                             addr, len);
                break;
        }

        return ret;
}

static const struct pv_info xen_info __initdata = {
        .paravirt_enabled = 1,
        .shared_kernel_pmd = 0,

        .name = "Xen",
};

static const struct pv_init_ops xen_init_ops __initdata = {
        .patch = xen_patch,

        .banner = xen_banner,
        .memory_setup = xen_memory_setup,
        .arch_setup = xen_arch_setup,
        .post_allocator_init = xen_mark_init_mm_pinned,
};

static const struct pv_time_ops xen_time_ops __initdata = {
        .time_init = xen_time_init,

        .set_wallclock = xen_set_wallclock,
        .get_wallclock = xen_get_wallclock,
        .get_cpu_khz = xen_cpu_khz,
        .sched_clock = xen_sched_clock,
};

static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .cpuid = xen_cpuid,

        .set_debugreg = xen_set_debugreg,
        .get_debugreg = xen_get_debugreg,

        .clts = native_clts,

        .read_cr0 = native_read_cr0,
        .write_cr0 = native_write_cr0,

        .read_cr4 = native_read_cr4,
        .read_cr4_safe = native_read_cr4_safe,
        .write_cr4 = xen_write_cr4,

        .wbinvd = native_wbinvd,

        .read_msr = native_read_msr_safe,
        .write_msr = native_write_msr_safe,
        .read_tsc = native_read_tsc,
        .read_pmc = native_read_pmc,

        .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
        .irq_enable_syscall_ret = NULL,  /* never called */

        .load_tr_desc = paravirt_nop,
        .set_ldt = xen_set_ldt,
        .load_gdt = xen_load_gdt,
        .load_idt = xen_load_idt,
        .load_tls = xen_load_tls,

        .store_gdt = native_store_gdt,
        .store_idt = native_store_idt,
        .store_tr = xen_store_tr,

        .write_ldt_entry = xen_write_ldt_entry,
        .write_gdt_entry = xen_write_gdt_entry,
        .write_idt_entry = xen_write_idt_entry,
        .load_sp0 = xen_load_sp0,

        .set_iopl_mask = xen_set_iopl_mask,
        .io_delay = xen_io_delay,

        .lazy_mode = {
                .enter = paravirt_enter_lazy_cpu,
                .leave = xen_leave_lazy,
        },
};

static const struct pv_irq_ops xen_irq_ops __initdata = {
        .init_IRQ = xen_init_IRQ,
        .save_fl = xen_save_fl,
        .restore_fl = xen_restore_fl,
        .irq_disable = xen_irq_disable,
        .irq_enable = xen_irq_enable,
        .safe_halt = xen_safe_halt,
        .halt = xen_halt,
};
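
/*
 * A PV guest has no local APIC to program; interrupts arrive as event
 * channels instead.  APIC reads therefore return 0 and writes warn, so
 * any stray users show up in the log.
 */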

static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
        .apic_write = xen_apic_write,
        .apic_write_atomic = xen_apic_write,
        .apic_read = xen_apic_read,
        .setup_boot_clock = paravirt_nop,
        .setup_secondary_clock = paravirt_nop,
        .startup_ipi_hook = paravirt_nop,
#endif
};

static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .pagetable_setup_start = xen_pagetable_setup_start,
        .pagetable_setup_done = xen_pagetable_setup_done,

        .read_cr2 = xen_read_cr2,
        .write_cr2 = xen_write_cr2,

        .read_cr3 = xen_read_cr3,
        .write_cr3 = xen_write_cr3,

        .flush_tlb_user = xen_flush_tlb,
        .flush_tlb_kernel = xen_flush_tlb,
        .flush_tlb_single = xen_flush_tlb_single,
        .flush_tlb_others = xen_flush_tlb_others,

        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,

        .alloc_pt = xen_alloc_pt_init,
        .release_pt = xen_release_pt_init,
        .alloc_pd = xen_alloc_pt_init,
        .alloc_pd_clone = paravirt_nop,
        .release_pd = xen_release_pt_init,

#ifdef CONFIG_HIGHPTE
        .kmap_atomic_pte = xen_kmap_atomic_pte,
#endif

        .set_pte = NULL,        /* see xen_pagetable_setup_* */
        .set_pte_at = xen_set_pte_at,
        .set_pmd = xen_set_pmd,

        .pte_val = xen_pte_val,
        .pgd_val = xen_pgd_val,

        .make_pte = xen_make_pte,
        .make_pgd = xen_make_pgd,

#ifdef CONFIG_X86_PAE
        .set_pte_atomic = xen_set_pte_atomic,
        .set_pte_present = xen_set_pte_at,
        .set_pud = xen_set_pud,
        .pte_clear = xen_pte_clear,
        .pmd_clear = xen_pmd_clear,

        .make_pmd = xen_make_pmd,
        .pmd_val = xen_pmd_val,
#endif  /* PAE */

        .activate_mm = xen_activate_mm,
        .dup_mmap = xen_dup_mmap,
        .exit_mmap = xen_exit_mmap,

        .lazy_mode = {
                .enter = paravirt_enter_lazy_mmu,
                .leave = xen_leave_lazy,
        },
};

#ifdef CONFIG_SMP
static const struct smp_ops xen_smp_ops __initdata = {
        .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
        .smp_prepare_cpus = xen_smp_prepare_cpus,
        .cpu_up = xen_cpu_up,
        .smp_cpus_done = xen_smp_cpus_done,

        .smp_send_stop = xen_smp_send_stop,
        .smp_send_reschedule = xen_smp_send_reschedule,
        .smp_call_function_mask = xen_smp_call_function_mask,
};
#endif  /* CONFIG_SMP */

static void xen_reboot(int reason)
{
#ifdef CONFIG_SMP
        smp_send_stop();
#endif

        if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
                BUG();
}

static void xen_restart(char *msg)
{
        xen_reboot(SHUTDOWN_reboot);
}

static void xen_emergency_restart(void)
{
        xen_reboot(SHUTDOWN_reboot);
}

static void xen_machine_halt(void)
{
        xen_reboot(SHUTDOWN_poweroff);
}

static void xen_crash_shutdown(struct pt_regs *regs)
{
        xen_reboot(SHUTDOWN_crash);
}

static const struct machine_ops __initdata xen_machine_ops = {
        .restart = xen_restart,
        .halt = xen_machine_halt,
        .power_off = xen_machine_halt,
        .shutdown = xen_machine_halt,
        .crash_shutdown = xen_crash_shutdown,
        .emergency_restart = xen_emergency_restart,
};

static void __init xen_reserve_top(void)
{
        unsigned long top = HYPERVISOR_VIRT_START;
        struct xen_platform_parameters pp;

        if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
                top = pp.virt_start;

        reserve_top_address(-top + 2 * PAGE_SIZE);
}

/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
        pgd_t *pgd;

        if (!xen_start_info)
                return;

        BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);

        /* Install Xen paravirt ops */
        pv_info = xen_info;
        pv_init_ops = xen_init_ops;
        pv_time_ops = xen_time_ops;
        pv_cpu_ops = xen_cpu_ops;
        pv_irq_ops = xen_irq_ops;
        pv_apic_ops = xen_apic_ops;
        pv_mmu_ops = xen_mmu_ops;

        machine_ops = xen_machine_ops;

#ifdef CONFIG_SMP
        smp_ops = xen_smp_ops;
#endif

        xen_setup_features();

        /* Get mfn list */
        if (!xen_feature(XENFEAT_auto_translated_physmap))
                phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;

        pgd = (pgd_t *)xen_start_info->pt_base;

        init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;

        init_mm.pgd = pgd;      /* use the Xen pagetables to start */

        /* keep using Xen gdt for now; no urgent need to change it */

        x86_write_percpu(xen_cr3, __pa(pgd));
        x86_write_percpu(xen_current_cr3, __pa(pgd));

        /* Don't do the full vcpu_info placement stuff until we have a
           possible map and a non-dummy shared_info. */
        per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];

        pv_info.kernel_rpl = 1;
        if (xen_feature(XENFEAT_supervisor_mode_kernel))
                pv_info.kernel_rpl = 0;

        /* set the limit of our address space */
        xen_reserve_top();

        /* set up basic CPUID stuff */
        cpu_detect(&new_cpu_data);
        new_cpu_data.hard_math = 1;
        new_cpu_data.x86_capability[0] = cpuid_edx(1);

        /* Poke various useful things into boot_params */
        boot_params.hdr.type_of_loader = (9 << 4) | 0;
        boot_params.hdr.ramdisk_image = xen_start_info->mod_start
                ? __pa(xen_start_info->mod_start) : 0;
        boot_params.hdr.ramdisk_size = xen_start_info->mod_len;

        /* Start the world */
        start_kernel();
}