1 /* 2 * Xen mmu operations 3 * 4 * This file contains the various mmu fetch and update operations. 5 * The most important job they must perform is the mapping between the 6 * domain's pfn and the overall machine mfns. 7 * 8 * Xen allows guests to directly update the pagetable, in a controlled 9 * fashion. In other words, the guest modifies the same pagetable 10 * that the CPU actually uses, which eliminates the overhead of having 11 * a separate shadow pagetable. 12 * 13 * In order to allow this, it falls on the guest domain to map its 14 * notion of a "physical" pfn - which is just a domain-local linear 15 * address - into a real "machine address" which the CPU's MMU can 16 * use. 17 * 18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be 19 * inserted directly into the pagetable. When creating a new 20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, 21 * when reading the content back with __(pgd|pmd|pte)_val, it converts 22 * the mfn back into a pfn. 23 * 24 * The other constraint is that all pages which make up a pagetable 25 * must be mapped read-only in the guest. This prevents uncontrolled 26 * guest updates to the pagetable. Xen strictly enforces this, and 27 * will disallow any pagetable update which will end up mapping a 28 * pagetable page RW, and will disallow using any writable page as a 29 * pagetable. 30 * 31 * Naively, when loading %cr3 with the base of a new pagetable, Xen 32 * would need to validate the whole pagetable before going on. 33 * Naturally, this is quite slow. The solution is to "pin" a 34 * pagetable, which enforces all the constraints on the pagetable even 35 * when it is not actively in use. This menas that Xen can be assured 36 * that it is still valid when you do load it into %cr3, and doesn't 37 * need to revalidate it. 38 * 39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 40 */ 41 #include <linux/sched/mm.h> 42 #include <linux/highmem.h> 43 #include <linux/debugfs.h> 44 #include <linux/bug.h> 45 #include <linux/vmalloc.h> 46 #include <linux/export.h> 47 #include <linux/init.h> 48 #include <linux/gfp.h> 49 #include <linux/memblock.h> 50 #include <linux/seq_file.h> 51 #include <linux/crash_dump.h> 52 #ifdef CONFIG_KEXEC_CORE 53 #include <linux/kexec.h> 54 #endif 55 56 #include <trace/events/xen.h> 57 58 #include <asm/pgtable.h> 59 #include <asm/tlbflush.h> 60 #include <asm/fixmap.h> 61 #include <asm/mmu_context.h> 62 #include <asm/setup.h> 63 #include <asm/paravirt.h> 64 #include <asm/e820/api.h> 65 #include <asm/linkage.h> 66 #include <asm/page.h> 67 #include <asm/init.h> 68 #include <asm/pat.h> 69 #include <asm/smp.h> 70 71 #include <asm/xen/hypercall.h> 72 #include <asm/xen/hypervisor.h> 73 74 #include <xen/xen.h> 75 #include <xen/page.h> 76 #include <xen/interface/xen.h> 77 #include <xen/interface/hvm/hvm_op.h> 78 #include <xen/interface/version.h> 79 #include <xen/interface/memory.h> 80 #include <xen/hvc-console.h> 81 82 #include "multicalls.h" 83 #include "mmu.h" 84 #include "debugfs.h" 85 86 #ifdef CONFIG_X86_32 87 /* 88 * Identity map, in addition to plain kernel map. This needs to be 89 * large enough to allocate page table pages to allocate the rest. 90 * Each page can map 2MB. 91 */ 92 #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) 93 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); 94 #endif 95 #ifdef CONFIG_X86_64 96 /* l3 pud for userspace vsyscall mapping */ 97 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; 98 #endif /* CONFIG_X86_64 */ 99 100 /* 101 * Note about cr3 (pagetable base) values: 102 * 103 * xen_cr3 contains the current logical cr3 value; it contains the 104 * last set cr3. This may not be the current effective cr3, because 105 * its update may be being lazily deferred. However, a vcpu looking 106 * at its own cr3 can use this value knowing that it everything will 107 * be self-consistent. 108 * 109 * xen_current_cr3 contains the actual vcpu cr3; it is set once the 110 * hypercall to set the vcpu cr3 is complete (so it may be a little 111 * out of date, but it will never be set early). If one vcpu is 112 * looking at another vcpu's cr3 value, it should use this variable. 113 */ 114 DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ 115 DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ 116 117 static phys_addr_t xen_pt_base, xen_pt_size __initdata; 118 119 /* 120 * Just beyond the highest usermode address. STACK_TOP_MAX has a 121 * redzone above it, so round it up to a PGD boundary. 122 */ 123 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 124 125 void make_lowmem_page_readonly(void *vaddr) 126 { 127 pte_t *pte, ptev; 128 unsigned long address = (unsigned long)vaddr; 129 unsigned int level; 130 131 pte = lookup_address(address, &level); 132 if (pte == NULL) 133 return; /* vaddr missing */ 134 135 ptev = pte_wrprotect(*pte); 136 137 if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 138 BUG(); 139 } 140 141 void make_lowmem_page_readwrite(void *vaddr) 142 { 143 pte_t *pte, ptev; 144 unsigned long address = (unsigned long)vaddr; 145 unsigned int level; 146 147 pte = lookup_address(address, &level); 148 if (pte == NULL) 149 return; /* vaddr missing */ 150 151 ptev = pte_mkwrite(*pte); 152 153 if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 154 BUG(); 155 } 156 157 158 static bool xen_page_pinned(void *ptr) 159 { 160 struct page *page = virt_to_page(ptr); 161 162 return PagePinned(page); 163 } 164 165 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) 166 { 167 struct multicall_space mcs; 168 struct mmu_update *u; 169 170 trace_xen_mmu_set_domain_pte(ptep, pteval, domid); 171 172 mcs = xen_mc_entry(sizeof(*u)); 173 u = mcs.args; 174 175 /* ptep might be kmapped when using 32-bit HIGHPTE */ 176 u->ptr = virt_to_machine(ptep).maddr; 177 u->val = pte_val_ma(pteval); 178 179 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); 180 181 xen_mc_issue(PARAVIRT_LAZY_MMU); 182 } 183 EXPORT_SYMBOL_GPL(xen_set_domain_pte); 184 185 static void xen_extend_mmu_update(const struct mmu_update *update) 186 { 187 struct multicall_space mcs; 188 struct mmu_update *u; 189 190 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 191 192 if (mcs.mc != NULL) { 193 mcs.mc->args[1]++; 194 } else { 195 mcs = __xen_mc_entry(sizeof(*u)); 196 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 197 } 198 199 u = mcs.args; 200 *u = *update; 201 } 202 203 static void xen_extend_mmuext_op(const struct mmuext_op *op) 204 { 205 struct multicall_space mcs; 206 struct mmuext_op *u; 207 208 mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); 209 210 if (mcs.mc != NULL) { 211 mcs.mc->args[1]++; 212 } else { 213 mcs = __xen_mc_entry(sizeof(*u)); 214 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 215 } 216 217 u = mcs.args; 218 *u = *op; 219 } 220 221 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 222 { 223 struct mmu_update u; 224 225 preempt_disable(); 226 227 xen_mc_batch(); 228 229 /* ptr may be ioremapped for 64-bit pagetable setup */ 230 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 231 u.val = pmd_val_ma(val); 232 xen_extend_mmu_update(&u); 233 234 xen_mc_issue(PARAVIRT_LAZY_MMU); 235 236 preempt_enable(); 237 } 238 239 static void xen_set_pmd(pmd_t *ptr, pmd_t val) 240 { 241 trace_xen_mmu_set_pmd(ptr, val); 242 243 /* If page is not pinned, we can just update the entry 244 directly */ 245 if (!xen_page_pinned(ptr)) { 246 *ptr = val; 247 return; 248 } 249 250 xen_set_pmd_hyper(ptr, val); 251 } 252 253 /* 254 * Associate a virtual page frame with a given physical page frame 255 * and protection flags for that frame. 256 */ 257 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 258 { 259 set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 260 } 261 262 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) 263 { 264 struct mmu_update u; 265 266 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) 267 return false; 268 269 xen_mc_batch(); 270 271 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 272 u.val = pte_val_ma(pteval); 273 xen_extend_mmu_update(&u); 274 275 xen_mc_issue(PARAVIRT_LAZY_MMU); 276 277 return true; 278 } 279 280 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 281 { 282 if (!xen_batched_set_pte(ptep, pteval)) { 283 /* 284 * Could call native_set_pte() here and trap and 285 * emulate the PTE write but with 32-bit guests this 286 * needs two traps (one for each of the two 32-bit 287 * words in the PTE) so do one hypercall directly 288 * instead. 289 */ 290 struct mmu_update u; 291 292 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 293 u.val = pte_val_ma(pteval); 294 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); 295 } 296 } 297 298 static void xen_set_pte(pte_t *ptep, pte_t pteval) 299 { 300 trace_xen_mmu_set_pte(ptep, pteval); 301 __xen_set_pte(ptep, pteval); 302 } 303 304 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 305 pte_t *ptep, pte_t pteval) 306 { 307 trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); 308 __xen_set_pte(ptep, pteval); 309 } 310 311 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 312 unsigned long addr, pte_t *ptep) 313 { 314 /* Just return the pte as-is. We preserve the bits on commit */ 315 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); 316 return *ptep; 317 } 318 319 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 320 pte_t *ptep, pte_t pte) 321 { 322 struct mmu_update u; 323 324 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); 325 xen_mc_batch(); 326 327 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 328 u.val = pte_val_ma(pte); 329 xen_extend_mmu_update(&u); 330 331 xen_mc_issue(PARAVIRT_LAZY_MMU); 332 } 333 334 /* Assume pteval_t is equivalent to all the other *val_t types. */ 335 static pteval_t pte_mfn_to_pfn(pteval_t val) 336 { 337 if (val & _PAGE_PRESENT) { 338 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 339 unsigned long pfn = mfn_to_pfn(mfn); 340 341 pteval_t flags = val & PTE_FLAGS_MASK; 342 if (unlikely(pfn == ~0)) 343 val = flags & ~_PAGE_PRESENT; 344 else 345 val = ((pteval_t)pfn << PAGE_SHIFT) | flags; 346 } 347 348 return val; 349 } 350 351 static pteval_t pte_pfn_to_mfn(pteval_t val) 352 { 353 if (val & _PAGE_PRESENT) { 354 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 355 pteval_t flags = val & PTE_FLAGS_MASK; 356 unsigned long mfn; 357 358 if (!xen_feature(XENFEAT_auto_translated_physmap)) 359 mfn = __pfn_to_mfn(pfn); 360 else 361 mfn = pfn; 362 /* 363 * If there's no mfn for the pfn, then just create an 364 * empty non-present pte. Unfortunately this loses 365 * information about the original pfn, so 366 * pte_mfn_to_pfn is asymmetric. 367 */ 368 if (unlikely(mfn == INVALID_P2M_ENTRY)) { 369 mfn = 0; 370 flags = 0; 371 } else 372 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); 373 val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 374 } 375 376 return val; 377 } 378 379 __visible pteval_t xen_pte_val(pte_t pte) 380 { 381 pteval_t pteval = pte.pte; 382 383 return pte_mfn_to_pfn(pteval); 384 } 385 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 386 387 __visible pgdval_t xen_pgd_val(pgd_t pgd) 388 { 389 return pte_mfn_to_pfn(pgd.pgd); 390 } 391 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); 392 393 __visible pte_t xen_make_pte(pteval_t pte) 394 { 395 pte = pte_pfn_to_mfn(pte); 396 397 return native_make_pte(pte); 398 } 399 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 400 401 __visible pgd_t xen_make_pgd(pgdval_t pgd) 402 { 403 pgd = pte_pfn_to_mfn(pgd); 404 return native_make_pgd(pgd); 405 } 406 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 407 408 __visible pmdval_t xen_pmd_val(pmd_t pmd) 409 { 410 return pte_mfn_to_pfn(pmd.pmd); 411 } 412 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); 413 414 static void xen_set_pud_hyper(pud_t *ptr, pud_t val) 415 { 416 struct mmu_update u; 417 418 preempt_disable(); 419 420 xen_mc_batch(); 421 422 /* ptr may be ioremapped for 64-bit pagetable setup */ 423 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 424 u.val = pud_val_ma(val); 425 xen_extend_mmu_update(&u); 426 427 xen_mc_issue(PARAVIRT_LAZY_MMU); 428 429 preempt_enable(); 430 } 431 432 static void xen_set_pud(pud_t *ptr, pud_t val) 433 { 434 trace_xen_mmu_set_pud(ptr, val); 435 436 /* If page is not pinned, we can just update the entry 437 directly */ 438 if (!xen_page_pinned(ptr)) { 439 *ptr = val; 440 return; 441 } 442 443 xen_set_pud_hyper(ptr, val); 444 } 445 446 #ifdef CONFIG_X86_PAE 447 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 448 { 449 trace_xen_mmu_set_pte_atomic(ptep, pte); 450 set_64bit((u64 *)ptep, native_pte_val(pte)); 451 } 452 453 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 454 { 455 trace_xen_mmu_pte_clear(mm, addr, ptep); 456 if (!xen_batched_set_pte(ptep, native_make_pte(0))) 457 native_pte_clear(mm, addr, ptep); 458 } 459 460 static void xen_pmd_clear(pmd_t *pmdp) 461 { 462 trace_xen_mmu_pmd_clear(pmdp); 463 set_pmd(pmdp, __pmd(0)); 464 } 465 #endif /* CONFIG_X86_PAE */ 466 467 __visible pmd_t xen_make_pmd(pmdval_t pmd) 468 { 469 pmd = pte_pfn_to_mfn(pmd); 470 return native_make_pmd(pmd); 471 } 472 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 473 474 #if CONFIG_PGTABLE_LEVELS == 4 475 __visible pudval_t xen_pud_val(pud_t pud) 476 { 477 return pte_mfn_to_pfn(pud.pud); 478 } 479 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 480 481 __visible pud_t xen_make_pud(pudval_t pud) 482 { 483 pud = pte_pfn_to_mfn(pud); 484 485 return native_make_pud(pud); 486 } 487 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); 488 489 static pgd_t *xen_get_user_pgd(pgd_t *pgd) 490 { 491 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); 492 unsigned offset = pgd - pgd_page; 493 pgd_t *user_ptr = NULL; 494 495 if (offset < pgd_index(USER_LIMIT)) { 496 struct page *page = virt_to_page(pgd_page); 497 user_ptr = (pgd_t *)page->private; 498 if (user_ptr) 499 user_ptr += offset; 500 } 501 502 return user_ptr; 503 } 504 505 static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) 506 { 507 struct mmu_update u; 508 509 u.ptr = virt_to_machine(ptr).maddr; 510 u.val = p4d_val_ma(val); 511 xen_extend_mmu_update(&u); 512 } 513 514 /* 515 * Raw hypercall-based set_p4d, intended for in early boot before 516 * there's a page structure. This implies: 517 * 1. The only existing pagetable is the kernel's 518 * 2. It is always pinned 519 * 3. It has no user pagetable attached to it 520 */ 521 static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) 522 { 523 preempt_disable(); 524 525 xen_mc_batch(); 526 527 __xen_set_p4d_hyper(ptr, val); 528 529 xen_mc_issue(PARAVIRT_LAZY_MMU); 530 531 preempt_enable(); 532 } 533 534 static void xen_set_p4d(p4d_t *ptr, p4d_t val) 535 { 536 pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); 537 pgd_t pgd_val; 538 539 trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); 540 541 /* If page is not pinned, we can just update the entry 542 directly */ 543 if (!xen_page_pinned(ptr)) { 544 *ptr = val; 545 if (user_ptr) { 546 WARN_ON(xen_page_pinned(user_ptr)); 547 pgd_val.pgd = p4d_val_ma(val); 548 *user_ptr = pgd_val; 549 } 550 return; 551 } 552 553 /* If it's pinned, then we can at least batch the kernel and 554 user updates together. */ 555 xen_mc_batch(); 556 557 __xen_set_p4d_hyper(ptr, val); 558 if (user_ptr) 559 __xen_set_p4d_hyper((p4d_t *)user_ptr, val); 560 561 xen_mc_issue(PARAVIRT_LAZY_MMU); 562 } 563 #endif /* CONFIG_PGTABLE_LEVELS == 4 */ 564 565 static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, 566 int (*func)(struct mm_struct *mm, struct page *, enum pt_level), 567 bool last, unsigned long limit) 568 { 569 int i, nr, flush = 0; 570 571 nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; 572 for (i = 0; i < nr; i++) { 573 if (!pmd_none(pmd[i])) 574 flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); 575 } 576 return flush; 577 } 578 579 static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, 580 int (*func)(struct mm_struct *mm, struct page *, enum pt_level), 581 bool last, unsigned long limit) 582 { 583 int i, nr, flush = 0; 584 585 nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; 586 for (i = 0; i < nr; i++) { 587 pmd_t *pmd; 588 589 if (pud_none(pud[i])) 590 continue; 591 592 pmd = pmd_offset(&pud[i], 0); 593 if (PTRS_PER_PMD > 1) 594 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); 595 flush |= xen_pmd_walk(mm, pmd, func, 596 last && i == nr - 1, limit); 597 } 598 return flush; 599 } 600 601 static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, 602 int (*func)(struct mm_struct *mm, struct page *, enum pt_level), 603 bool last, unsigned long limit) 604 { 605 int i, nr, flush = 0; 606 607 nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; 608 for (i = 0; i < nr; i++) { 609 pud_t *pud; 610 611 if (p4d_none(p4d[i])) 612 continue; 613 614 pud = pud_offset(&p4d[i], 0); 615 if (PTRS_PER_PUD > 1) 616 flush |= (*func)(mm, virt_to_page(pud), PT_PUD); 617 flush |= xen_pud_walk(mm, pud, func, 618 last && i == nr - 1, limit); 619 } 620 return flush; 621 } 622 623 /* 624 * (Yet another) pagetable walker. This one is intended for pinning a 625 * pagetable. This means that it walks a pagetable and calls the 626 * callback function on each page it finds making up the page table, 627 * at every level. It walks the entire pagetable, but it only bothers 628 * pinning pte pages which are below limit. In the normal case this 629 * will be STACK_TOP_MAX, but at boot we need to pin up to 630 * FIXADDR_TOP. 631 * 632 * For 32-bit the important bit is that we don't pin beyond there, 633 * because then we start getting into Xen's ptes. 634 * 635 * For 64-bit, we must skip the Xen hole in the middle of the address 636 * space, just after the big x86-64 virtual hole. 637 */ 638 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, 639 int (*func)(struct mm_struct *mm, struct page *, 640 enum pt_level), 641 unsigned long limit) 642 { 643 int i, nr, flush = 0; 644 unsigned hole_low, hole_high; 645 646 /* The limit is the last byte to be touched */ 647 limit--; 648 BUG_ON(limit >= FIXADDR_TOP); 649 650 if (xen_feature(XENFEAT_auto_translated_physmap)) 651 return 0; 652 653 /* 654 * 64-bit has a great big hole in the middle of the address 655 * space, which contains the Xen mappings. On 32-bit these 656 * will end up making a zero-sized hole and so is a no-op. 657 */ 658 hole_low = pgd_index(USER_LIMIT); 659 hole_high = pgd_index(PAGE_OFFSET); 660 661 nr = pgd_index(limit) + 1; 662 for (i = 0; i < nr; i++) { 663 p4d_t *p4d; 664 665 if (i >= hole_low && i < hole_high) 666 continue; 667 668 if (pgd_none(pgd[i])) 669 continue; 670 671 p4d = p4d_offset(&pgd[i], 0); 672 if (PTRS_PER_P4D > 1) 673 flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); 674 flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); 675 } 676 677 /* Do the top level last, so that the callbacks can use it as 678 a cue to do final things like tlb flushes. */ 679 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); 680 681 return flush; 682 } 683 684 static int xen_pgd_walk(struct mm_struct *mm, 685 int (*func)(struct mm_struct *mm, struct page *, 686 enum pt_level), 687 unsigned long limit) 688 { 689 return __xen_pgd_walk(mm, mm->pgd, func, limit); 690 } 691 692 /* If we're using split pte locks, then take the page's lock and 693 return a pointer to it. Otherwise return NULL. */ 694 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) 695 { 696 spinlock_t *ptl = NULL; 697 698 #if USE_SPLIT_PTE_PTLOCKS 699 ptl = ptlock_ptr(page); 700 spin_lock_nest_lock(ptl, &mm->page_table_lock); 701 #endif 702 703 return ptl; 704 } 705 706 static void xen_pte_unlock(void *v) 707 { 708 spinlock_t *ptl = v; 709 spin_unlock(ptl); 710 } 711 712 static void xen_do_pin(unsigned level, unsigned long pfn) 713 { 714 struct mmuext_op op; 715 716 op.cmd = level; 717 op.arg1.mfn = pfn_to_mfn(pfn); 718 719 xen_extend_mmuext_op(&op); 720 } 721 722 static int xen_pin_page(struct mm_struct *mm, struct page *page, 723 enum pt_level level) 724 { 725 unsigned pgfl = TestSetPagePinned(page); 726 int flush; 727 728 if (pgfl) 729 flush = 0; /* already pinned */ 730 else if (PageHighMem(page)) 731 /* kmaps need flushing if we found an unpinned 732 highpage */ 733 flush = 1; 734 else { 735 void *pt = lowmem_page_address(page); 736 unsigned long pfn = page_to_pfn(page); 737 struct multicall_space mcs = __xen_mc_entry(0); 738 spinlock_t *ptl; 739 740 flush = 0; 741 742 /* 743 * We need to hold the pagetable lock between the time 744 * we make the pagetable RO and when we actually pin 745 * it. If we don't, then other users may come in and 746 * attempt to update the pagetable by writing it, 747 * which will fail because the memory is RO but not 748 * pinned, so Xen won't do the trap'n'emulate. 749 * 750 * If we're using split pte locks, we can't hold the 751 * entire pagetable's worth of locks during the 752 * traverse, because we may wrap the preempt count (8 753 * bits). The solution is to mark RO and pin each PTE 754 * page while holding the lock. This means the number 755 * of locks we end up holding is never more than a 756 * batch size (~32 entries, at present). 757 * 758 * If we're not using split pte locks, we needn't pin 759 * the PTE pages independently, because we're 760 * protected by the overall pagetable lock. 761 */ 762 ptl = NULL; 763 if (level == PT_PTE) 764 ptl = xen_pte_lock(page, mm); 765 766 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 767 pfn_pte(pfn, PAGE_KERNEL_RO), 768 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 769 770 if (ptl) { 771 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 772 773 /* Queue a deferred unlock for when this batch 774 is completed. */ 775 xen_mc_callback(xen_pte_unlock, ptl); 776 } 777 } 778 779 return flush; 780 } 781 782 /* This is called just after a mm has been created, but it has not 783 been used yet. We need to make sure that its pagetable is all 784 read-only, and can be pinned. */ 785 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) 786 { 787 trace_xen_mmu_pgd_pin(mm, pgd); 788 789 xen_mc_batch(); 790 791 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { 792 /* re-enable interrupts for flushing */ 793 xen_mc_issue(0); 794 795 kmap_flush_unused(); 796 797 xen_mc_batch(); 798 } 799 800 #ifdef CONFIG_X86_64 801 { 802 pgd_t *user_pgd = xen_get_user_pgd(pgd); 803 804 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 805 806 if (user_pgd) { 807 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); 808 xen_do_pin(MMUEXT_PIN_L4_TABLE, 809 PFN_DOWN(__pa(user_pgd))); 810 } 811 } 812 #else /* CONFIG_X86_32 */ 813 #ifdef CONFIG_X86_PAE 814 /* Need to make sure unshared kernel PMD is pinnable */ 815 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 816 PT_PMD); 817 #endif 818 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 819 #endif /* CONFIG_X86_64 */ 820 xen_mc_issue(0); 821 } 822 823 static void xen_pgd_pin(struct mm_struct *mm) 824 { 825 __xen_pgd_pin(mm, mm->pgd); 826 } 827 828 /* 829 * On save, we need to pin all pagetables to make sure they get their 830 * mfns turned into pfns. Search the list for any unpinned pgds and pin 831 * them (unpinned pgds are not currently in use, probably because the 832 * process is under construction or destruction). 833 * 834 * Expected to be called in stop_machine() ("equivalent to taking 835 * every spinlock in the system"), so the locking doesn't really 836 * matter all that much. 837 */ 838 void xen_mm_pin_all(void) 839 { 840 struct page *page; 841 842 spin_lock(&pgd_lock); 843 844 list_for_each_entry(page, &pgd_list, lru) { 845 if (!PagePinned(page)) { 846 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); 847 SetPageSavePinned(page); 848 } 849 } 850 851 spin_unlock(&pgd_lock); 852 } 853 854 /* 855 * The init_mm pagetable is really pinned as soon as its created, but 856 * that's before we have page structures to store the bits. So do all 857 * the book-keeping now. 858 */ 859 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, 860 enum pt_level level) 861 { 862 SetPagePinned(page); 863 return 0; 864 } 865 866 static void __init xen_mark_init_mm_pinned(void) 867 { 868 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); 869 } 870 871 static int xen_unpin_page(struct mm_struct *mm, struct page *page, 872 enum pt_level level) 873 { 874 unsigned pgfl = TestClearPagePinned(page); 875 876 if (pgfl && !PageHighMem(page)) { 877 void *pt = lowmem_page_address(page); 878 unsigned long pfn = page_to_pfn(page); 879 spinlock_t *ptl = NULL; 880 struct multicall_space mcs; 881 882 /* 883 * Do the converse to pin_page. If we're using split 884 * pte locks, we must be holding the lock for while 885 * the pte page is unpinned but still RO to prevent 886 * concurrent updates from seeing it in this 887 * partially-pinned state. 888 */ 889 if (level == PT_PTE) { 890 ptl = xen_pte_lock(page, mm); 891 892 if (ptl) 893 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 894 } 895 896 mcs = __xen_mc_entry(0); 897 898 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 899 pfn_pte(pfn, PAGE_KERNEL), 900 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 901 902 if (ptl) { 903 /* unlock when batch completed */ 904 xen_mc_callback(xen_pte_unlock, ptl); 905 } 906 } 907 908 return 0; /* never need to flush on unpin */ 909 } 910 911 /* Release a pagetables pages back as normal RW */ 912 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) 913 { 914 trace_xen_mmu_pgd_unpin(mm, pgd); 915 916 xen_mc_batch(); 917 918 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 919 920 #ifdef CONFIG_X86_64 921 { 922 pgd_t *user_pgd = xen_get_user_pgd(pgd); 923 924 if (user_pgd) { 925 xen_do_pin(MMUEXT_UNPIN_TABLE, 926 PFN_DOWN(__pa(user_pgd))); 927 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); 928 } 929 } 930 #endif 931 932 #ifdef CONFIG_X86_PAE 933 /* Need to make sure unshared kernel PMD is unpinned */ 934 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 935 PT_PMD); 936 #endif 937 938 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); 939 940 xen_mc_issue(0); 941 } 942 943 static void xen_pgd_unpin(struct mm_struct *mm) 944 { 945 __xen_pgd_unpin(mm, mm->pgd); 946 } 947 948 /* 949 * On resume, undo any pinning done at save, so that the rest of the 950 * kernel doesn't see any unexpected pinned pagetables. 951 */ 952 void xen_mm_unpin_all(void) 953 { 954 struct page *page; 955 956 spin_lock(&pgd_lock); 957 958 list_for_each_entry(page, &pgd_list, lru) { 959 if (PageSavePinned(page)) { 960 BUG_ON(!PagePinned(page)); 961 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); 962 ClearPageSavePinned(page); 963 } 964 } 965 966 spin_unlock(&pgd_lock); 967 } 968 969 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 970 { 971 spin_lock(&next->page_table_lock); 972 xen_pgd_pin(next); 973 spin_unlock(&next->page_table_lock); 974 } 975 976 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 977 { 978 spin_lock(&mm->page_table_lock); 979 xen_pgd_pin(mm); 980 spin_unlock(&mm->page_table_lock); 981 } 982 983 984 #ifdef CONFIG_SMP 985 /* Another cpu may still have their %cr3 pointing at the pagetable, so 986 we need to repoint it somewhere else before we can unpin it. */ 987 static void drop_other_mm_ref(void *info) 988 { 989 struct mm_struct *mm = info; 990 struct mm_struct *active_mm; 991 992 active_mm = this_cpu_read(cpu_tlbstate.active_mm); 993 994 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) 995 leave_mm(smp_processor_id()); 996 997 /* If this cpu still has a stale cr3 reference, then make sure 998 it has been flushed. */ 999 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) 1000 load_cr3(swapper_pg_dir); 1001 } 1002 1003 static void xen_drop_mm_ref(struct mm_struct *mm) 1004 { 1005 cpumask_var_t mask; 1006 unsigned cpu; 1007 1008 if (current->active_mm == mm) { 1009 if (current->mm == mm) 1010 load_cr3(swapper_pg_dir); 1011 else 1012 leave_mm(smp_processor_id()); 1013 } 1014 1015 /* Get the "official" set of cpus referring to our pagetable. */ 1016 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1017 for_each_online_cpu(cpu) { 1018 if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1019 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1020 continue; 1021 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 1022 } 1023 return; 1024 } 1025 cpumask_copy(mask, mm_cpumask(mm)); 1026 1027 /* It's possible that a vcpu may have a stale reference to our 1028 cr3, because its in lazy mode, and it hasn't yet flushed 1029 its set of pending hypercalls yet. In this case, we can 1030 look at its actual current cr3 value, and force it to flush 1031 if needed. */ 1032 for_each_online_cpu(cpu) { 1033 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1034 cpumask_set_cpu(cpu, mask); 1035 } 1036 1037 if (!cpumask_empty(mask)) 1038 smp_call_function_many(mask, drop_other_mm_ref, mm, 1); 1039 free_cpumask_var(mask); 1040 } 1041 #else 1042 static void xen_drop_mm_ref(struct mm_struct *mm) 1043 { 1044 if (current->active_mm == mm) 1045 load_cr3(swapper_pg_dir); 1046 } 1047 #endif 1048 1049 /* 1050 * While a process runs, Xen pins its pagetables, which means that the 1051 * hypervisor forces it to be read-only, and it controls all updates 1052 * to it. This means that all pagetable updates have to go via the 1053 * hypervisor, which is moderately expensive. 1054 * 1055 * Since we're pulling the pagetable down, we switch to use init_mm, 1056 * unpin old process pagetable and mark it all read-write, which 1057 * allows further operations on it to be simple memory accesses. 1058 * 1059 * The only subtle point is that another CPU may be still using the 1060 * pagetable because of lazy tlb flushing. This means we need need to 1061 * switch all CPUs off this pagetable before we can unpin it. 1062 */ 1063 static void xen_exit_mmap(struct mm_struct *mm) 1064 { 1065 get_cpu(); /* make sure we don't move around */ 1066 xen_drop_mm_ref(mm); 1067 put_cpu(); 1068 1069 spin_lock(&mm->page_table_lock); 1070 1071 /* pgd may not be pinned in the error exit path of execve */ 1072 if (xen_page_pinned(mm->pgd)) 1073 xen_pgd_unpin(mm); 1074 1075 spin_unlock(&mm->page_table_lock); 1076 } 1077 1078 static void xen_post_allocator_init(void); 1079 1080 static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1081 { 1082 struct mmuext_op op; 1083 1084 op.cmd = cmd; 1085 op.arg1.mfn = pfn_to_mfn(pfn); 1086 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) 1087 BUG(); 1088 } 1089 1090 #ifdef CONFIG_X86_64 1091 static void __init xen_cleanhighmap(unsigned long vaddr, 1092 unsigned long vaddr_end) 1093 { 1094 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 1095 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); 1096 1097 /* NOTE: The loop is more greedy than the cleanup_highmap variant. 1098 * We include the PMD passed in on _both_ boundaries. */ 1099 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); 1100 pmd++, vaddr += PMD_SIZE) { 1101 if (pmd_none(*pmd)) 1102 continue; 1103 if (vaddr < (unsigned long) _text || vaddr > kernel_end) 1104 set_pmd(pmd, __pmd(0)); 1105 } 1106 /* In case we did something silly, we should crash in this function 1107 * instead of somewhere later and be confusing. */ 1108 xen_mc_flush(); 1109 } 1110 1111 /* 1112 * Make a page range writeable and free it. 1113 */ 1114 static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) 1115 { 1116 void *vaddr = __va(paddr); 1117 void *vaddr_end = vaddr + size; 1118 1119 for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) 1120 make_lowmem_page_readwrite(vaddr); 1121 1122 memblock_free(paddr, size); 1123 } 1124 1125 static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) 1126 { 1127 unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; 1128 1129 if (unpin) 1130 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); 1131 ClearPagePinned(virt_to_page(__va(pa))); 1132 xen_free_ro_pages(pa, PAGE_SIZE); 1133 } 1134 1135 static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) 1136 { 1137 unsigned long pa; 1138 pte_t *pte_tbl; 1139 int i; 1140 1141 if (pmd_large(*pmd)) { 1142 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; 1143 xen_free_ro_pages(pa, PMD_SIZE); 1144 return; 1145 } 1146 1147 pte_tbl = pte_offset_kernel(pmd, 0); 1148 for (i = 0; i < PTRS_PER_PTE; i++) { 1149 if (pte_none(pte_tbl[i])) 1150 continue; 1151 pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; 1152 xen_free_ro_pages(pa, PAGE_SIZE); 1153 } 1154 set_pmd(pmd, __pmd(0)); 1155 xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); 1156 } 1157 1158 static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) 1159 { 1160 unsigned long pa; 1161 pmd_t *pmd_tbl; 1162 int i; 1163 1164 if (pud_large(*pud)) { 1165 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; 1166 xen_free_ro_pages(pa, PUD_SIZE); 1167 return; 1168 } 1169 1170 pmd_tbl = pmd_offset(pud, 0); 1171 for (i = 0; i < PTRS_PER_PMD; i++) { 1172 if (pmd_none(pmd_tbl[i])) 1173 continue; 1174 xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); 1175 } 1176 set_pud(pud, __pud(0)); 1177 xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); 1178 } 1179 1180 static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) 1181 { 1182 unsigned long pa; 1183 pud_t *pud_tbl; 1184 int i; 1185 1186 if (p4d_large(*p4d)) { 1187 pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; 1188 xen_free_ro_pages(pa, P4D_SIZE); 1189 return; 1190 } 1191 1192 pud_tbl = pud_offset(p4d, 0); 1193 for (i = 0; i < PTRS_PER_PUD; i++) { 1194 if (pud_none(pud_tbl[i])) 1195 continue; 1196 xen_cleanmfnmap_pud(pud_tbl + i, unpin); 1197 } 1198 set_p4d(p4d, __p4d(0)); 1199 xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); 1200 } 1201 1202 /* 1203 * Since it is well isolated we can (and since it is perhaps large we should) 1204 * also free the page tables mapping the initial P->M table. 1205 */ 1206 static void __init xen_cleanmfnmap(unsigned long vaddr) 1207 { 1208 pgd_t *pgd; 1209 p4d_t *p4d; 1210 unsigned int i; 1211 bool unpin; 1212 1213 unpin = (vaddr == 2 * PGDIR_SIZE); 1214 vaddr &= PMD_MASK; 1215 pgd = pgd_offset_k(vaddr); 1216 p4d = p4d_offset(pgd, 0); 1217 for (i = 0; i < PTRS_PER_P4D; i++) { 1218 if (p4d_none(p4d[i])) 1219 continue; 1220 xen_cleanmfnmap_p4d(p4d + i, unpin); 1221 } 1222 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 1223 set_pgd(pgd, __pgd(0)); 1224 xen_cleanmfnmap_free_pgtbl(p4d, unpin); 1225 } 1226 } 1227 1228 static void __init xen_pagetable_p2m_free(void) 1229 { 1230 unsigned long size; 1231 unsigned long addr; 1232 1233 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1234 1235 /* No memory or already called. */ 1236 if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) 1237 return; 1238 1239 /* using __ka address and sticking INVALID_P2M_ENTRY! */ 1240 memset((void *)xen_start_info->mfn_list, 0xff, size); 1241 1242 addr = xen_start_info->mfn_list; 1243 /* 1244 * We could be in __ka space. 1245 * We roundup to the PMD, which means that if anybody at this stage is 1246 * using the __ka address of xen_start_info or 1247 * xen_start_info->shared_info they are in going to crash. Fortunatly 1248 * we have already revectored in xen_setup_kernel_pagetable and in 1249 * xen_setup_shared_info. 1250 */ 1251 size = roundup(size, PMD_SIZE); 1252 1253 if (addr >= __START_KERNEL_map) { 1254 xen_cleanhighmap(addr, addr + size); 1255 size = PAGE_ALIGN(xen_start_info->nr_pages * 1256 sizeof(unsigned long)); 1257 memblock_free(__pa(addr), size); 1258 } else { 1259 xen_cleanmfnmap(addr); 1260 } 1261 } 1262 1263 static void __init xen_pagetable_cleanhighmap(void) 1264 { 1265 unsigned long size; 1266 unsigned long addr; 1267 1268 /* At this stage, cleanup_highmap has already cleaned __ka space 1269 * from _brk_limit way up to the max_pfn_mapped (which is the end of 1270 * the ramdisk). We continue on, erasing PMD entries that point to page 1271 * tables - do note that they are accessible at this stage via __va. 1272 * For good measure we also round up to the PMD - which means that if 1273 * anybody is using __ka address to the initial boot-stack - and try 1274 * to use it - they are going to crash. The xen_start_info has been 1275 * taken care of already in xen_setup_kernel_pagetable. */ 1276 addr = xen_start_info->pt_base; 1277 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); 1278 1279 xen_cleanhighmap(addr, addr + size); 1280 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); 1281 #ifdef DEBUG 1282 /* This is superfluous and is not necessary, but you know what 1283 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of 1284 * anything at this stage. */ 1285 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); 1286 #endif 1287 } 1288 #endif 1289 1290 static void __init xen_pagetable_p2m_setup(void) 1291 { 1292 if (xen_feature(XENFEAT_auto_translated_physmap)) 1293 return; 1294 1295 xen_vmalloc_p2m_tree(); 1296 1297 #ifdef CONFIG_X86_64 1298 xen_pagetable_p2m_free(); 1299 1300 xen_pagetable_cleanhighmap(); 1301 #endif 1302 /* And revector! Bye bye old array */ 1303 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; 1304 } 1305 1306 static void __init xen_pagetable_init(void) 1307 { 1308 paging_init(); 1309 xen_post_allocator_init(); 1310 1311 xen_pagetable_p2m_setup(); 1312 1313 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1314 xen_build_mfn_list_list(); 1315 1316 /* Remap memory freed due to conflicts with E820 map */ 1317 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1318 xen_remap_memory(); 1319 1320 xen_setup_shared_info(); 1321 } 1322 static void xen_write_cr2(unsigned long cr2) 1323 { 1324 this_cpu_read(xen_vcpu)->arch.cr2 = cr2; 1325 } 1326 1327 static unsigned long xen_read_cr2(void) 1328 { 1329 return this_cpu_read(xen_vcpu)->arch.cr2; 1330 } 1331 1332 unsigned long xen_read_cr2_direct(void) 1333 { 1334 return this_cpu_read(xen_vcpu_info.arch.cr2); 1335 } 1336 1337 static void xen_flush_tlb(void) 1338 { 1339 struct mmuext_op *op; 1340 struct multicall_space mcs; 1341 1342 trace_xen_mmu_flush_tlb(0); 1343 1344 preempt_disable(); 1345 1346 mcs = xen_mc_entry(sizeof(*op)); 1347 1348 op = mcs.args; 1349 op->cmd = MMUEXT_TLB_FLUSH_LOCAL; 1350 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1351 1352 xen_mc_issue(PARAVIRT_LAZY_MMU); 1353 1354 preempt_enable(); 1355 } 1356 1357 static void xen_flush_tlb_single(unsigned long addr) 1358 { 1359 struct mmuext_op *op; 1360 struct multicall_space mcs; 1361 1362 trace_xen_mmu_flush_tlb_single(addr); 1363 1364 preempt_disable(); 1365 1366 mcs = xen_mc_entry(sizeof(*op)); 1367 op = mcs.args; 1368 op->cmd = MMUEXT_INVLPG_LOCAL; 1369 op->arg1.linear_addr = addr & PAGE_MASK; 1370 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1371 1372 xen_mc_issue(PARAVIRT_LAZY_MMU); 1373 1374 preempt_enable(); 1375 } 1376 1377 static void xen_flush_tlb_others(const struct cpumask *cpus, 1378 struct mm_struct *mm, unsigned long start, 1379 unsigned long end) 1380 { 1381 struct { 1382 struct mmuext_op op; 1383 #ifdef CONFIG_SMP 1384 DECLARE_BITMAP(mask, num_processors); 1385 #else 1386 DECLARE_BITMAP(mask, NR_CPUS); 1387 #endif 1388 } *args; 1389 struct multicall_space mcs; 1390 1391 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); 1392 1393 if (cpumask_empty(cpus)) 1394 return; /* nothing to do */ 1395 1396 mcs = xen_mc_entry(sizeof(*args)); 1397 args = mcs.args; 1398 args->op.arg2.vcpumask = to_cpumask(args->mask); 1399 1400 /* Remove us, and any offline CPUS. */ 1401 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1402 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1403 1404 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1405 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1406 args->op.cmd = MMUEXT_INVLPG_MULTI; 1407 args->op.arg1.linear_addr = start; 1408 } 1409 1410 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1411 1412 xen_mc_issue(PARAVIRT_LAZY_MMU); 1413 } 1414 1415 static unsigned long xen_read_cr3(void) 1416 { 1417 return this_cpu_read(xen_cr3); 1418 } 1419 1420 static void set_current_cr3(void *v) 1421 { 1422 this_cpu_write(xen_current_cr3, (unsigned long)v); 1423 } 1424 1425 static void __xen_write_cr3(bool kernel, unsigned long cr3) 1426 { 1427 struct mmuext_op op; 1428 unsigned long mfn; 1429 1430 trace_xen_mmu_write_cr3(kernel, cr3); 1431 1432 if (cr3) 1433 mfn = pfn_to_mfn(PFN_DOWN(cr3)); 1434 else 1435 mfn = 0; 1436 1437 WARN_ON(mfn == 0 && kernel); 1438 1439 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; 1440 op.arg1.mfn = mfn; 1441 1442 xen_extend_mmuext_op(&op); 1443 1444 if (kernel) { 1445 this_cpu_write(xen_cr3, cr3); 1446 1447 /* Update xen_current_cr3 once the batch has actually 1448 been submitted. */ 1449 xen_mc_callback(set_current_cr3, (void *)cr3); 1450 } 1451 } 1452 static void xen_write_cr3(unsigned long cr3) 1453 { 1454 BUG_ON(preemptible()); 1455 1456 xen_mc_batch(); /* disables interrupts */ 1457 1458 /* Update while interrupts are disabled, so its atomic with 1459 respect to ipis */ 1460 this_cpu_write(xen_cr3, cr3); 1461 1462 __xen_write_cr3(true, cr3); 1463 1464 #ifdef CONFIG_X86_64 1465 { 1466 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); 1467 if (user_pgd) 1468 __xen_write_cr3(false, __pa(user_pgd)); 1469 else 1470 __xen_write_cr3(false, 0); 1471 } 1472 #endif 1473 1474 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 1475 } 1476 1477 #ifdef CONFIG_X86_64 1478 /* 1479 * At the start of the day - when Xen launches a guest, it has already 1480 * built pagetables for the guest. We diligently look over them 1481 * in xen_setup_kernel_pagetable and graft as appropriate them in the 1482 * init_level4_pgt and its friends. Then when we are happy we load 1483 * the new init_level4_pgt - and continue on. 1484 * 1485 * The generic code starts (start_kernel) and 'init_mem_mapping' sets 1486 * up the rest of the pagetables. When it has completed it loads the cr3. 1487 * N.B. that baremetal would start at 'start_kernel' (and the early 1488 * #PF handler would create bootstrap pagetables) - so we are running 1489 * with the same assumptions as what to do when write_cr3 is executed 1490 * at this point. 1491 * 1492 * Since there are no user-page tables at all, we have two variants 1493 * of xen_write_cr3 - the early bootup (this one), and the late one 1494 * (xen_write_cr3). The reason we have to do that is that in 64-bit 1495 * the Linux kernel and user-space are both in ring 3 while the 1496 * hypervisor is in ring 0. 1497 */ 1498 static void __init xen_write_cr3_init(unsigned long cr3) 1499 { 1500 BUG_ON(preemptible()); 1501 1502 xen_mc_batch(); /* disables interrupts */ 1503 1504 /* Update while interrupts are disabled, so its atomic with 1505 respect to ipis */ 1506 this_cpu_write(xen_cr3, cr3); 1507 1508 __xen_write_cr3(true, cr3); 1509 1510 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 1511 } 1512 #endif 1513 1514 static int xen_pgd_alloc(struct mm_struct *mm) 1515 { 1516 pgd_t *pgd = mm->pgd; 1517 int ret = 0; 1518 1519 BUG_ON(PagePinned(virt_to_page(pgd))); 1520 1521 #ifdef CONFIG_X86_64 1522 { 1523 struct page *page = virt_to_page(pgd); 1524 pgd_t *user_pgd; 1525 1526 BUG_ON(page->private != 0); 1527 1528 ret = -ENOMEM; 1529 1530 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 1531 page->private = (unsigned long)user_pgd; 1532 1533 if (user_pgd != NULL) { 1534 #ifdef CONFIG_X86_VSYSCALL_EMULATION 1535 user_pgd[pgd_index(VSYSCALL_ADDR)] = 1536 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); 1537 #endif 1538 ret = 0; 1539 } 1540 1541 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); 1542 } 1543 #endif 1544 return ret; 1545 } 1546 1547 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 1548 { 1549 #ifdef CONFIG_X86_64 1550 pgd_t *user_pgd = xen_get_user_pgd(pgd); 1551 1552 if (user_pgd) 1553 free_page((unsigned long)user_pgd); 1554 #endif 1555 } 1556 1557 /* 1558 * Init-time set_pte while constructing initial pagetables, which 1559 * doesn't allow RO page table pages to be remapped RW. 1560 * 1561 * If there is no MFN for this PFN then this page is initially 1562 * ballooned out so clear the PTE (as in decrease_reservation() in 1563 * drivers/xen/balloon.c). 1564 * 1565 * Many of these PTE updates are done on unpinned and writable pages 1566 * and doing a hypercall for these is unnecessary and expensive. At 1567 * this point it is not possible to tell if a page is pinned or not, 1568 * so always write the PTE directly and rely on Xen trapping and 1569 * emulating any updates as necessary. 1570 */ 1571 __visible pte_t xen_make_pte_init(pteval_t pte) 1572 { 1573 #ifdef CONFIG_X86_64 1574 unsigned long pfn; 1575 1576 /* 1577 * Pages belonging to the initial p2m list mapped outside the default 1578 * address range must be mapped read-only. This region contains the 1579 * page tables for mapping the p2m list, too, and page tables MUST be 1580 * mapped read-only. 1581 */ 1582 pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; 1583 if (xen_start_info->mfn_list < __START_KERNEL_map && 1584 pfn >= xen_start_info->first_p2m_pfn && 1585 pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) 1586 pte &= ~_PAGE_RW; 1587 #endif 1588 pte = pte_pfn_to_mfn(pte); 1589 return native_make_pte(pte); 1590 } 1591 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); 1592 1593 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 1594 { 1595 #ifdef CONFIG_X86_32 1596 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1597 if (pte_mfn(pte) != INVALID_P2M_ENTRY 1598 && pte_val_ma(*ptep) & _PAGE_PRESENT) 1599 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1600 pte_val_ma(pte)); 1601 #endif 1602 native_set_pte(ptep, pte); 1603 } 1604 1605 /* Early in boot, while setting up the initial pagetable, assume 1606 everything is pinned. */ 1607 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1608 { 1609 #ifdef CONFIG_FLATMEM 1610 BUG_ON(mem_map); /* should only be used early */ 1611 #endif 1612 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 1613 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 1614 } 1615 1616 /* Used for pmd and pud */ 1617 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 1618 { 1619 #ifdef CONFIG_FLATMEM 1620 BUG_ON(mem_map); /* should only be used early */ 1621 #endif 1622 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 1623 } 1624 1625 /* Early release_pte assumes that all pts are pinned, since there's 1626 only init_mm and anything attached to that is pinned. */ 1627 static void __init xen_release_pte_init(unsigned long pfn) 1628 { 1629 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1630 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1631 } 1632 1633 static void __init xen_release_pmd_init(unsigned long pfn) 1634 { 1635 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1636 } 1637 1638 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1639 { 1640 struct multicall_space mcs; 1641 struct mmuext_op *op; 1642 1643 mcs = __xen_mc_entry(sizeof(*op)); 1644 op = mcs.args; 1645 op->cmd = cmd; 1646 op->arg1.mfn = pfn_to_mfn(pfn); 1647 1648 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 1649 } 1650 1651 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) 1652 { 1653 struct multicall_space mcs; 1654 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); 1655 1656 mcs = __xen_mc_entry(0); 1657 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, 1658 pfn_pte(pfn, prot), 0); 1659 } 1660 1661 /* This needs to make sure the new pte page is pinned iff its being 1662 attached to a pinned pagetable. */ 1663 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, 1664 unsigned level) 1665 { 1666 bool pinned = PagePinned(virt_to_page(mm->pgd)); 1667 1668 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); 1669 1670 if (pinned) { 1671 struct page *page = pfn_to_page(pfn); 1672 1673 SetPagePinned(page); 1674 1675 if (!PageHighMem(page)) { 1676 xen_mc_batch(); 1677 1678 __set_pfn_prot(pfn, PAGE_KERNEL_RO); 1679 1680 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 1681 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 1682 1683 xen_mc_issue(PARAVIRT_LAZY_MMU); 1684 } else { 1685 /* make sure there are no stray mappings of 1686 this page */ 1687 kmap_flush_unused(); 1688 } 1689 } 1690 } 1691 1692 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) 1693 { 1694 xen_alloc_ptpage(mm, pfn, PT_PTE); 1695 } 1696 1697 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) 1698 { 1699 xen_alloc_ptpage(mm, pfn, PT_PMD); 1700 } 1701 1702 /* This should never happen until we're OK to use struct page */ 1703 static inline void xen_release_ptpage(unsigned long pfn, unsigned level) 1704 { 1705 struct page *page = pfn_to_page(pfn); 1706 bool pinned = PagePinned(page); 1707 1708 trace_xen_mmu_release_ptpage(pfn, level, pinned); 1709 1710 if (pinned) { 1711 if (!PageHighMem(page)) { 1712 xen_mc_batch(); 1713 1714 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 1715 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1716 1717 __set_pfn_prot(pfn, PAGE_KERNEL); 1718 1719 xen_mc_issue(PARAVIRT_LAZY_MMU); 1720 } 1721 ClearPagePinned(page); 1722 } 1723 } 1724 1725 static void xen_release_pte(unsigned long pfn) 1726 { 1727 xen_release_ptpage(pfn, PT_PTE); 1728 } 1729 1730 static void xen_release_pmd(unsigned long pfn) 1731 { 1732 xen_release_ptpage(pfn, PT_PMD); 1733 } 1734 1735 #if CONFIG_PGTABLE_LEVELS >= 4 1736 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 1737 { 1738 xen_alloc_ptpage(mm, pfn, PT_PUD); 1739 } 1740 1741 static void xen_release_pud(unsigned long pfn) 1742 { 1743 xen_release_ptpage(pfn, PT_PUD); 1744 } 1745 #endif 1746 1747 void __init xen_reserve_top(void) 1748 { 1749 #ifdef CONFIG_X86_32 1750 unsigned long top = HYPERVISOR_VIRT_START; 1751 struct xen_platform_parameters pp; 1752 1753 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1754 top = pp.virt_start; 1755 1756 reserve_top_address(-top); 1757 #endif /* CONFIG_X86_32 */ 1758 } 1759 1760 /* 1761 * Like __va(), but returns address in the kernel mapping (which is 1762 * all we have until the physical memory mapping has been set up. 1763 */ 1764 static void * __init __ka(phys_addr_t paddr) 1765 { 1766 #ifdef CONFIG_X86_64 1767 return (void *)(paddr + __START_KERNEL_map); 1768 #else 1769 return __va(paddr); 1770 #endif 1771 } 1772 1773 /* Convert a machine address to physical address */ 1774 static unsigned long __init m2p(phys_addr_t maddr) 1775 { 1776 phys_addr_t paddr; 1777 1778 maddr &= PTE_PFN_MASK; 1779 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; 1780 1781 return paddr; 1782 } 1783 1784 /* Convert a machine address to kernel virtual */ 1785 static void * __init m2v(phys_addr_t maddr) 1786 { 1787 return __ka(m2p(maddr)); 1788 } 1789 1790 /* Set the page permissions on an identity-mapped pages */ 1791 static void __init set_page_prot_flags(void *addr, pgprot_t prot, 1792 unsigned long flags) 1793 { 1794 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1795 pte_t pte = pfn_pte(pfn, prot); 1796 1797 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) 1798 BUG(); 1799 } 1800 static void __init set_page_prot(void *addr, pgprot_t prot) 1801 { 1802 return set_page_prot_flags(addr, prot, UVMF_NONE); 1803 } 1804 #ifdef CONFIG_X86_32 1805 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1806 { 1807 unsigned pmdidx, pteidx; 1808 unsigned ident_pte; 1809 unsigned long pfn; 1810 1811 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, 1812 PAGE_SIZE); 1813 1814 ident_pte = 0; 1815 pfn = 0; 1816 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1817 pte_t *pte_page; 1818 1819 /* Reuse or allocate a page of ptes */ 1820 if (pmd_present(pmd[pmdidx])) 1821 pte_page = m2v(pmd[pmdidx].pmd); 1822 else { 1823 /* Check for free pte pages */ 1824 if (ident_pte == LEVEL1_IDENT_ENTRIES) 1825 break; 1826 1827 pte_page = &level1_ident_pgt[ident_pte]; 1828 ident_pte += PTRS_PER_PTE; 1829 1830 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); 1831 } 1832 1833 /* Install mappings */ 1834 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1835 pte_t pte; 1836 1837 if (pfn > max_pfn_mapped) 1838 max_pfn_mapped = pfn; 1839 1840 if (!pte_none(pte_page[pteidx])) 1841 continue; 1842 1843 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); 1844 pte_page[pteidx] = pte; 1845 } 1846 } 1847 1848 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) 1849 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); 1850 1851 set_page_prot(pmd, PAGE_KERNEL_RO); 1852 } 1853 #endif 1854 void __init xen_setup_machphys_mapping(void) 1855 { 1856 struct xen_machphys_mapping mapping; 1857 1858 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { 1859 machine_to_phys_mapping = (unsigned long *)mapping.v_start; 1860 machine_to_phys_nr = mapping.max_mfn + 1; 1861 } else { 1862 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; 1863 } 1864 #ifdef CONFIG_X86_32 1865 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) 1866 < machine_to_phys_mapping); 1867 #endif 1868 } 1869 1870 #ifdef CONFIG_X86_64 1871 static void __init convert_pfn_mfn(void *v) 1872 { 1873 pte_t *pte = v; 1874 int i; 1875 1876 /* All levels are converted the same way, so just treat them 1877 as ptes. */ 1878 for (i = 0; i < PTRS_PER_PTE; i++) 1879 pte[i] = xen_make_pte(pte[i].pte); 1880 } 1881 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, 1882 unsigned long addr) 1883 { 1884 if (*pt_base == PFN_DOWN(__pa(addr))) { 1885 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 1886 clear_page((void *)addr); 1887 (*pt_base)++; 1888 } 1889 if (*pt_end == PFN_DOWN(__pa(addr))) { 1890 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 1891 clear_page((void *)addr); 1892 (*pt_end)--; 1893 } 1894 } 1895 /* 1896 * Set up the initial kernel pagetable. 1897 * 1898 * We can construct this by grafting the Xen provided pagetable into 1899 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1900 * level2_ident_pgt, and level2_kernel_pgt. This means that only the 1901 * kernel has a physical mapping to start with - but that's enough to 1902 * get __va working. We need to fill in the rest of the physical 1903 * mapping once some sort of allocator has been set up. 1904 */ 1905 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1906 { 1907 pud_t *l3; 1908 pmd_t *l2; 1909 unsigned long addr[3]; 1910 unsigned long pt_base, pt_end; 1911 unsigned i; 1912 1913 /* max_pfn_mapped is the last pfn mapped in the initial memory 1914 * mappings. Considering that on Xen after the kernel mappings we 1915 * have the mappings of some pages that don't exist in pfn space, we 1916 * set max_pfn_mapped to the last real pfn mapped. */ 1917 if (xen_start_info->mfn_list < __START_KERNEL_map) 1918 max_pfn_mapped = xen_start_info->first_p2m_pfn; 1919 else 1920 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); 1921 1922 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); 1923 pt_end = pt_base + xen_start_info->nr_pt_frames; 1924 1925 /* Zap identity mapping */ 1926 init_level4_pgt[0] = __pgd(0); 1927 1928 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1929 /* Pre-constructed entries are in pfn, so convert to mfn */ 1930 /* L4[272] -> level3_ident_pgt 1931 * L4[511] -> level3_kernel_pgt */ 1932 convert_pfn_mfn(init_level4_pgt); 1933 1934 /* L3_i[0] -> level2_ident_pgt */ 1935 convert_pfn_mfn(level3_ident_pgt); 1936 /* L3_k[510] -> level2_kernel_pgt 1937 * L3_k[511] -> level2_fixmap_pgt */ 1938 convert_pfn_mfn(level3_kernel_pgt); 1939 1940 /* L3_k[511][506] -> level1_fixmap_pgt */ 1941 convert_pfn_mfn(level2_fixmap_pgt); 1942 } 1943 /* We get [511][511] and have Xen's version of level2_kernel_pgt */ 1944 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); 1945 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); 1946 1947 addr[0] = (unsigned long)pgd; 1948 addr[1] = (unsigned long)l3; 1949 addr[2] = (unsigned long)l2; 1950 /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: 1951 * Both L4[272][0] and L4[511][510] have entries that point to the same 1952 * L2 (PMD) tables. Meaning that if you modify it in __va space 1953 * it will be also modified in the __ka space! (But if you just 1954 * modify the PMD table to point to other PTE's or none, then you 1955 * are OK - which is what cleanup_highmap does) */ 1956 copy_page(level2_ident_pgt, l2); 1957 /* Graft it onto L4[511][510] */ 1958 copy_page(level2_kernel_pgt, l2); 1959 1960 /* Copy the initial P->M table mappings if necessary. */ 1961 i = pgd_index(xen_start_info->mfn_list); 1962 if (i && i < pgd_index(__START_KERNEL_map)) 1963 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; 1964 1965 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1966 /* Make pagetable pieces RO */ 1967 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1968 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1969 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1970 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1971 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); 1972 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1973 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); 1974 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); 1975 1976 /* Pin down new L4 */ 1977 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, 1978 PFN_DOWN(__pa_symbol(init_level4_pgt))); 1979 1980 /* Unpin Xen-provided one */ 1981 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1982 1983 /* 1984 * At this stage there can be no user pgd, and no page 1985 * structure to attach it to, so make sure we just set kernel 1986 * pgd. 1987 */ 1988 xen_mc_batch(); 1989 __xen_write_cr3(true, __pa(init_level4_pgt)); 1990 xen_mc_issue(PARAVIRT_LAZY_CPU); 1991 } else 1992 native_write_cr3(__pa(init_level4_pgt)); 1993 1994 /* We can't that easily rip out L3 and L2, as the Xen pagetables are 1995 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for 1996 * the initial domain. For guests using the toolstack, they are in: 1997 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only 1998 * rip out the [L4] (pgd), but for guests we shave off three pages. 1999 */ 2000 for (i = 0; i < ARRAY_SIZE(addr); i++) 2001 check_pt_base(&pt_base, &pt_end, addr[i]); 2002 2003 /* Our (by three pages) smaller Xen pagetable that we are using */ 2004 xen_pt_base = PFN_PHYS(pt_base); 2005 xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; 2006 memblock_reserve(xen_pt_base, xen_pt_size); 2007 2008 /* Revector the xen_start_info */ 2009 xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); 2010 } 2011 2012 /* 2013 * Read a value from a physical address. 2014 */ 2015 static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) 2016 { 2017 unsigned long *vaddr; 2018 unsigned long val; 2019 2020 vaddr = early_memremap_ro(addr, sizeof(val)); 2021 val = *vaddr; 2022 early_memunmap(vaddr, sizeof(val)); 2023 return val; 2024 } 2025 2026 /* 2027 * Translate a virtual address to a physical one without relying on mapped 2028 * page tables. Don't rely on big pages being aligned in (guest) physical 2029 * space! 2030 */ 2031 static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) 2032 { 2033 phys_addr_t pa; 2034 pgd_t pgd; 2035 pud_t pud; 2036 pmd_t pmd; 2037 pte_t pte; 2038 2039 pa = read_cr3(); 2040 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * 2041 sizeof(pgd))); 2042 if (!pgd_present(pgd)) 2043 return 0; 2044 2045 pa = pgd_val(pgd) & PTE_PFN_MASK; 2046 pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * 2047 sizeof(pud))); 2048 if (!pud_present(pud)) 2049 return 0; 2050 pa = pud_val(pud) & PTE_PFN_MASK; 2051 if (pud_large(pud)) 2052 return pa + (vaddr & ~PUD_MASK); 2053 2054 pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * 2055 sizeof(pmd))); 2056 if (!pmd_present(pmd)) 2057 return 0; 2058 pa = pmd_val(pmd) & PTE_PFN_MASK; 2059 if (pmd_large(pmd)) 2060 return pa + (vaddr & ~PMD_MASK); 2061 2062 pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * 2063 sizeof(pte))); 2064 if (!pte_present(pte)) 2065 return 0; 2066 pa = pte_pfn(pte) << PAGE_SHIFT; 2067 2068 return pa | (vaddr & ~PAGE_MASK); 2069 } 2070 2071 /* 2072 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to 2073 * this area. 2074 */ 2075 void __init xen_relocate_p2m(void) 2076 { 2077 phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; 2078 unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; 2079 int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; 2080 pte_t *pt; 2081 pmd_t *pmd; 2082 pud_t *pud; 2083 p4d_t *p4d = NULL; 2084 pgd_t *pgd; 2085 unsigned long *new_p2m; 2086 int save_pud; 2087 2088 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 2089 n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; 2090 n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; 2091 n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; 2092 n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; 2093 if (PTRS_PER_P4D > 1) 2094 n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; 2095 else 2096 n_p4d = 0; 2097 n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; 2098 2099 new_area = xen_find_free_area(PFN_PHYS(n_frames)); 2100 if (!new_area) { 2101 xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); 2102 BUG(); 2103 } 2104 2105 /* 2106 * Setup the page tables for addressing the new p2m list. 2107 * We have asked the hypervisor to map the p2m list at the user address 2108 * PUD_SIZE. It may have done so, or it may have used a kernel space 2109 * address depending on the Xen version. 2110 * To avoid any possible virtual address collision, just use 2111 * 2 * PUD_SIZE for the new area. 2112 */ 2113 p4d_phys = new_area; 2114 pud_phys = p4d_phys + PFN_PHYS(n_p4d); 2115 pmd_phys = pud_phys + PFN_PHYS(n_pud); 2116 pt_phys = pmd_phys + PFN_PHYS(n_pmd); 2117 p2m_pfn = PFN_DOWN(pt_phys) + n_pt; 2118 2119 pgd = __va(read_cr3()); 2120 new_p2m = (unsigned long *)(2 * PGDIR_SIZE); 2121 idx_p4d = 0; 2122 save_pud = n_pud; 2123 do { 2124 if (n_p4d > 0) { 2125 p4d = early_memremap(p4d_phys, PAGE_SIZE); 2126 clear_page(p4d); 2127 n_pud = min(save_pud, PTRS_PER_P4D); 2128 } 2129 for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { 2130 pud = early_memremap(pud_phys, PAGE_SIZE); 2131 clear_page(pud); 2132 for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); 2133 idx_pmd++) { 2134 pmd = early_memremap(pmd_phys, PAGE_SIZE); 2135 clear_page(pmd); 2136 for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); 2137 idx_pt++) { 2138 pt = early_memremap(pt_phys, PAGE_SIZE); 2139 clear_page(pt); 2140 for (idx_pte = 0; 2141 idx_pte < min(n_pte, PTRS_PER_PTE); 2142 idx_pte++) { 2143 set_pte(pt + idx_pte, 2144 pfn_pte(p2m_pfn, PAGE_KERNEL)); 2145 p2m_pfn++; 2146 } 2147 n_pte -= PTRS_PER_PTE; 2148 early_memunmap(pt, PAGE_SIZE); 2149 make_lowmem_page_readonly(__va(pt_phys)); 2150 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, 2151 PFN_DOWN(pt_phys)); 2152 set_pmd(pmd + idx_pt, 2153 __pmd(_PAGE_TABLE | pt_phys)); 2154 pt_phys += PAGE_SIZE; 2155 } 2156 n_pt -= PTRS_PER_PMD; 2157 early_memunmap(pmd, PAGE_SIZE); 2158 make_lowmem_page_readonly(__va(pmd_phys)); 2159 pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, 2160 PFN_DOWN(pmd_phys)); 2161 set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); 2162 pmd_phys += PAGE_SIZE; 2163 } 2164 n_pmd -= PTRS_PER_PUD; 2165 early_memunmap(pud, PAGE_SIZE); 2166 make_lowmem_page_readonly(__va(pud_phys)); 2167 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); 2168 if (n_p4d > 0) 2169 set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); 2170 else 2171 set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); 2172 pud_phys += PAGE_SIZE; 2173 } 2174 if (n_p4d > 0) { 2175 save_pud -= PTRS_PER_P4D; 2176 early_memunmap(p4d, PAGE_SIZE); 2177 make_lowmem_page_readonly(__va(p4d_phys)); 2178 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); 2179 set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); 2180 p4d_phys += PAGE_SIZE; 2181 } 2182 } while (++idx_p4d < n_p4d); 2183 2184 /* Now copy the old p2m info to the new area. */ 2185 memcpy(new_p2m, xen_p2m_addr, size); 2186 xen_p2m_addr = new_p2m; 2187 2188 /* Release the old p2m list and set new list info. */ 2189 p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); 2190 BUG_ON(!p2m_pfn); 2191 p2m_pfn_end = p2m_pfn + PFN_DOWN(size); 2192 2193 if (xen_start_info->mfn_list < __START_KERNEL_map) { 2194 pfn = xen_start_info->first_p2m_pfn; 2195 pfn_end = xen_start_info->first_p2m_pfn + 2196 xen_start_info->nr_p2m_frames; 2197 set_pgd(pgd + 1, __pgd(0)); 2198 } else { 2199 pfn = p2m_pfn; 2200 pfn_end = p2m_pfn_end; 2201 } 2202 2203 memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); 2204 while (pfn < pfn_end) { 2205 if (pfn == p2m_pfn) { 2206 pfn = p2m_pfn_end; 2207 continue; 2208 } 2209 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 2210 pfn++; 2211 } 2212 2213 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; 2214 xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); 2215 xen_start_info->nr_p2m_frames = n_frames; 2216 } 2217 2218 #else /* !CONFIG_X86_64 */ 2219 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 2220 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); 2221 2222 static void __init xen_write_cr3_init(unsigned long cr3) 2223 { 2224 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 2225 2226 BUG_ON(read_cr3() != __pa(initial_page_table)); 2227 BUG_ON(cr3 != __pa(swapper_pg_dir)); 2228 2229 /* 2230 * We are switching to swapper_pg_dir for the first time (from 2231 * initial_page_table) and therefore need to mark that page 2232 * read-only and then pin it. 2233 * 2234 * Xen disallows sharing of kernel PMDs for PAE 2235 * guests. Therefore we must copy the kernel PMD from 2236 * initial_page_table into a new kernel PMD to be used in 2237 * swapper_pg_dir. 2238 */ 2239 swapper_kernel_pmd = 2240 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 2241 copy_page(swapper_kernel_pmd, initial_kernel_pmd); 2242 swapper_pg_dir[KERNEL_PGD_BOUNDARY] = 2243 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); 2244 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); 2245 2246 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 2247 xen_write_cr3(cr3); 2248 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); 2249 2250 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, 2251 PFN_DOWN(__pa(initial_page_table))); 2252 set_page_prot(initial_page_table, PAGE_KERNEL); 2253 set_page_prot(initial_kernel_pmd, PAGE_KERNEL); 2254 2255 pv_mmu_ops.write_cr3 = &xen_write_cr3; 2256 } 2257 2258 /* 2259 * For 32 bit domains xen_start_info->pt_base is the pgd address which might be 2260 * not the first page table in the page table pool. 2261 * Iterate through the initial page tables to find the real page table base. 2262 */ 2263 static phys_addr_t xen_find_pt_base(pmd_t *pmd) 2264 { 2265 phys_addr_t pt_base, paddr; 2266 unsigned pmdidx; 2267 2268 pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); 2269 2270 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) 2271 if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { 2272 paddr = m2p(pmd[pmdidx].pmd); 2273 pt_base = min(pt_base, paddr); 2274 } 2275 2276 return pt_base; 2277 } 2278 2279 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 2280 { 2281 pmd_t *kernel_pmd; 2282 2283 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 2284 2285 xen_pt_base = xen_find_pt_base(kernel_pmd); 2286 xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; 2287 2288 initial_kernel_pmd = 2289 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 2290 2291 max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); 2292 2293 copy_page(initial_kernel_pmd, kernel_pmd); 2294 2295 xen_map_identity_early(initial_kernel_pmd, max_pfn); 2296 2297 copy_page(initial_page_table, pgd); 2298 initial_page_table[KERNEL_PGD_BOUNDARY] = 2299 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); 2300 2301 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); 2302 set_page_prot(initial_page_table, PAGE_KERNEL_RO); 2303 set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 2304 2305 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 2306 2307 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, 2308 PFN_DOWN(__pa(initial_page_table))); 2309 xen_write_cr3(__pa(initial_page_table)); 2310 2311 memblock_reserve(xen_pt_base, xen_pt_size); 2312 } 2313 #endif /* CONFIG_X86_64 */ 2314 2315 void __init xen_reserve_special_pages(void) 2316 { 2317 phys_addr_t paddr; 2318 2319 memblock_reserve(__pa(xen_start_info), PAGE_SIZE); 2320 if (xen_start_info->store_mfn) { 2321 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); 2322 memblock_reserve(paddr, PAGE_SIZE); 2323 } 2324 if (!xen_initial_domain()) { 2325 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); 2326 memblock_reserve(paddr, PAGE_SIZE); 2327 } 2328 } 2329 2330 void __init xen_pt_check_e820(void) 2331 { 2332 if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { 2333 xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); 2334 BUG(); 2335 } 2336 } 2337 2338 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; 2339 2340 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 2341 { 2342 pte_t pte; 2343 2344 phys >>= PAGE_SHIFT; 2345 2346 switch (idx) { 2347 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: 2348 case FIX_RO_IDT: 2349 #ifdef CONFIG_X86_32 2350 case FIX_WP_TEST: 2351 # ifdef CONFIG_HIGHMEM 2352 case FIX_KMAP_BEGIN ... FIX_KMAP_END: 2353 # endif 2354 #elif defined(CONFIG_X86_VSYSCALL_EMULATION) 2355 case VSYSCALL_PAGE: 2356 #endif 2357 case FIX_TEXT_POKE0: 2358 case FIX_TEXT_POKE1: 2359 case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: 2360 /* All local page mappings */ 2361 pte = pfn_pte(phys, prot); 2362 break; 2363 2364 #ifdef CONFIG_X86_LOCAL_APIC 2365 case FIX_APIC_BASE: /* maps dummy local APIC */ 2366 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 2367 break; 2368 #endif 2369 2370 #ifdef CONFIG_X86_IO_APIC 2371 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: 2372 /* 2373 * We just don't map the IO APIC - all access is via 2374 * hypercalls. Keep the address in the pte for reference. 2375 */ 2376 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 2377 break; 2378 #endif 2379 2380 case FIX_PARAVIRT_BOOTMAP: 2381 /* This is an MFN, but it isn't an IO mapping from the 2382 IO domain */ 2383 pte = mfn_pte(phys, prot); 2384 break; 2385 2386 default: 2387 /* By default, set_fixmap is used for hardware mappings */ 2388 pte = mfn_pte(phys, prot); 2389 break; 2390 } 2391 2392 __native_set_fixmap(idx, pte); 2393 2394 #ifdef CONFIG_X86_VSYSCALL_EMULATION 2395 /* Replicate changes to map the vsyscall page into the user 2396 pagetable vsyscall mapping. */ 2397 if (idx == VSYSCALL_PAGE) { 2398 unsigned long vaddr = __fix_to_virt(idx); 2399 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); 2400 } 2401 #endif 2402 } 2403 2404 static void __init xen_post_allocator_init(void) 2405 { 2406 if (xen_feature(XENFEAT_auto_translated_physmap)) 2407 return; 2408 2409 pv_mmu_ops.set_pte = xen_set_pte; 2410 pv_mmu_ops.set_pmd = xen_set_pmd; 2411 pv_mmu_ops.set_pud = xen_set_pud; 2412 #if CONFIG_PGTABLE_LEVELS >= 4 2413 pv_mmu_ops.set_p4d = xen_set_p4d; 2414 #endif 2415 2416 /* This will work as long as patching hasn't happened yet 2417 (which it hasn't) */ 2418 pv_mmu_ops.alloc_pte = xen_alloc_pte; 2419 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 2420 pv_mmu_ops.release_pte = xen_release_pte; 2421 pv_mmu_ops.release_pmd = xen_release_pmd; 2422 #if CONFIG_PGTABLE_LEVELS >= 4 2423 pv_mmu_ops.alloc_pud = xen_alloc_pud; 2424 pv_mmu_ops.release_pud = xen_release_pud; 2425 #endif 2426 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); 2427 2428 #ifdef CONFIG_X86_64 2429 pv_mmu_ops.write_cr3 = &xen_write_cr3; 2430 SetPagePinned(virt_to_page(level3_user_vsyscall)); 2431 #endif 2432 xen_mark_init_mm_pinned(); 2433 } 2434 2435 static void xen_leave_lazy_mmu(void) 2436 { 2437 preempt_disable(); 2438 xen_mc_flush(); 2439 paravirt_leave_lazy_mmu(); 2440 preempt_enable(); 2441 } 2442 2443 static const struct pv_mmu_ops xen_mmu_ops __initconst = { 2444 .read_cr2 = xen_read_cr2, 2445 .write_cr2 = xen_write_cr2, 2446 2447 .read_cr3 = xen_read_cr3, 2448 .write_cr3 = xen_write_cr3_init, 2449 2450 .flush_tlb_user = xen_flush_tlb, 2451 .flush_tlb_kernel = xen_flush_tlb, 2452 .flush_tlb_single = xen_flush_tlb_single, 2453 .flush_tlb_others = xen_flush_tlb_others, 2454 2455 .pte_update = paravirt_nop, 2456 2457 .pgd_alloc = xen_pgd_alloc, 2458 .pgd_free = xen_pgd_free, 2459 2460 .alloc_pte = xen_alloc_pte_init, 2461 .release_pte = xen_release_pte_init, 2462 .alloc_pmd = xen_alloc_pmd_init, 2463 .release_pmd = xen_release_pmd_init, 2464 2465 .set_pte = xen_set_pte_init, 2466 .set_pte_at = xen_set_pte_at, 2467 .set_pmd = xen_set_pmd_hyper, 2468 2469 .ptep_modify_prot_start = __ptep_modify_prot_start, 2470 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 2471 2472 .pte_val = PV_CALLEE_SAVE(xen_pte_val), 2473 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), 2474 2475 .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), 2476 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), 2477 2478 #ifdef CONFIG_X86_PAE 2479 .set_pte_atomic = xen_set_pte_atomic, 2480 .pte_clear = xen_pte_clear, 2481 .pmd_clear = xen_pmd_clear, 2482 #endif /* CONFIG_X86_PAE */ 2483 .set_pud = xen_set_pud_hyper, 2484 2485 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 2486 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 2487 2488 #if CONFIG_PGTABLE_LEVELS >= 4 2489 .pud_val = PV_CALLEE_SAVE(xen_pud_val), 2490 .make_pud = PV_CALLEE_SAVE(xen_make_pud), 2491 .set_p4d = xen_set_p4d_hyper, 2492 2493 .alloc_pud = xen_alloc_pmd_init, 2494 .release_pud = xen_release_pmd_init, 2495 #endif /* CONFIG_PGTABLE_LEVELS == 4 */ 2496 2497 .activate_mm = xen_activate_mm, 2498 .dup_mmap = xen_dup_mmap, 2499 .exit_mmap = xen_exit_mmap, 2500 2501 .lazy_mode = { 2502 .enter = paravirt_enter_lazy_mmu, 2503 .leave = xen_leave_lazy_mmu, 2504 .flush = paravirt_flush_lazy_mmu, 2505 }, 2506 2507 .set_fixmap = xen_set_fixmap, 2508 }; 2509 2510 void __init xen_init_mmu_ops(void) 2511 { 2512 x86_init.paging.pagetable_init = xen_pagetable_init; 2513 2514 if (xen_feature(XENFEAT_auto_translated_physmap)) 2515 return; 2516 2517 pv_mmu_ops = xen_mmu_ops; 2518 2519 memset(dummy_mapping, 0xff, PAGE_SIZE); 2520 } 2521 2522 /* Protected by xen_reservation_lock. */ 2523 #define MAX_CONTIG_ORDER 9 /* 2MB */ 2524 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; 2525 2526 #define VOID_PTE (mfn_pte(0, __pgprot(0))) 2527 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, 2528 unsigned long *in_frames, 2529 unsigned long *out_frames) 2530 { 2531 int i; 2532 struct multicall_space mcs; 2533 2534 xen_mc_batch(); 2535 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { 2536 mcs = __xen_mc_entry(0); 2537 2538 if (in_frames) 2539 in_frames[i] = virt_to_mfn(vaddr); 2540 2541 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2542 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2543 2544 if (out_frames) 2545 out_frames[i] = virt_to_pfn(vaddr); 2546 } 2547 xen_mc_issue(0); 2548 } 2549 2550 /* 2551 * Update the pfn-to-mfn mappings for a virtual address range, either to 2552 * point to an array of mfns, or contiguously from a single starting 2553 * mfn. 2554 */ 2555 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, 2556 unsigned long *mfns, 2557 unsigned long first_mfn) 2558 { 2559 unsigned i, limit; 2560 unsigned long mfn; 2561 2562 xen_mc_batch(); 2563 2564 limit = 1u << order; 2565 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { 2566 struct multicall_space mcs; 2567 unsigned flags; 2568 2569 mcs = __xen_mc_entry(0); 2570 if (mfns) 2571 mfn = mfns[i]; 2572 else 2573 mfn = first_mfn + i; 2574 2575 if (i < (limit - 1)) 2576 flags = 0; 2577 else { 2578 if (order == 0) 2579 flags = UVMF_INVLPG | UVMF_ALL; 2580 else 2581 flags = UVMF_TLB_FLUSH | UVMF_ALL; 2582 } 2583 2584 MULTI_update_va_mapping(mcs.mc, vaddr, 2585 mfn_pte(mfn, PAGE_KERNEL), flags); 2586 2587 set_phys_to_machine(virt_to_pfn(vaddr), mfn); 2588 } 2589 2590 xen_mc_issue(0); 2591 } 2592 2593 /* 2594 * Perform the hypercall to exchange a region of our pfns to point to 2595 * memory with the required contiguous alignment. Takes the pfns as 2596 * input, and populates mfns as output. 2597 * 2598 * Returns a success code indicating whether the hypervisor was able to 2599 * satisfy the request or not. 2600 */ 2601 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, 2602 unsigned long *pfns_in, 2603 unsigned long extents_out, 2604 unsigned int order_out, 2605 unsigned long *mfns_out, 2606 unsigned int address_bits) 2607 { 2608 long rc; 2609 int success; 2610 2611 struct xen_memory_exchange exchange = { 2612 .in = { 2613 .nr_extents = extents_in, 2614 .extent_order = order_in, 2615 .extent_start = pfns_in, 2616 .domid = DOMID_SELF 2617 }, 2618 .out = { 2619 .nr_extents = extents_out, 2620 .extent_order = order_out, 2621 .extent_start = mfns_out, 2622 .address_bits = address_bits, 2623 .domid = DOMID_SELF 2624 } 2625 }; 2626 2627 BUG_ON(extents_in << order_in != extents_out << order_out); 2628 2629 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); 2630 success = (exchange.nr_exchanged == extents_in); 2631 2632 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); 2633 BUG_ON(success && (rc != 0)); 2634 2635 return success; 2636 } 2637 2638 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, 2639 unsigned int address_bits, 2640 dma_addr_t *dma_handle) 2641 { 2642 unsigned long *in_frames = discontig_frames, out_frame; 2643 unsigned long flags; 2644 int success; 2645 unsigned long vstart = (unsigned long)phys_to_virt(pstart); 2646 2647 /* 2648 * Currently an auto-translated guest will not perform I/O, nor will 2649 * it require PAE page directories below 4GB. Therefore any calls to 2650 * this function are redundant and can be ignored. 2651 */ 2652 2653 if (xen_feature(XENFEAT_auto_translated_physmap)) 2654 return 0; 2655 2656 if (unlikely(order > MAX_CONTIG_ORDER)) 2657 return -ENOMEM; 2658 2659 memset((void *) vstart, 0, PAGE_SIZE << order); 2660 2661 spin_lock_irqsave(&xen_reservation_lock, flags); 2662 2663 /* 1. Zap current PTEs, remembering MFNs. */ 2664 xen_zap_pfn_range(vstart, order, in_frames, NULL); 2665 2666 /* 2. Get a new contiguous memory extent. */ 2667 out_frame = virt_to_pfn(vstart); 2668 success = xen_exchange_memory(1UL << order, 0, in_frames, 2669 1, order, &out_frame, 2670 address_bits); 2671 2672 /* 3. Map the new extent in place of old pages. */ 2673 if (success) 2674 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); 2675 else 2676 xen_remap_exchanged_ptes(vstart, order, in_frames, 0); 2677 2678 spin_unlock_irqrestore(&xen_reservation_lock, flags); 2679 2680 *dma_handle = virt_to_machine(vstart).maddr; 2681 return success ? 0 : -ENOMEM; 2682 } 2683 EXPORT_SYMBOL_GPL(xen_create_contiguous_region); 2684 2685 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) 2686 { 2687 unsigned long *out_frames = discontig_frames, in_frame; 2688 unsigned long flags; 2689 int success; 2690 unsigned long vstart; 2691 2692 if (xen_feature(XENFEAT_auto_translated_physmap)) 2693 return; 2694 2695 if (unlikely(order > MAX_CONTIG_ORDER)) 2696 return; 2697 2698 vstart = (unsigned long)phys_to_virt(pstart); 2699 memset((void *) vstart, 0, PAGE_SIZE << order); 2700 2701 spin_lock_irqsave(&xen_reservation_lock, flags); 2702 2703 /* 1. Find start MFN of contiguous extent. */ 2704 in_frame = virt_to_mfn(vstart); 2705 2706 /* 2. Zap current PTEs. */ 2707 xen_zap_pfn_range(vstart, order, NULL, out_frames); 2708 2709 /* 3. Do the exchange for non-contiguous MFNs. */ 2710 success = xen_exchange_memory(1, order, &in_frame, 1UL << order, 2711 0, out_frames, 0); 2712 2713 /* 4. Map new pages in place of old pages. */ 2714 if (success) 2715 xen_remap_exchanged_ptes(vstart, order, out_frames, 0); 2716 else 2717 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); 2718 2719 spin_unlock_irqrestore(&xen_reservation_lock, flags); 2720 } 2721 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); 2722 2723 #ifdef CONFIG_KEXEC_CORE 2724 phys_addr_t paddr_vmcoreinfo_note(void) 2725 { 2726 if (xen_pv_domain()) 2727 return virt_to_machine(&vmcoreinfo_note).maddr; 2728 else 2729 return __pa_symbol(&vmcoreinfo_note); 2730 } 2731 #endif /* CONFIG_KEXEC_CORE */ 2732