xref: /openbmc/linux/arch/x86/xen/mmu_pv.c (revision a6ca5ac746d104019e76c29e69c2a1fc6dd2b29f)
1 /*
2  * Xen mmu operations
3  *
4  * This file contains the various mmu fetch and update operations.
5  * The most important job they must perform is the mapping between the
6  * domain's pfn and the overall machine mfns.
7  *
8  * Xen allows guests to directly update the pagetable, in a controlled
9  * fashion.  In other words, the guest modifies the same pagetable
10  * that the CPU actually uses, which eliminates the overhead of having
11  * a separate shadow pagetable.
12  *
13  * In order to allow this, it falls on the guest domain to map its
14  * notion of a "physical" pfn - which is just a domain-local linear
15  * address - into a real "machine address" which the CPU's MMU can
16  * use.
17  *
18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19  * inserted directly into the pagetable.  When creating a new
20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
22  * the mfn back into a pfn.
23  *
24  * The other constraint is that all pages which make up a pagetable
25  * must be mapped read-only in the guest.  This prevents uncontrolled
26  * guest updates to the pagetable.  Xen strictly enforces this, and
27  * will disallow any pagetable update which will end up mapping a
28  * pagetable page RW, and will disallow using any writable page as a
29  * pagetable.
30  *
31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
32  * would need to validate the whole pagetable before going on.
33  * Naturally, this is quite slow.  The solution is to "pin" a
34  * pagetable, which enforces all the constraints on the pagetable even
35  * when it is not actively in use.  This means that Xen can be assured
36  * that it is still valid when you do load it into %cr3, and doesn't
37  * need to revalidate it.
38  *
39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40  */
41 #include <linux/sched/mm.h>
42 #include <linux/highmem.h>
43 #include <linux/debugfs.h>
44 #include <linux/bug.h>
45 #include <linux/vmalloc.h>
46 #include <linux/export.h>
47 #include <linux/init.h>
48 #include <linux/gfp.h>
49 #include <linux/memblock.h>
50 #include <linux/seq_file.h>
51 #include <linux/crash_dump.h>
52 #ifdef CONFIG_KEXEC_CORE
53 #include <linux/kexec.h>
54 #endif
55 
56 #include <trace/events/xen.h>
57 
58 #include <asm/pgtable.h>
59 #include <asm/tlbflush.h>
60 #include <asm/fixmap.h>
61 #include <asm/mmu_context.h>
62 #include <asm/setup.h>
63 #include <asm/paravirt.h>
64 #include <asm/e820/api.h>
65 #include <asm/linkage.h>
66 #include <asm/page.h>
67 #include <asm/init.h>
68 #include <asm/pat.h>
69 #include <asm/smp.h>
70 
71 #include <asm/xen/hypercall.h>
72 #include <asm/xen/hypervisor.h>
73 
74 #include <xen/xen.h>
75 #include <xen/page.h>
76 #include <xen/interface/xen.h>
77 #include <xen/interface/hvm/hvm_op.h>
78 #include <xen/interface/version.h>
79 #include <xen/interface/memory.h>
80 #include <xen/hvc-console.h>
81 
82 #include "multicalls.h"
83 #include "mmu.h"
84 #include "debugfs.h"
85 
#if defined(CONFIG_X86_32)
/*
 * Identity map, in addition to the plain kernel map.  It has to be
 * big enough to allocate the page table pages that are needed to
 * allocate the rest.  Each page can map 2MB.
 */
#define LEVEL1_IDENT_ENTRIES	(4 * PTRS_PER_PTE)
static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
#endif /* CONFIG_X86_32 */
#if defined(CONFIG_X86_64)
/* Level-3 (pud) table for the userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */
99 
100 /*
101  * Note about cr3 (pagetable base) values:
102  *
103  * xen_cr3 contains the current logical cr3 value; it contains the
104  * last set cr3.  This may not be the current effective cr3, because
105  * its update may be being lazily deferred.  However, a vcpu looking
106  * at its own cr3 can use this value knowing that it everything will
107  * be self-consistent.
108  *
109  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
110  * hypercall to set the vcpu cr3 is complete (so it may be a little
111  * out of date, but it will never be set early).  If one vcpu is
112  * looking at another vcpu's cr3 value, it should use this variable.
113  */
114 DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
115 DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
116 
117 static phys_addr_t xen_pt_base, xen_pt_size __initdata;
118 
/*
 * First address just past the highest usermode address.
 * STACK_TOP_MAX has a redzone above it, so the value is rounded up to
 * the next PGD boundary.
 */
#define USER_LIMIT	(((STACK_TOP_MAX) + (PGDIR_SIZE - 1)) & PGDIR_MASK)
124 
125 void make_lowmem_page_readonly(void *vaddr)
126 {
127 	pte_t *pte, ptev;
128 	unsigned long address = (unsigned long)vaddr;
129 	unsigned int level;
130 
131 	pte = lookup_address(address, &level);
132 	if (pte == NULL)
133 		return;		/* vaddr missing */
134 
135 	ptev = pte_wrprotect(*pte);
136 
137 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
138 		BUG();
139 }
140 
141 void make_lowmem_page_readwrite(void *vaddr)
142 {
143 	pte_t *pte, ptev;
144 	unsigned long address = (unsigned long)vaddr;
145 	unsigned int level;
146 
147 	pte = lookup_address(address, &level);
148 	if (pte == NULL)
149 		return;		/* vaddr missing */
150 
151 	ptev = pte_mkwrite(*pte);
152 
153 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
154 		BUG();
155 }
156 
157 
158 static bool xen_page_pinned(void *ptr)
159 {
160 	struct page *page = virt_to_page(ptr);
161 
162 	return PagePinned(page);
163 }
164 
165 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
166 {
167 	struct multicall_space mcs;
168 	struct mmu_update *u;
169 
170 	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
171 
172 	mcs = xen_mc_entry(sizeof(*u));
173 	u = mcs.args;
174 
175 	/* ptep might be kmapped when using 32-bit HIGHPTE */
176 	u->ptr = virt_to_machine(ptep).maddr;
177 	u->val = pte_val_ma(pteval);
178 
179 	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
180 
181 	xen_mc_issue(PARAVIRT_LAZY_MMU);
182 }
183 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
184 
185 static void xen_extend_mmu_update(const struct mmu_update *update)
186 {
187 	struct multicall_space mcs;
188 	struct mmu_update *u;
189 
190 	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
191 
192 	if (mcs.mc != NULL) {
193 		mcs.mc->args[1]++;
194 	} else {
195 		mcs = __xen_mc_entry(sizeof(*u));
196 		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
197 	}
198 
199 	u = mcs.args;
200 	*u = *update;
201 }
202 
203 static void xen_extend_mmuext_op(const struct mmuext_op *op)
204 {
205 	struct multicall_space mcs;
206 	struct mmuext_op *u;
207 
208 	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
209 
210 	if (mcs.mc != NULL) {
211 		mcs.mc->args[1]++;
212 	} else {
213 		mcs = __xen_mc_entry(sizeof(*u));
214 		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
215 	}
216 
217 	u = mcs.args;
218 	*u = *op;
219 }
220 
221 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
222 {
223 	struct mmu_update u;
224 
225 	preempt_disable();
226 
227 	xen_mc_batch();
228 
229 	/* ptr may be ioremapped for 64-bit pagetable setup */
230 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
231 	u.val = pmd_val_ma(val);
232 	xen_extend_mmu_update(&u);
233 
234 	xen_mc_issue(PARAVIRT_LAZY_MMU);
235 
236 	preempt_enable();
237 }
238 
239 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
240 {
241 	trace_xen_mmu_set_pmd(ptr, val);
242 
243 	/* If page is not pinned, we can just update the entry
244 	   directly */
245 	if (!xen_page_pinned(ptr)) {
246 		*ptr = val;
247 		return;
248 	}
249 
250 	xen_set_pmd_hyper(ptr, val);
251 }
252 
253 /*
254  * Associate a virtual page frame with a given physical page frame
255  * and protection flags for that frame.
256  */
257 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
258 {
259 	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
260 }
261 
262 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
263 {
264 	struct mmu_update u;
265 
266 	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
267 		return false;
268 
269 	xen_mc_batch();
270 
271 	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
272 	u.val = pte_val_ma(pteval);
273 	xen_extend_mmu_update(&u);
274 
275 	xen_mc_issue(PARAVIRT_LAZY_MMU);
276 
277 	return true;
278 }
279 
280 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
281 {
282 	if (!xen_batched_set_pte(ptep, pteval)) {
283 		/*
284 		 * Could call native_set_pte() here and trap and
285 		 * emulate the PTE write but with 32-bit guests this
286 		 * needs two traps (one for each of the two 32-bit
287 		 * words in the PTE) so do one hypercall directly
288 		 * instead.
289 		 */
290 		struct mmu_update u;
291 
292 		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
293 		u.val = pte_val_ma(pteval);
294 		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
295 	}
296 }
297 
298 static void xen_set_pte(pte_t *ptep, pte_t pteval)
299 {
300 	trace_xen_mmu_set_pte(ptep, pteval);
301 	__xen_set_pte(ptep, pteval);
302 }
303 
304 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
305 		    pte_t *ptep, pte_t pteval)
306 {
307 	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
308 	__xen_set_pte(ptep, pteval);
309 }
310 
311 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
312 				 unsigned long addr, pte_t *ptep)
313 {
314 	/* Just return the pte as-is.  We preserve the bits on commit */
315 	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
316 	return *ptep;
317 }
318 
319 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
320 				 pte_t *ptep, pte_t pte)
321 {
322 	struct mmu_update u;
323 
324 	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
325 	xen_mc_batch();
326 
327 	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
328 	u.val = pte_val_ma(pte);
329 	xen_extend_mmu_update(&u);
330 
331 	xen_mc_issue(PARAVIRT_LAZY_MMU);
332 }
333 
334 /* Assume pteval_t is equivalent to all the other *val_t types. */
335 static pteval_t pte_mfn_to_pfn(pteval_t val)
336 {
337 	if (val & _PAGE_PRESENT) {
338 		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
339 		unsigned long pfn = mfn_to_pfn(mfn);
340 
341 		pteval_t flags = val & PTE_FLAGS_MASK;
342 		if (unlikely(pfn == ~0))
343 			val = flags & ~_PAGE_PRESENT;
344 		else
345 			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
346 	}
347 
348 	return val;
349 }
350 
351 static pteval_t pte_pfn_to_mfn(pteval_t val)
352 {
353 	if (val & _PAGE_PRESENT) {
354 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
355 		pteval_t flags = val & PTE_FLAGS_MASK;
356 		unsigned long mfn;
357 
358 		if (!xen_feature(XENFEAT_auto_translated_physmap))
359 			mfn = __pfn_to_mfn(pfn);
360 		else
361 			mfn = pfn;
362 		/*
363 		 * If there's no mfn for the pfn, then just create an
364 		 * empty non-present pte.  Unfortunately this loses
365 		 * information about the original pfn, so
366 		 * pte_mfn_to_pfn is asymmetric.
367 		 */
368 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
369 			mfn = 0;
370 			flags = 0;
371 		} else
372 			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
373 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
374 	}
375 
376 	return val;
377 }
378 
379 __visible pteval_t xen_pte_val(pte_t pte)
380 {
381 	pteval_t pteval = pte.pte;
382 
383 	return pte_mfn_to_pfn(pteval);
384 }
385 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
386 
387 __visible pgdval_t xen_pgd_val(pgd_t pgd)
388 {
389 	return pte_mfn_to_pfn(pgd.pgd);
390 }
391 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
392 
393 __visible pte_t xen_make_pte(pteval_t pte)
394 {
395 	pte = pte_pfn_to_mfn(pte);
396 
397 	return native_make_pte(pte);
398 }
399 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
400 
401 __visible pgd_t xen_make_pgd(pgdval_t pgd)
402 {
403 	pgd = pte_pfn_to_mfn(pgd);
404 	return native_make_pgd(pgd);
405 }
406 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
407 
408 __visible pmdval_t xen_pmd_val(pmd_t pmd)
409 {
410 	return pte_mfn_to_pfn(pmd.pmd);
411 }
412 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
413 
414 static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
415 {
416 	struct mmu_update u;
417 
418 	preempt_disable();
419 
420 	xen_mc_batch();
421 
422 	/* ptr may be ioremapped for 64-bit pagetable setup */
423 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
424 	u.val = pud_val_ma(val);
425 	xen_extend_mmu_update(&u);
426 
427 	xen_mc_issue(PARAVIRT_LAZY_MMU);
428 
429 	preempt_enable();
430 }
431 
432 static void xen_set_pud(pud_t *ptr, pud_t val)
433 {
434 	trace_xen_mmu_set_pud(ptr, val);
435 
436 	/* If page is not pinned, we can just update the entry
437 	   directly */
438 	if (!xen_page_pinned(ptr)) {
439 		*ptr = val;
440 		return;
441 	}
442 
443 	xen_set_pud_hyper(ptr, val);
444 }
445 
446 #ifdef CONFIG_X86_PAE
447 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
448 {
449 	trace_xen_mmu_set_pte_atomic(ptep, pte);
450 	set_64bit((u64 *)ptep, native_pte_val(pte));
451 }
452 
453 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
454 {
455 	trace_xen_mmu_pte_clear(mm, addr, ptep);
456 	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
457 		native_pte_clear(mm, addr, ptep);
458 }
459 
460 static void xen_pmd_clear(pmd_t *pmdp)
461 {
462 	trace_xen_mmu_pmd_clear(pmdp);
463 	set_pmd(pmdp, __pmd(0));
464 }
465 #endif	/* CONFIG_X86_PAE */
466 
467 __visible pmd_t xen_make_pmd(pmdval_t pmd)
468 {
469 	pmd = pte_pfn_to_mfn(pmd);
470 	return native_make_pmd(pmd);
471 }
472 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
473 
474 #if CONFIG_PGTABLE_LEVELS == 4
475 __visible pudval_t xen_pud_val(pud_t pud)
476 {
477 	return pte_mfn_to_pfn(pud.pud);
478 }
479 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
480 
481 __visible pud_t xen_make_pud(pudval_t pud)
482 {
483 	pud = pte_pfn_to_mfn(pud);
484 
485 	return native_make_pud(pud);
486 }
487 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
488 
489 static pgd_t *xen_get_user_pgd(pgd_t *pgd)
490 {
491 	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
492 	unsigned offset = pgd - pgd_page;
493 	pgd_t *user_ptr = NULL;
494 
495 	if (offset < pgd_index(USER_LIMIT)) {
496 		struct page *page = virt_to_page(pgd_page);
497 		user_ptr = (pgd_t *)page->private;
498 		if (user_ptr)
499 			user_ptr += offset;
500 	}
501 
502 	return user_ptr;
503 }
504 
505 static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
506 {
507 	struct mmu_update u;
508 
509 	u.ptr = virt_to_machine(ptr).maddr;
510 	u.val = p4d_val_ma(val);
511 	xen_extend_mmu_update(&u);
512 }
513 
514 /*
515  * Raw hypercall-based set_p4d, intended for in early boot before
516  * there's a page structure.  This implies:
517  *  1. The only existing pagetable is the kernel's
518  *  2. It is always pinned
519  *  3. It has no user pagetable attached to it
520  */
521 static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
522 {
523 	preempt_disable();
524 
525 	xen_mc_batch();
526 
527 	__xen_set_p4d_hyper(ptr, val);
528 
529 	xen_mc_issue(PARAVIRT_LAZY_MMU);
530 
531 	preempt_enable();
532 }
533 
534 static void xen_set_p4d(p4d_t *ptr, p4d_t val)
535 {
536 	pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
537 	pgd_t pgd_val;
538 
539 	trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
540 
541 	/* If page is not pinned, we can just update the entry
542 	   directly */
543 	if (!xen_page_pinned(ptr)) {
544 		*ptr = val;
545 		if (user_ptr) {
546 			WARN_ON(xen_page_pinned(user_ptr));
547 			pgd_val.pgd = p4d_val_ma(val);
548 			*user_ptr = pgd_val;
549 		}
550 		return;
551 	}
552 
553 	/* If it's pinned, then we can at least batch the kernel and
554 	   user updates together. */
555 	xen_mc_batch();
556 
557 	__xen_set_p4d_hyper(ptr, val);
558 	if (user_ptr)
559 		__xen_set_p4d_hyper((p4d_t *)user_ptr, val);
560 
561 	xen_mc_issue(PARAVIRT_LAZY_MMU);
562 }
563 #endif	/* CONFIG_PGTABLE_LEVELS == 4 */
564 
565 static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
566 		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
567 		bool last, unsigned long limit)
568 {
569 	int i, nr, flush = 0;
570 
571 	nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
572 	for (i = 0; i < nr; i++) {
573 		if (!pmd_none(pmd[i]))
574 			flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
575 	}
576 	return flush;
577 }
578 
579 static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
580 		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
581 		bool last, unsigned long limit)
582 {
583 	int i, nr, flush = 0;
584 
585 	nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
586 	for (i = 0; i < nr; i++) {
587 		pmd_t *pmd;
588 
589 		if (pud_none(pud[i]))
590 			continue;
591 
592 		pmd = pmd_offset(&pud[i], 0);
593 		if (PTRS_PER_PMD > 1)
594 			flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
595 		flush |= xen_pmd_walk(mm, pmd, func,
596 				last && i == nr - 1, limit);
597 	}
598 	return flush;
599 }
600 
601 static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
602 		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
603 		bool last, unsigned long limit)
604 {
605 	int i, nr, flush = 0;
606 
607 	nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
608 	for (i = 0; i < nr; i++) {
609 		pud_t *pud;
610 
611 		if (p4d_none(p4d[i]))
612 			continue;
613 
614 		pud = pud_offset(&p4d[i], 0);
615 		if (PTRS_PER_PUD > 1)
616 			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
617 		flush |= xen_pud_walk(mm, pud, func,
618 				last && i == nr - 1, limit);
619 	}
620 	return flush;
621 }
622 
623 /*
624  * (Yet another) pagetable walker.  This one is intended for pinning a
625  * pagetable.  This means that it walks a pagetable and calls the
626  * callback function on each page it finds making up the page table,
627  * at every level.  It walks the entire pagetable, but it only bothers
628  * pinning pte pages which are below limit.  In the normal case this
629  * will be STACK_TOP_MAX, but at boot we need to pin up to
630  * FIXADDR_TOP.
631  *
632  * For 32-bit the important bit is that we don't pin beyond there,
633  * because then we start getting into Xen's ptes.
634  *
635  * For 64-bit, we must skip the Xen hole in the middle of the address
636  * space, just after the big x86-64 virtual hole.
637  */
638 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
639 			  int (*func)(struct mm_struct *mm, struct page *,
640 				      enum pt_level),
641 			  unsigned long limit)
642 {
643 	int i, nr, flush = 0;
644 	unsigned hole_low, hole_high;
645 
646 	/* The limit is the last byte to be touched */
647 	limit--;
648 	BUG_ON(limit >= FIXADDR_TOP);
649 
650 	if (xen_feature(XENFEAT_auto_translated_physmap))
651 		return 0;
652 
653 	/*
654 	 * 64-bit has a great big hole in the middle of the address
655 	 * space, which contains the Xen mappings.  On 32-bit these
656 	 * will end up making a zero-sized hole and so is a no-op.
657 	 */
658 	hole_low = pgd_index(USER_LIMIT);
659 	hole_high = pgd_index(PAGE_OFFSET);
660 
661 	nr = pgd_index(limit) + 1;
662 	for (i = 0; i < nr; i++) {
663 		p4d_t *p4d;
664 
665 		if (i >= hole_low && i < hole_high)
666 			continue;
667 
668 		if (pgd_none(pgd[i]))
669 			continue;
670 
671 		p4d = p4d_offset(&pgd[i], 0);
672 		if (PTRS_PER_P4D > 1)
673 			flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
674 		flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
675 	}
676 
677 	/* Do the top level last, so that the callbacks can use it as
678 	   a cue to do final things like tlb flushes. */
679 	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
680 
681 	return flush;
682 }
683 
684 static int xen_pgd_walk(struct mm_struct *mm,
685 			int (*func)(struct mm_struct *mm, struct page *,
686 				    enum pt_level),
687 			unsigned long limit)
688 {
689 	return __xen_pgd_walk(mm, mm->pgd, func, limit);
690 }
691 
692 /* If we're using split pte locks, then take the page's lock and
693    return a pointer to it.  Otherwise return NULL. */
694 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
695 {
696 	spinlock_t *ptl = NULL;
697 
698 #if USE_SPLIT_PTE_PTLOCKS
699 	ptl = ptlock_ptr(page);
700 	spin_lock_nest_lock(ptl, &mm->page_table_lock);
701 #endif
702 
703 	return ptl;
704 }
705 
706 static void xen_pte_unlock(void *v)
707 {
708 	spinlock_t *ptl = v;
709 	spin_unlock(ptl);
710 }
711 
712 static void xen_do_pin(unsigned level, unsigned long pfn)
713 {
714 	struct mmuext_op op;
715 
716 	op.cmd = level;
717 	op.arg1.mfn = pfn_to_mfn(pfn);
718 
719 	xen_extend_mmuext_op(&op);
720 }
721 
722 static int xen_pin_page(struct mm_struct *mm, struct page *page,
723 			enum pt_level level)
724 {
725 	unsigned pgfl = TestSetPagePinned(page);
726 	int flush;
727 
728 	if (pgfl)
729 		flush = 0;		/* already pinned */
730 	else if (PageHighMem(page))
731 		/* kmaps need flushing if we found an unpinned
732 		   highpage */
733 		flush = 1;
734 	else {
735 		void *pt = lowmem_page_address(page);
736 		unsigned long pfn = page_to_pfn(page);
737 		struct multicall_space mcs = __xen_mc_entry(0);
738 		spinlock_t *ptl;
739 
740 		flush = 0;
741 
742 		/*
743 		 * We need to hold the pagetable lock between the time
744 		 * we make the pagetable RO and when we actually pin
745 		 * it.  If we don't, then other users may come in and
746 		 * attempt to update the pagetable by writing it,
747 		 * which will fail because the memory is RO but not
748 		 * pinned, so Xen won't do the trap'n'emulate.
749 		 *
750 		 * If we're using split pte locks, we can't hold the
751 		 * entire pagetable's worth of locks during the
752 		 * traverse, because we may wrap the preempt count (8
753 		 * bits).  The solution is to mark RO and pin each PTE
754 		 * page while holding the lock.  This means the number
755 		 * of locks we end up holding is never more than a
756 		 * batch size (~32 entries, at present).
757 		 *
758 		 * If we're not using split pte locks, we needn't pin
759 		 * the PTE pages independently, because we're
760 		 * protected by the overall pagetable lock.
761 		 */
762 		ptl = NULL;
763 		if (level == PT_PTE)
764 			ptl = xen_pte_lock(page, mm);
765 
766 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
767 					pfn_pte(pfn, PAGE_KERNEL_RO),
768 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
769 
770 		if (ptl) {
771 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
772 
773 			/* Queue a deferred unlock for when this batch
774 			   is completed. */
775 			xen_mc_callback(xen_pte_unlock, ptl);
776 		}
777 	}
778 
779 	return flush;
780 }
781 
782 /* This is called just after a mm has been created, but it has not
783    been used yet.  We need to make sure that its pagetable is all
784    read-only, and can be pinned. */
785 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
786 {
787 	trace_xen_mmu_pgd_pin(mm, pgd);
788 
789 	xen_mc_batch();
790 
791 	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
792 		/* re-enable interrupts for flushing */
793 		xen_mc_issue(0);
794 
795 		kmap_flush_unused();
796 
797 		xen_mc_batch();
798 	}
799 
800 #ifdef CONFIG_X86_64
801 	{
802 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
803 
804 		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
805 
806 		if (user_pgd) {
807 			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
808 			xen_do_pin(MMUEXT_PIN_L4_TABLE,
809 				   PFN_DOWN(__pa(user_pgd)));
810 		}
811 	}
812 #else /* CONFIG_X86_32 */
813 #ifdef CONFIG_X86_PAE
814 	/* Need to make sure unshared kernel PMD is pinnable */
815 	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
816 		     PT_PMD);
817 #endif
818 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
819 #endif /* CONFIG_X86_64 */
820 	xen_mc_issue(0);
821 }
822 
823 static void xen_pgd_pin(struct mm_struct *mm)
824 {
825 	__xen_pgd_pin(mm, mm->pgd);
826 }
827 
828 /*
829  * On save, we need to pin all pagetables to make sure they get their
830  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
831  * them (unpinned pgds are not currently in use, probably because the
832  * process is under construction or destruction).
833  *
834  * Expected to be called in stop_machine() ("equivalent to taking
835  * every spinlock in the system"), so the locking doesn't really
836  * matter all that much.
837  */
838 void xen_mm_pin_all(void)
839 {
840 	struct page *page;
841 
842 	spin_lock(&pgd_lock);
843 
844 	list_for_each_entry(page, &pgd_list, lru) {
845 		if (!PagePinned(page)) {
846 			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
847 			SetPageSavePinned(page);
848 		}
849 	}
850 
851 	spin_unlock(&pgd_lock);
852 }
853 
854 /*
855  * The init_mm pagetable is really pinned as soon as its created, but
856  * that's before we have page structures to store the bits.  So do all
857  * the book-keeping now.
858  */
859 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
860 				  enum pt_level level)
861 {
862 	SetPagePinned(page);
863 	return 0;
864 }
865 
866 static void __init xen_mark_init_mm_pinned(void)
867 {
868 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
869 }
870 
871 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
872 			  enum pt_level level)
873 {
874 	unsigned pgfl = TestClearPagePinned(page);
875 
876 	if (pgfl && !PageHighMem(page)) {
877 		void *pt = lowmem_page_address(page);
878 		unsigned long pfn = page_to_pfn(page);
879 		spinlock_t *ptl = NULL;
880 		struct multicall_space mcs;
881 
882 		/*
883 		 * Do the converse to pin_page.  If we're using split
884 		 * pte locks, we must be holding the lock for while
885 		 * the pte page is unpinned but still RO to prevent
886 		 * concurrent updates from seeing it in this
887 		 * partially-pinned state.
888 		 */
889 		if (level == PT_PTE) {
890 			ptl = xen_pte_lock(page, mm);
891 
892 			if (ptl)
893 				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
894 		}
895 
896 		mcs = __xen_mc_entry(0);
897 
898 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
899 					pfn_pte(pfn, PAGE_KERNEL),
900 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
901 
902 		if (ptl) {
903 			/* unlock when batch completed */
904 			xen_mc_callback(xen_pte_unlock, ptl);
905 		}
906 	}
907 
908 	return 0;		/* never need to flush on unpin */
909 }
910 
911 /* Release a pagetables pages back as normal RW */
912 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
913 {
914 	trace_xen_mmu_pgd_unpin(mm, pgd);
915 
916 	xen_mc_batch();
917 
918 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
919 
920 #ifdef CONFIG_X86_64
921 	{
922 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
923 
924 		if (user_pgd) {
925 			xen_do_pin(MMUEXT_UNPIN_TABLE,
926 				   PFN_DOWN(__pa(user_pgd)));
927 			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
928 		}
929 	}
930 #endif
931 
932 #ifdef CONFIG_X86_PAE
933 	/* Need to make sure unshared kernel PMD is unpinned */
934 	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
935 		       PT_PMD);
936 #endif
937 
938 	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
939 
940 	xen_mc_issue(0);
941 }
942 
943 static void xen_pgd_unpin(struct mm_struct *mm)
944 {
945 	__xen_pgd_unpin(mm, mm->pgd);
946 }
947 
948 /*
949  * On resume, undo any pinning done at save, so that the rest of the
950  * kernel doesn't see any unexpected pinned pagetables.
951  */
952 void xen_mm_unpin_all(void)
953 {
954 	struct page *page;
955 
956 	spin_lock(&pgd_lock);
957 
958 	list_for_each_entry(page, &pgd_list, lru) {
959 		if (PageSavePinned(page)) {
960 			BUG_ON(!PagePinned(page));
961 			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
962 			ClearPageSavePinned(page);
963 		}
964 	}
965 
966 	spin_unlock(&pgd_lock);
967 }
968 
969 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
970 {
971 	spin_lock(&next->page_table_lock);
972 	xen_pgd_pin(next);
973 	spin_unlock(&next->page_table_lock);
974 }
975 
976 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
977 {
978 	spin_lock(&mm->page_table_lock);
979 	xen_pgd_pin(mm);
980 	spin_unlock(&mm->page_table_lock);
981 }
982 
983 
984 #ifdef CONFIG_SMP
985 /* Another cpu may still have their %cr3 pointing at the pagetable, so
986    we need to repoint it somewhere else before we can unpin it. */
987 static void drop_other_mm_ref(void *info)
988 {
989 	struct mm_struct *mm = info;
990 	struct mm_struct *active_mm;
991 
992 	active_mm = this_cpu_read(cpu_tlbstate.active_mm);
993 
994 	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
995 		leave_mm(smp_processor_id());
996 
997 	/* If this cpu still has a stale cr3 reference, then make sure
998 	   it has been flushed. */
999 	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1000 		load_cr3(swapper_pg_dir);
1001 }
1002 
1003 static void xen_drop_mm_ref(struct mm_struct *mm)
1004 {
1005 	cpumask_var_t mask;
1006 	unsigned cpu;
1007 
1008 	if (current->active_mm == mm) {
1009 		if (current->mm == mm)
1010 			load_cr3(swapper_pg_dir);
1011 		else
1012 			leave_mm(smp_processor_id());
1013 	}
1014 
1015 	/* Get the "official" set of cpus referring to our pagetable. */
1016 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1017 		for_each_online_cpu(cpu) {
1018 			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1019 			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1020 				continue;
1021 			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1022 		}
1023 		return;
1024 	}
1025 	cpumask_copy(mask, mm_cpumask(mm));
1026 
1027 	/* It's possible that a vcpu may have a stale reference to our
1028 	   cr3, because its in lazy mode, and it hasn't yet flushed
1029 	   its set of pending hypercalls yet.  In this case, we can
1030 	   look at its actual current cr3 value, and force it to flush
1031 	   if needed. */
1032 	for_each_online_cpu(cpu) {
1033 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1034 			cpumask_set_cpu(cpu, mask);
1035 	}
1036 
1037 	if (!cpumask_empty(mask))
1038 		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1039 	free_cpumask_var(mask);
1040 }
1041 #else
1042 static void xen_drop_mm_ref(struct mm_struct *mm)
1043 {
1044 	if (current->active_mm == mm)
1045 		load_cr3(swapper_pg_dir);
1046 }
1047 #endif
1048 
1049 /*
1050  * While a process runs, Xen pins its pagetables, which means that the
1051  * hypervisor forces it to be read-only, and it controls all updates
1052  * to it.  This means that all pagetable updates have to go via the
1053  * hypervisor, which is moderately expensive.
1054  *
1055  * Since we're pulling the pagetable down, we switch to use init_mm,
1056  * unpin old process pagetable and mark it all read-write, which
1057  * allows further operations on it to be simple memory accesses.
1058  *
1059  * The only subtle point is that another CPU may be still using the
1060  * pagetable because of lazy tlb flushing.  This means we need need to
1061  * switch all CPUs off this pagetable before we can unpin it.
1062  */
1063 static void xen_exit_mmap(struct mm_struct *mm)
1064 {
1065 	get_cpu();		/* make sure we don't move around */
1066 	xen_drop_mm_ref(mm);
1067 	put_cpu();
1068 
1069 	spin_lock(&mm->page_table_lock);
1070 
1071 	/* pgd may not be pinned in the error exit path of execve */
1072 	if (xen_page_pinned(mm->pgd))
1073 		xen_pgd_unpin(mm);
1074 
1075 	spin_unlock(&mm->page_table_lock);
1076 }
1077 
1078 static void xen_post_allocator_init(void);
1079 
1080 static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1081 {
1082 	struct mmuext_op op;
1083 
1084 	op.cmd = cmd;
1085 	op.arg1.mfn = pfn_to_mfn(pfn);
1086 	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1087 		BUG();
1088 }
1089 
1090 #ifdef CONFIG_X86_64
1091 static void __init xen_cleanhighmap(unsigned long vaddr,
1092 				    unsigned long vaddr_end)
1093 {
1094 	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1095 	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1096 
1097 	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
1098 	 * We include the PMD passed in on _both_ boundaries. */
1099 	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
1100 			pmd++, vaddr += PMD_SIZE) {
1101 		if (pmd_none(*pmd))
1102 			continue;
1103 		if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1104 			set_pmd(pmd, __pmd(0));
1105 	}
1106 	/* In case we did something silly, we should crash in this function
1107 	 * instead of somewhere later and be confusing. */
1108 	xen_mc_flush();
1109 }
1110 
1111 /*
1112  * Make a page range writeable and free it.
1113  */
1114 static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1115 {
1116 	void *vaddr = __va(paddr);
1117 	void *vaddr_end = vaddr + size;
1118 
1119 	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1120 		make_lowmem_page_readwrite(vaddr);
1121 
1122 	memblock_free(paddr, size);
1123 }
1124 
1125 static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1126 {
1127 	unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1128 
1129 	if (unpin)
1130 		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
1131 	ClearPagePinned(virt_to_page(__va(pa)));
1132 	xen_free_ro_pages(pa, PAGE_SIZE);
1133 }
1134 
1135 static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
1136 {
1137 	unsigned long pa;
1138 	pte_t *pte_tbl;
1139 	int i;
1140 
1141 	if (pmd_large(*pmd)) {
1142 		pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1143 		xen_free_ro_pages(pa, PMD_SIZE);
1144 		return;
1145 	}
1146 
1147 	pte_tbl = pte_offset_kernel(pmd, 0);
1148 	for (i = 0; i < PTRS_PER_PTE; i++) {
1149 		if (pte_none(pte_tbl[i]))
1150 			continue;
1151 		pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
1152 		xen_free_ro_pages(pa, PAGE_SIZE);
1153 	}
1154 	set_pmd(pmd, __pmd(0));
1155 	xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
1156 }
1157 
1158 static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
1159 {
1160 	unsigned long pa;
1161 	pmd_t *pmd_tbl;
1162 	int i;
1163 
1164 	if (pud_large(*pud)) {
1165 		pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1166 		xen_free_ro_pages(pa, PUD_SIZE);
1167 		return;
1168 	}
1169 
1170 	pmd_tbl = pmd_offset(pud, 0);
1171 	for (i = 0; i < PTRS_PER_PMD; i++) {
1172 		if (pmd_none(pmd_tbl[i]))
1173 			continue;
1174 		xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
1175 	}
1176 	set_pud(pud, __pud(0));
1177 	xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
1178 }
1179 
1180 static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
1181 {
1182 	unsigned long pa;
1183 	pud_t *pud_tbl;
1184 	int i;
1185 
1186 	if (p4d_large(*p4d)) {
1187 		pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
1188 		xen_free_ro_pages(pa, P4D_SIZE);
1189 		return;
1190 	}
1191 
1192 	pud_tbl = pud_offset(p4d, 0);
1193 	for (i = 0; i < PTRS_PER_PUD; i++) {
1194 		if (pud_none(pud_tbl[i]))
1195 			continue;
1196 		xen_cleanmfnmap_pud(pud_tbl + i, unpin);
1197 	}
1198 	set_p4d(p4d, __p4d(0));
1199 	xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
1200 }
1201 
1202 /*
1203  * Since it is well isolated we can (and since it is perhaps large we should)
1204  * also free the page tables mapping the initial P->M table.
1205  */
1206 static void __init xen_cleanmfnmap(unsigned long vaddr)
1207 {
1208 	pgd_t *pgd;
1209 	p4d_t *p4d;
1210 	unsigned int i;
1211 	bool unpin;
1212 
1213 	unpin = (vaddr == 2 * PGDIR_SIZE);
1214 	vaddr &= PMD_MASK;
1215 	pgd = pgd_offset_k(vaddr);
1216 	p4d = p4d_offset(pgd, 0);
1217 	for (i = 0; i < PTRS_PER_P4D; i++) {
1218 		if (p4d_none(p4d[i]))
1219 			continue;
1220 		xen_cleanmfnmap_p4d(p4d + i, unpin);
1221 	}
1222 	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
1223 		set_pgd(pgd, __pgd(0));
1224 		xen_cleanmfnmap_free_pgtbl(p4d, unpin);
1225 	}
1226 }
1227 
1228 static void __init xen_pagetable_p2m_free(void)
1229 {
1230 	unsigned long size;
1231 	unsigned long addr;
1232 
1233 	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1234 
1235 	/* No memory or already called. */
1236 	if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1237 		return;
1238 
1239 	/* using __ka address and sticking INVALID_P2M_ENTRY! */
1240 	memset((void *)xen_start_info->mfn_list, 0xff, size);
1241 
1242 	addr = xen_start_info->mfn_list;
1243 	/*
1244 	 * We could be in __ka space.
1245 	 * We roundup to the PMD, which means that if anybody at this stage is
1246 	 * using the __ka address of xen_start_info or
1247 	 * xen_start_info->shared_info they are in going to crash. Fortunatly
1248 	 * we have already revectored in xen_setup_kernel_pagetable and in
1249 	 * xen_setup_shared_info.
1250 	 */
1251 	size = roundup(size, PMD_SIZE);
1252 
1253 	if (addr >= __START_KERNEL_map) {
1254 		xen_cleanhighmap(addr, addr + size);
1255 		size = PAGE_ALIGN(xen_start_info->nr_pages *
1256 				  sizeof(unsigned long));
1257 		memblock_free(__pa(addr), size);
1258 	} else {
1259 		xen_cleanmfnmap(addr);
1260 	}
1261 }
1262 
1263 static void __init xen_pagetable_cleanhighmap(void)
1264 {
1265 	unsigned long size;
1266 	unsigned long addr;
1267 
1268 	/* At this stage, cleanup_highmap has already cleaned __ka space
1269 	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1270 	 * the ramdisk). We continue on, erasing PMD entries that point to page
1271 	 * tables - do note that they are accessible at this stage via __va.
1272 	 * For good measure we also round up to the PMD - which means that if
1273 	 * anybody is using __ka address to the initial boot-stack - and try
1274 	 * to use it - they are going to crash. The xen_start_info has been
1275 	 * taken care of already in xen_setup_kernel_pagetable. */
1276 	addr = xen_start_info->pt_base;
1277 	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1278 
1279 	xen_cleanhighmap(addr, addr + size);
1280 	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1281 #ifdef DEBUG
1282 	/* This is superfluous and is not necessary, but you know what
1283 	 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
1284 	 * anything at this stage. */
1285 	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1286 #endif
1287 }
1288 #endif
1289 
1290 static void __init xen_pagetable_p2m_setup(void)
1291 {
1292 	if (xen_feature(XENFEAT_auto_translated_physmap))
1293 		return;
1294 
1295 	xen_vmalloc_p2m_tree();
1296 
1297 #ifdef CONFIG_X86_64
1298 	xen_pagetable_p2m_free();
1299 
1300 	xen_pagetable_cleanhighmap();
1301 #endif
1302 	/* And revector! Bye bye old array */
1303 	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1304 }
1305 
1306 static void __init xen_pagetable_init(void)
1307 {
1308 	paging_init();
1309 	xen_post_allocator_init();
1310 
1311 	xen_pagetable_p2m_setup();
1312 
1313 	/* Allocate and initialize top and mid mfn levels for p2m structure */
1314 	xen_build_mfn_list_list();
1315 
1316 	/* Remap memory freed due to conflicts with E820 map */
1317 	if (!xen_feature(XENFEAT_auto_translated_physmap))
1318 		xen_remap_memory();
1319 
1320 	xen_setup_shared_info();
1321 }
1322 static void xen_write_cr2(unsigned long cr2)
1323 {
1324 	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1325 }
1326 
1327 static unsigned long xen_read_cr2(void)
1328 {
1329 	return this_cpu_read(xen_vcpu)->arch.cr2;
1330 }
1331 
1332 unsigned long xen_read_cr2_direct(void)
1333 {
1334 	return this_cpu_read(xen_vcpu_info.arch.cr2);
1335 }
1336 
1337 static void xen_flush_tlb(void)
1338 {
1339 	struct mmuext_op *op;
1340 	struct multicall_space mcs;
1341 
1342 	trace_xen_mmu_flush_tlb(0);
1343 
1344 	preempt_disable();
1345 
1346 	mcs = xen_mc_entry(sizeof(*op));
1347 
1348 	op = mcs.args;
1349 	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1350 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1351 
1352 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1353 
1354 	preempt_enable();
1355 }
1356 
1357 static void xen_flush_tlb_single(unsigned long addr)
1358 {
1359 	struct mmuext_op *op;
1360 	struct multicall_space mcs;
1361 
1362 	trace_xen_mmu_flush_tlb_single(addr);
1363 
1364 	preempt_disable();
1365 
1366 	mcs = xen_mc_entry(sizeof(*op));
1367 	op = mcs.args;
1368 	op->cmd = MMUEXT_INVLPG_LOCAL;
1369 	op->arg1.linear_addr = addr & PAGE_MASK;
1370 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1371 
1372 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1373 
1374 	preempt_enable();
1375 }
1376 
/*
 * Ask the hypervisor to flush TLB entries on the online CPUs in @cpus,
 * excluding the calling CPU.
 *
 * A range of at most one page (and @end != TLB_FLUSH_ALL) is sent as
 * MMUEXT_INVLPG_MULTI for @start; anything else becomes a full
 * MMUEXT_TLB_FLUSH_MULTI.  @mm is only passed to the tracepoint.
 */
static void xen_flush_tlb_others(const struct cpumask *cpus,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
{
	/*
	 * The operation and the vcpu bitmap it points at are laid out
	 * together so both fit in one multicall argument area.
	 */
	struct {
		struct mmuext_op op;
#ifdef CONFIG_SMP
		DECLARE_BITMAP(mask, num_processors);
#else
		DECLARE_BITMAP(mask, NR_CPUS);
#endif
	} *args;
	struct multicall_space mcs;

	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);

	if (cpumask_empty(cpus))
		return;		/* nothing to do */

	mcs = xen_mc_entry(sizeof(*args));
	args = mcs.args;
	/* Point the op at the bitmap stored right behind it. */
	args->op.arg2.vcpumask = to_cpumask(args->mask);

	/* Remove us, and any offline CPUS. */
	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));

	/* Default to a full flush; narrow to a single page when possible. */
	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
		args->op.cmd = MMUEXT_INVLPG_MULTI;
		args->op.arg1.linear_addr = start;
	}

	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
1414 
1415 static unsigned long xen_read_cr3(void)
1416 {
1417 	return this_cpu_read(xen_cr3);
1418 }
1419 
1420 static void set_current_cr3(void *v)
1421 {
1422 	this_cpu_write(xen_current_cr3, (unsigned long)v);
1423 }
1424 
1425 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1426 {
1427 	struct mmuext_op op;
1428 	unsigned long mfn;
1429 
1430 	trace_xen_mmu_write_cr3(kernel, cr3);
1431 
1432 	if (cr3)
1433 		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1434 	else
1435 		mfn = 0;
1436 
1437 	WARN_ON(mfn == 0 && kernel);
1438 
1439 	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1440 	op.arg1.mfn = mfn;
1441 
1442 	xen_extend_mmuext_op(&op);
1443 
1444 	if (kernel) {
1445 		this_cpu_write(xen_cr3, cr3);
1446 
1447 		/* Update xen_current_cr3 once the batch has actually
1448 		   been submitted. */
1449 		xen_mc_callback(set_current_cr3, (void *)cr3);
1450 	}
1451 }
1452 static void xen_write_cr3(unsigned long cr3)
1453 {
1454 	BUG_ON(preemptible());
1455 
1456 	xen_mc_batch();  /* disables interrupts */
1457 
1458 	/* Update while interrupts are disabled, so its atomic with
1459 	   respect to ipis */
1460 	this_cpu_write(xen_cr3, cr3);
1461 
1462 	__xen_write_cr3(true, cr3);
1463 
1464 #ifdef CONFIG_X86_64
1465 	{
1466 		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1467 		if (user_pgd)
1468 			__xen_write_cr3(false, __pa(user_pgd));
1469 		else
1470 			__xen_write_cr3(false, 0);
1471 	}
1472 #endif
1473 
1474 	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1475 }
1476 
1477 #ifdef CONFIG_X86_64
1478 /*
1479  * At the start of the day - when Xen launches a guest, it has already
1480  * built pagetables for the guest. We diligently look over them
1481  * in xen_setup_kernel_pagetable and graft as appropriate them in the
1482  * init_level4_pgt and its friends. Then when we are happy we load
1483  * the new init_level4_pgt - and continue on.
1484  *
1485  * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1486  * up the rest of the pagetables. When it has completed it loads the cr3.
1487  * N.B. that baremetal would start at 'start_kernel' (and the early
1488  * #PF handler would create bootstrap pagetables) - so we are running
1489  * with the same assumptions as what to do when write_cr3 is executed
1490  * at this point.
1491  *
1492  * Since there are no user-page tables at all, we have two variants
1493  * of xen_write_cr3 - the early bootup (this one), and the late one
1494  * (xen_write_cr3). The reason we have to do that is that in 64-bit
1495  * the Linux kernel and user-space are both in ring 3 while the
1496  * hypervisor is in ring 0.
1497  */
1498 static void __init xen_write_cr3_init(unsigned long cr3)
1499 {
1500 	BUG_ON(preemptible());
1501 
1502 	xen_mc_batch();  /* disables interrupts */
1503 
1504 	/* Update while interrupts are disabled, so its atomic with
1505 	   respect to ipis */
1506 	this_cpu_write(xen_cr3, cr3);
1507 
1508 	__xen_write_cr3(true, cr3);
1509 
1510 	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1511 }
1512 #endif
1513 
1514 static int xen_pgd_alloc(struct mm_struct *mm)
1515 {
1516 	pgd_t *pgd = mm->pgd;
1517 	int ret = 0;
1518 
1519 	BUG_ON(PagePinned(virt_to_page(pgd)));
1520 
1521 #ifdef CONFIG_X86_64
1522 	{
1523 		struct page *page = virt_to_page(pgd);
1524 		pgd_t *user_pgd;
1525 
1526 		BUG_ON(page->private != 0);
1527 
1528 		ret = -ENOMEM;
1529 
1530 		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1531 		page->private = (unsigned long)user_pgd;
1532 
1533 		if (user_pgd != NULL) {
1534 #ifdef CONFIG_X86_VSYSCALL_EMULATION
1535 			user_pgd[pgd_index(VSYSCALL_ADDR)] =
1536 				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1537 #endif
1538 			ret = 0;
1539 		}
1540 
1541 		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1542 	}
1543 #endif
1544 	return ret;
1545 }
1546 
1547 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1548 {
1549 #ifdef CONFIG_X86_64
1550 	pgd_t *user_pgd = xen_get_user_pgd(pgd);
1551 
1552 	if (user_pgd)
1553 		free_page((unsigned long)user_pgd);
1554 #endif
1555 }
1556 
1557 /*
1558  * Init-time set_pte while constructing initial pagetables, which
1559  * doesn't allow RO page table pages to be remapped RW.
1560  *
1561  * If there is no MFN for this PFN then this page is initially
1562  * ballooned out so clear the PTE (as in decrease_reservation() in
1563  * drivers/xen/balloon.c).
1564  *
1565  * Many of these PTE updates are done on unpinned and writable pages
1566  * and doing a hypercall for these is unnecessary and expensive.  At
1567  * this point it is not possible to tell if a page is pinned or not,
1568  * so always write the PTE directly and rely on Xen trapping and
1569  * emulating any updates as necessary.
1570  */
1571 __visible pte_t xen_make_pte_init(pteval_t pte)
1572 {
1573 #ifdef CONFIG_X86_64
1574 	unsigned long pfn;
1575 
1576 	/*
1577 	 * Pages belonging to the initial p2m list mapped outside the default
1578 	 * address range must be mapped read-only. This region contains the
1579 	 * page tables for mapping the p2m list, too, and page tables MUST be
1580 	 * mapped read-only.
1581 	 */
1582 	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
1583 	if (xen_start_info->mfn_list < __START_KERNEL_map &&
1584 	    pfn >= xen_start_info->first_p2m_pfn &&
1585 	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1586 		pte &= ~_PAGE_RW;
1587 #endif
1588 	pte = pte_pfn_to_mfn(pte);
1589 	return native_make_pte(pte);
1590 }
1591 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
1592 
1593 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1594 {
1595 #ifdef CONFIG_X86_32
1596 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
1597 	if (pte_mfn(pte) != INVALID_P2M_ENTRY
1598 	    && pte_val_ma(*ptep) & _PAGE_PRESENT)
1599 		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1600 			       pte_val_ma(pte));
1601 #endif
1602 	native_set_pte(ptep, pte);
1603 }
1604 
1605 /* Early in boot, while setting up the initial pagetable, assume
1606    everything is pinned. */
1607 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1608 {
1609 #ifdef CONFIG_FLATMEM
1610 	BUG_ON(mem_map);	/* should only be used early */
1611 #endif
1612 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1613 	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1614 }
1615 
1616 /* Used for pmd and pud */
1617 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1618 {
1619 #ifdef CONFIG_FLATMEM
1620 	BUG_ON(mem_map);	/* should only be used early */
1621 #endif
1622 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1623 }
1624 
1625 /* Early release_pte assumes that all pts are pinned, since there's
1626    only init_mm and anything attached to that is pinned. */
1627 static void __init xen_release_pte_init(unsigned long pfn)
1628 {
1629 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1630 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1631 }
1632 
1633 static void __init xen_release_pmd_init(unsigned long pfn)
1634 {
1635 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1636 }
1637 
1638 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1639 {
1640 	struct multicall_space mcs;
1641 	struct mmuext_op *op;
1642 
1643 	mcs = __xen_mc_entry(sizeof(*op));
1644 	op = mcs.args;
1645 	op->cmd = cmd;
1646 	op->arg1.mfn = pfn_to_mfn(pfn);
1647 
1648 	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1649 }
1650 
1651 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1652 {
1653 	struct multicall_space mcs;
1654 	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1655 
1656 	mcs = __xen_mc_entry(0);
1657 	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1658 				pfn_pte(pfn, prot), 0);
1659 }
1660 
1661 /* This needs to make sure the new pte page is pinned iff its being
1662    attached to a pinned pagetable. */
1663 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1664 				    unsigned level)
1665 {
1666 	bool pinned = PagePinned(virt_to_page(mm->pgd));
1667 
1668 	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1669 
1670 	if (pinned) {
1671 		struct page *page = pfn_to_page(pfn);
1672 
1673 		SetPagePinned(page);
1674 
1675 		if (!PageHighMem(page)) {
1676 			xen_mc_batch();
1677 
1678 			__set_pfn_prot(pfn, PAGE_KERNEL_RO);
1679 
1680 			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1681 				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1682 
1683 			xen_mc_issue(PARAVIRT_LAZY_MMU);
1684 		} else {
1685 			/* make sure there are no stray mappings of
1686 			   this page */
1687 			kmap_flush_unused();
1688 		}
1689 	}
1690 }
1691 
1692 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1693 {
1694 	xen_alloc_ptpage(mm, pfn, PT_PTE);
1695 }
1696 
1697 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1698 {
1699 	xen_alloc_ptpage(mm, pfn, PT_PMD);
1700 }
1701 
1702 /* This should never happen until we're OK to use struct page */
1703 static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1704 {
1705 	struct page *page = pfn_to_page(pfn);
1706 	bool pinned = PagePinned(page);
1707 
1708 	trace_xen_mmu_release_ptpage(pfn, level, pinned);
1709 
1710 	if (pinned) {
1711 		if (!PageHighMem(page)) {
1712 			xen_mc_batch();
1713 
1714 			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1715 				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1716 
1717 			__set_pfn_prot(pfn, PAGE_KERNEL);
1718 
1719 			xen_mc_issue(PARAVIRT_LAZY_MMU);
1720 		}
1721 		ClearPagePinned(page);
1722 	}
1723 }
1724 
1725 static void xen_release_pte(unsigned long pfn)
1726 {
1727 	xen_release_ptpage(pfn, PT_PTE);
1728 }
1729 
1730 static void xen_release_pmd(unsigned long pfn)
1731 {
1732 	xen_release_ptpage(pfn, PT_PMD);
1733 }
1734 
1735 #if CONFIG_PGTABLE_LEVELS >= 4
1736 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1737 {
1738 	xen_alloc_ptpage(mm, pfn, PT_PUD);
1739 }
1740 
1741 static void xen_release_pud(unsigned long pfn)
1742 {
1743 	xen_release_ptpage(pfn, PT_PUD);
1744 }
1745 #endif
1746 
1747 void __init xen_reserve_top(void)
1748 {
1749 #ifdef CONFIG_X86_32
1750 	unsigned long top = HYPERVISOR_VIRT_START;
1751 	struct xen_platform_parameters pp;
1752 
1753 	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1754 		top = pp.virt_start;
1755 
1756 	reserve_top_address(-top);
1757 #endif	/* CONFIG_X86_32 */
1758 }
1759 
1760 /*
1761  * Like __va(), but returns address in the kernel mapping (which is
1762  * all we have until the physical memory mapping has been set up.
1763  */
1764 static void * __init __ka(phys_addr_t paddr)
1765 {
1766 #ifdef CONFIG_X86_64
1767 	return (void *)(paddr + __START_KERNEL_map);
1768 #else
1769 	return __va(paddr);
1770 #endif
1771 }
1772 
1773 /* Convert a machine address to physical address */
1774 static unsigned long __init m2p(phys_addr_t maddr)
1775 {
1776 	phys_addr_t paddr;
1777 
1778 	maddr &= PTE_PFN_MASK;
1779 	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1780 
1781 	return paddr;
1782 }
1783 
1784 /* Convert a machine address to kernel virtual */
1785 static void * __init m2v(phys_addr_t maddr)
1786 {
1787 	return __ka(m2p(maddr));
1788 }
1789 
1790 /* Set the page permissions on an identity-mapped pages */
1791 static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1792 				       unsigned long flags)
1793 {
1794 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1795 	pte_t pte = pfn_pte(pfn, prot);
1796 
1797 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1798 		BUG();
1799 }
1800 static void __init set_page_prot(void *addr, pgprot_t prot)
1801 {
1802 	return set_page_prot_flags(addr, prot, UVMF_NONE);
1803 }
1804 #ifdef CONFIG_X86_32
/*
 * Fill @pmd with identity mappings for pfns [0, max_pfn).
 *
 * PTE pages already referenced by @pmd are reused; missing ones are
 * carved out of level1_ident_pgt, which is allocated here from the brk
 * area.  Existing PTEs are left alone, empty ones are mapped
 * PAGE_KERNEL_EXEC.  max_pfn_mapped is raised to the highest pfn
 * visited.  Finally the PTE pages taken from level1_ident_pgt and
 * @pmd itself are remapped read-only.
 */
static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned pmdidx, pteidx;
	unsigned ident_pte;	/* entries of level1_ident_pgt handed out so far */
	unsigned long pfn;

	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
				      PAGE_SIZE);

	ident_pte = 0;
	pfn = 0;
	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
		pte_t *pte_page;

		/* Reuse or allocate a page of ptes */
		if (pmd_present(pmd[pmdidx]))
			pte_page = m2v(pmd[pmdidx].pmd);
		else {
			/* Check for free pte pages */
			if (ident_pte == LEVEL1_IDENT_ENTRIES)
				break;

			pte_page = &level1_ident_pgt[ident_pte];
			ident_pte += PTRS_PER_PTE;

			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
		}

		/* Install mappings */
		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
			pte_t pte;

			/* Track the highest pfn seen, even if already mapped. */
			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;

			if (!pte_none(pte_page[pteidx]))
				continue;

			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
			pte_page[pteidx] = pte;
		}
	}

	/* Every PTE page handed out above becomes read-only... */
	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

	/* ...and so does the pmd page itself. */
	set_page_prot(pmd, PAGE_KERNEL_RO);
}
1853 #endif
1854 void __init xen_setup_machphys_mapping(void)
1855 {
1856 	struct xen_machphys_mapping mapping;
1857 
1858 	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1859 		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1860 		machine_to_phys_nr = mapping.max_mfn + 1;
1861 	} else {
1862 		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1863 	}
1864 #ifdef CONFIG_X86_32
1865 	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1866 		< machine_to_phys_mapping);
1867 #endif
1868 }
1869 
1870 #ifdef CONFIG_X86_64
1871 static void __init convert_pfn_mfn(void *v)
1872 {
1873 	pte_t *pte = v;
1874 	int i;
1875 
1876 	/* All levels are converted the same way, so just treat them
1877 	   as ptes. */
1878 	for (i = 0; i < PTRS_PER_PTE; i++)
1879 		pte[i] = xen_make_pte(pte[i].pte);
1880 }
1881 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1882 				 unsigned long addr)
1883 {
1884 	if (*pt_base == PFN_DOWN(__pa(addr))) {
1885 		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1886 		clear_page((void *)addr);
1887 		(*pt_base)++;
1888 	}
1889 	if (*pt_end == PFN_DOWN(__pa(addr))) {
1890 		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1891 		clear_page((void *)addr);
1892 		(*pt_end)--;
1893 	}
1894 }
/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
 * kernel has a physical mapping to start with - but that's enough to
 * get __va working.  We need to fill in the rest of the physical
 * mapping once some sort of allocator has been set up.
 *
 * @pgd is the Xen-provided top-level pagetable.  @max_pfn is not used
 * in this function.
 */
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
	pud_t *l3;
	pmd_t *l2;
	unsigned long addr[3];	/* Xen's pgd, L3 and L2 pages, see below */
	unsigned long pt_base, pt_end;
	unsigned i;

	/* max_pfn_mapped is the last pfn mapped in the initial memory
	 * mappings. Considering that on Xen after the kernel mappings we
	 * have the mappings of some pages that don't exist in pfn space, we
	 * set max_pfn_mapped to the last real pfn mapped. */
	if (xen_start_info->mfn_list < __START_KERNEL_map)
		max_pfn_mapped = xen_start_info->first_p2m_pfn;
	else
		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));

	/* Frame range [pt_base, pt_end) holding the Xen-built pagetables. */
	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
	pt_end = pt_base + xen_start_info->nr_pt_frames;

	/* Zap identity mapping */
	init_level4_pgt[0] = __pgd(0);

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		/* Pre-constructed entries are in pfn, so convert to mfn */
		/* L4[272] -> level3_ident_pgt
		 * L4[511] -> level3_kernel_pgt */
		convert_pfn_mfn(init_level4_pgt);

		/* L3_i[0] -> level2_ident_pgt */
		convert_pfn_mfn(level3_ident_pgt);
		/* L3_k[510] -> level2_kernel_pgt
		 * L3_k[511] -> level2_fixmap_pgt */
		convert_pfn_mfn(level3_kernel_pgt);

		/* L3_k[511][506] -> level1_fixmap_pgt */
		convert_pfn_mfn(level2_fixmap_pgt);
	}
	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

	/* Remember the three Xen pages we may be able to reclaim later. */
	addr[0] = (unsigned long)pgd;
	addr[1] = (unsigned long)l3;
	addr[2] = (unsigned long)l2;
	/* Graft it onto L4[272][0]. Note that we are creating an aliasing
	 * problem: both L4[272][0] and L4[511][510] have entries that point
	 * to the same L2 (PMD) tables. Meaning that if you modify it in __va
	 * space it will be also modified in the __ka space! (But if you just
	 * modify the PMD table to point to other PTE's or none, then you
	 * are OK - which is what cleanup_highmap does) */
	copy_page(level2_ident_pgt, l2);
	/* Graft it onto L4[511][510] */
	copy_page(level2_kernel_pgt, l2);

	/* Copy the initial P->M table mappings if necessary. */
	i = pgd_index(xen_start_info->mfn_list);
	if (i && i < pgd_index(__START_KERNEL_map))
		init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		/* Make pagetable pieces RO */
		set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
		set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
		set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
		set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
		set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
		set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
		set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
		set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);

		/* Pin down new L4 */
		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
				  PFN_DOWN(__pa_symbol(init_level4_pgt)));

		/* Unpin Xen-provided one */
		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

		/*
		 * At this stage there can be no user pgd, and no page
		 * structure to attach it to, so make sure we just set kernel
		 * pgd.
		 */
		xen_mc_batch();
		__xen_write_cr3(true, __pa(init_level4_pgt));
		xen_mc_issue(PARAVIRT_LAZY_CPU);
	} else
		native_write_cr3(__pa(init_level4_pgt));

	/* We can't easily rip out L3 and L2, as the Xen pagetables are
	 * laid out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for
	 * the initial domain. For guests using the toolstack, they are in:
	 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
	 * rip out the [L4] (pgd), but for guests we shave off three pages.
	 */
	for (i = 0; i < ARRAY_SIZE(addr); i++)
		check_pt_base(&pt_base, &pt_end, addr[i]);

	/* Our (by up to three pages) smaller Xen pagetable that we are using */
	xen_pt_base = PFN_PHYS(pt_base);
	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
	memblock_reserve(xen_pt_base, xen_pt_size);

	/* Revector the xen_start_info */
	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
}
2011 
2012 /*
2013  * Read a value from a physical address.
2014  */
2015 static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
2016 {
2017 	unsigned long *vaddr;
2018 	unsigned long val;
2019 
2020 	vaddr = early_memremap_ro(addr, sizeof(val));
2021 	val = *vaddr;
2022 	early_memunmap(vaddr, sizeof(val));
2023 	return val;
2024 }
2025 
2026 /*
2027  * Translate a virtual address to a physical one without relying on mapped
2028  * page tables. Don't rely on big pages being aligned in (guest) physical
2029  * space!
2030  */
2031 static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2032 {
2033 	phys_addr_t pa;
2034 	pgd_t pgd;
2035 	pud_t pud;
2036 	pmd_t pmd;
2037 	pte_t pte;
2038 
2039 	pa = read_cr3();
2040 	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2041 						       sizeof(pgd)));
2042 	if (!pgd_present(pgd))
2043 		return 0;
2044 
2045 	pa = pgd_val(pgd) & PTE_PFN_MASK;
2046 	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
2047 						       sizeof(pud)));
2048 	if (!pud_present(pud))
2049 		return 0;
2050 	pa = pud_val(pud) & PTE_PFN_MASK;
2051 	if (pud_large(pud))
2052 		return pa + (vaddr & ~PUD_MASK);
2053 
2054 	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
2055 						       sizeof(pmd)));
2056 	if (!pmd_present(pmd))
2057 		return 0;
2058 	pa = pmd_val(pmd) & PTE_PFN_MASK;
2059 	if (pmd_large(pmd))
2060 		return pa + (vaddr & ~PMD_MASK);
2061 
2062 	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
2063 						       sizeof(pte)));
2064 	if (!pte_present(pte))
2065 		return 0;
2066 	pa = pte_pfn(pte) << PAGE_SHIFT;
2067 
2068 	return pa | (vaddr & ~PAGE_MASK);
2069 }
2070 
/*
 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
 * this area.
 *
 * One physically contiguous region is requested from xen_find_free_area() and
 * carved up, in this order, into the p4d pages (only when PTRS_PER_P4D > 1),
 * the pud pages, the pmd pages, the pte pages and finally the pages holding
 * the p2m list itself.  The new page tables are hooked into the pgd read
 * from cr3 starting at slot 2, the old list is copied over, and xen_p2m_addr
 * plus the p2m related fields of xen_start_info are updated to describe the
 * new location.  BUG()s if no suitable area can be found.
 */
void __init xen_relocate_p2m(void)
{
	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
	int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
	pte_t *pt;
	pmd_t *pmd;
	pud_t *pud;
	p4d_t *p4d = NULL;
	pgd_t *pgd;
	unsigned long *new_p2m;
	int save_pud;

	/*
	 * size:  bytes needed for one unsigned long per page in nr_pages,
	 *        rounded up to a whole page.
	 * n_pte: number of pte entries, i.e. pages holding the p2m list.
	 * n_pt, n_pmd, n_pud, n_p4d: number of page table pages of each
	 *        level needed to map size bytes.
	 */
	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
	n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
	if (PTRS_PER_P4D > 1)
		n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
	else
		n_p4d = 0;
	n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;

	new_area = xen_find_free_area(PFN_PHYS(n_frames));
	if (!new_area) {
		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
		BUG();
	}

	/*
	 * Setup the page tables for addressing the new p2m list.
	 * We have asked the hypervisor to map the p2m list at the user address
	 * PUD_SIZE. It may have done so, or it may have used a kernel space
	 * address depending on the Xen version.
	 * To avoid any possible virtual address collision, just use
	 * 2 * PGDIR_SIZE (pgd slot 2 onwards) for the new area.
	 *
	 * NOTE(review): the "PUD_SIZE" above may be stale as well - the
	 * cleanup further down clears pgd slot 1, i.e. address PGDIR_SIZE.
	 * Confirm against the hypervisor request before rewording.
	 */
	p4d_phys = new_area;
	pud_phys = p4d_phys + PFN_PHYS(n_p4d);
	pmd_phys = pud_phys + PFN_PHYS(n_pud);
	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
	/* The p2m data pages start right behind the last pte page. */
	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;

	pgd = __va(read_cr3());
	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
	idx_p4d = 0;
	save_pud = n_pud;
	/*
	 * Each table page is filled through a temporary early_memremap()
	 * mapping, then its __va() mapping is made read-only and the page is
	 * pinned before it is linked into its parent table.
	 * The outer loop runs exactly once when n_p4d == 0.
	 */
	do {
		if (n_p4d > 0) {
			p4d = early_memremap(p4d_phys, PAGE_SIZE);
			clear_page(p4d);
			n_pud = min(save_pud, PTRS_PER_P4D);
		}
		for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
			pud = early_memremap(pud_phys, PAGE_SIZE);
			clear_page(pud);
			for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
				 idx_pmd++) {
				pmd = early_memremap(pmd_phys, PAGE_SIZE);
				clear_page(pmd);
				for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
					 idx_pt++) {
					pt = early_memremap(pt_phys, PAGE_SIZE);
					clear_page(pt);
					for (idx_pte = 0;
						 idx_pte < min(n_pte, PTRS_PER_PTE);
						 idx_pte++) {
						set_pte(pt + idx_pte,
								pfn_pte(p2m_pfn, PAGE_KERNEL));
						p2m_pfn++;
					}
					/* Entries left for the following pte pages. */
					n_pte -= PTRS_PER_PTE;
					early_memunmap(pt, PAGE_SIZE);
					make_lowmem_page_readonly(__va(pt_phys));
					pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
							PFN_DOWN(pt_phys));
					set_pmd(pmd + idx_pt,
							__pmd(_PAGE_TABLE | pt_phys));
					pt_phys += PAGE_SIZE;
				}
				/* pte pages left for the following pmd pages. */
				n_pt -= PTRS_PER_PMD;
				early_memunmap(pmd, PAGE_SIZE);
				make_lowmem_page_readonly(__va(pmd_phys));
				pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
						PFN_DOWN(pmd_phys));
				set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
				pmd_phys += PAGE_SIZE;
			}
			/* pmd pages left for the following pud pages. */
			n_pmd -= PTRS_PER_PUD;
			early_memunmap(pud, PAGE_SIZE);
			make_lowmem_page_readonly(__va(pud_phys));
			pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
			/* Link into the p4d page if there is one, else into the pgd. */
			if (n_p4d > 0)
				set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
			else
				set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
			pud_phys += PAGE_SIZE;
		}
		if (n_p4d > 0) {
			/* pud pages left for the following p4d pages. */
			save_pud -= PTRS_PER_P4D;
			early_memunmap(p4d, PAGE_SIZE);
			make_lowmem_page_readonly(__va(p4d_phys));
			pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
			set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
			p4d_phys += PAGE_SIZE;
		}
	} while (++idx_p4d < n_p4d);

	/* Now copy the old p2m info to the new area. */
	memcpy(new_p2m, xen_p2m_addr, size);
	xen_p2m_addr = new_p2m;

	/* Release the old p2m list and set new list info. */
	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
	BUG_ON(!p2m_pfn);
	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);

	if (xen_start_info->mfn_list < __START_KERNEL_map) {
		/*
		 * Old list mapped below __START_KERNEL_map: release the frame
		 * range recorded in xen_start_info and clear pgd slot 1.
		 */
		pfn = xen_start_info->first_p2m_pfn;
		pfn_end = xen_start_info->first_p2m_pfn +
			  xen_start_info->nr_p2m_frames;
		set_pgd(pgd + 1, __pgd(0));
	} else {
		/* Otherwise only the old list data pages are released. */
		pfn = p2m_pfn;
		pfn_end = p2m_pfn_end;
	}

	memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
	/*
	 * Make the released frames writable again, skipping the range
	 * [p2m_pfn, p2m_pfn_end) that held the old p2m list data.
	 */
	while (pfn < pfn_end) {
		if (pfn == p2m_pfn) {
			pfn = p2m_pfn_end;
			continue;
		}
		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
		pfn++;
	}

	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
	xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
	xen_start_info->nr_p2m_frames = n_frames;
}
2217 
2218 #else	/* !CONFIG_X86_64 */
/*
 * Kernel PMD pages for the 32-bit boot page tables: initial_kernel_pmd is
 * installed in initial_page_table[KERNEL_PGD_BOUNDARY] and
 * swapper_kernel_pmd in swapper_pg_dir[KERNEL_PGD_BOUNDARY].  Each is
 * obtained with extend_brk() at the point of first use.
 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2221 
/*
 * Boot-time write_cr3 hook for 32-bit guests (installed through
 * xen_mmu_ops).  It is only valid for the very first switch from
 * initial_page_table to swapper_pg_dir - both conditions are enforced with
 * BUG_ON() - and replaces itself with xen_write_cr3 once that switch is done.
 *
 * @cr3: physical address of the new pgd; must be __pa(swapper_pg_dir).
 */
static void __init xen_write_cr3_init(unsigned long cr3)
{
	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));

	BUG_ON(read_cr3() != __pa(initial_page_table));
	BUG_ON(cr3 != __pa(swapper_pg_dir));

	/*
	 * We are switching to swapper_pg_dir for the first time (from
	 * initial_page_table) and therefore need to mark that page
	 * read-only and then pin it.
	 *
	 * Xen disallows sharing of kernel PMDs for PAE
	 * guests. Therefore we must copy the kernel PMD from
	 * initial_page_table into a new kernel PMD to be used in
	 * swapper_pg_dir.
	 */
	swapper_kernel_pmd =
		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
	copy_page(swapper_kernel_pmd, initial_kernel_pmd);
	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);

	/* Load the read-only swapper_pg_dir, then pin it. */
	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
	xen_write_cr3(cr3);
	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);

	/*
	 * The initial tables are no longer loaded: unpin the old pgd and
	 * make both of its pages writable again.
	 */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	set_page_prot(initial_page_table, PAGE_KERNEL);
	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);

	/* All further cr3 writes go through the regular hook. */
	pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
2257 
2258 /*
2259  * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
2260  * not the first page table in the page table pool.
2261  * Iterate through the initial page tables to find the real page table base.
2262  */
2263 static phys_addr_t xen_find_pt_base(pmd_t *pmd)
2264 {
2265 	phys_addr_t pt_base, paddr;
2266 	unsigned pmdidx;
2267 
2268 	pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
2269 
2270 	for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
2271 		if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
2272 			paddr = m2p(pmd[pmdidx].pmd);
2273 			pt_base = min(pt_base, paddr);
2274 		}
2275 
2276 	return pt_base;
2277 }
2278 
/*
 * 32-bit variant: switch from the pgd handed over by Xen to
 * initial_page_table.
 *
 * The kernel pmd referenced by pgd[KERNEL_PGD_BOUNDARY] is copied into a
 * brk-allocated page and passed to xen_map_identity_early() together with
 * max_pfn (presumably to extend the identity mapping - see that function).
 * The pgd is copied into initial_page_table with the kernel slot redirected
 * to the new pmd; the new tables are made read-only, the old pgd is
 * unpinned, the new one pinned and loaded, and the hypervisor provided page
 * table area is reserved in memblock.
 *
 * @pgd:     page directory currently in use, as provided at start of day
 * @max_pfn: forwarded unchanged to xen_map_identity_early()
 */
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
	pmd_t *kernel_pmd;

	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);

	/* Record where the hypervisor built page tables live. */
	xen_pt_base = xen_find_pt_base(kernel_pmd);
	xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;

	initial_kernel_pmd =
		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);

	/*
	 * End of the page table area plus 512 KiB.
	 * NOTE(review): the reason for the extra 512 KiB is not visible here.
	 */
	max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);

	copy_page(initial_kernel_pmd, kernel_pmd);

	xen_map_identity_early(initial_kernel_pmd, max_pfn);

	copy_page(initial_page_table, pgd);
	initial_page_table[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);

	/* Page table pages must be read-only before they can be pinned. */
	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	xen_write_cr3(__pa(initial_page_table));

	memblock_reserve(xen_pt_base, xen_pt_size);
}
2313 #endif	/* CONFIG_X86_64 */
2314 
2315 void __init xen_reserve_special_pages(void)
2316 {
2317 	phys_addr_t paddr;
2318 
2319 	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2320 	if (xen_start_info->store_mfn) {
2321 		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2322 		memblock_reserve(paddr, PAGE_SIZE);
2323 	}
2324 	if (!xen_initial_domain()) {
2325 		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2326 		memblock_reserve(paddr, PAGE_SIZE);
2327 	}
2328 }
2329 
2330 void __init xen_pt_check_e820(void)
2331 {
2332 	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2333 		xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2334 		BUG();
2335 	}
2336 }
2337 
/*
 * Placeholder page mapped into the local APIC and IO APIC fixmap slots by
 * xen_set_fixmap(); filled with 0xff in xen_init_mmu_ops().
 */
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2339 
/*
 * set_fixmap hook: build the pte for fixmap slot @idx and install it with
 * __native_set_fixmap().
 *
 * @idx:  fixmap index
 * @phys: address to map; reduced to a frame number and treated as a pfn
 *        for the "local page" slots and as an mfn for everything else
 * @prot: protection bits (ignored for the APIC slots, which always map
 *        dummy_mapping with PAGE_KERNEL)
 */
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
	pte_t pte;

	/* From here on phys is a frame number, not an address. */
	phys >>= PAGE_SHIFT;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
	case FIX_RO_IDT:
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
	case VSYSCALL_PAGE:
#endif
	case FIX_TEXT_POKE0:
	case FIX_TEXT_POKE1:
	case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
		/* All local page mappings */
		pte = pfn_pte(phys, prot);
		break;

#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

#ifdef CONFIG_X86_IO_APIC
	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
		/*
		 * We just don't map the IO APIC - all access is via
		 * hypercalls.  The slot is pointed at dummy_mapping; the
		 * requested address is not recorded in the pte.
		 */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

	case FIX_PARAVIRT_BOOTMAP:
		/* This is an MFN, but it isn't an IO mapping from the
		   IO domain */
		pte = mfn_pte(phys, prot);
		break;

	default:
		/* By default, set_fixmap is used for hardware mappings */
		pte = mfn_pte(phys, prot);
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_VSYSCALL_EMULATION
	/* Replicate changes to map the vsyscall page into the user
	   pagetable vsyscall mapping. */
	if (idx == VSYSCALL_PAGE) {
		unsigned long vaddr = __fix_to_virt(idx);
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
	}
#endif
}
2403 
2404 static void __init xen_post_allocator_init(void)
2405 {
2406 	if (xen_feature(XENFEAT_auto_translated_physmap))
2407 		return;
2408 
2409 	pv_mmu_ops.set_pte = xen_set_pte;
2410 	pv_mmu_ops.set_pmd = xen_set_pmd;
2411 	pv_mmu_ops.set_pud = xen_set_pud;
2412 #if CONFIG_PGTABLE_LEVELS >= 4
2413 	pv_mmu_ops.set_p4d = xen_set_p4d;
2414 #endif
2415 
2416 	/* This will work as long as patching hasn't happened yet
2417 	   (which it hasn't) */
2418 	pv_mmu_ops.alloc_pte = xen_alloc_pte;
2419 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2420 	pv_mmu_ops.release_pte = xen_release_pte;
2421 	pv_mmu_ops.release_pmd = xen_release_pmd;
2422 #if CONFIG_PGTABLE_LEVELS >= 4
2423 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
2424 	pv_mmu_ops.release_pud = xen_release_pud;
2425 #endif
2426 	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
2427 
2428 #ifdef CONFIG_X86_64
2429 	pv_mmu_ops.write_cr3 = &xen_write_cr3;
2430 	SetPagePinned(virt_to_page(level3_user_vsyscall));
2431 #endif
2432 	xen_mark_init_mm_pinned();
2433 }
2434 
/*
 * lazy_mode.leave hook: flush the pending multicall batch with
 * xen_mc_flush() and then leave lazy MMU mode.  Preemption is disabled
 * across both steps.
 */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();
	xen_mc_flush();
	paravirt_leave_lazy_mmu();
	preempt_enable();
}
2442 
/*
 * Boot-time pv_mmu_ops template, copied into pv_mmu_ops by
 * xen_init_mmu_ops().  Several hooks start out as *_init / *_hyper
 * variants and are replaced later by xen_post_allocator_init(); on 32-bit
 * write_cr3 is replaced by xen_write_cr3_init() itself.
 */
static const struct pv_mmu_ops xen_mmu_ops __initconst = {
	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
	.write_cr3 = xen_write_cr3_init,

	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,

	.pgd_alloc = xen_pgd_alloc,
	.pgd_free = xen_pgd_free,

	.alloc_pte = xen_alloc_pte_init,
	.release_pte = xen_release_pte_init,
	.alloc_pmd = xen_alloc_pmd_init,
	.release_pmd = xen_release_pmd_init,

	.set_pte = xen_set_pte_init,
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd_hyper,

	.ptep_modify_prot_start = __ptep_modify_prot_start,
	.ptep_modify_prot_commit = __ptep_modify_prot_commit,

	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

	.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,
#endif	/* CONFIG_X86_PAE */
	.set_pud = xen_set_pud_hyper,

	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if CONFIG_PGTABLE_LEVELS >= 4
	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
	.set_p4d = xen_set_p4d_hyper,

	/*
	 * The pud hooks reuse the pmd *_init helpers during early boot;
	 * xen_post_allocator_init() switches them to xen_alloc_pud /
	 * xen_release_pud.
	 */
	.alloc_pud = xen_alloc_pmd_init,
	.release_pud = xen_release_pmd_init,
#endif	/* CONFIG_PGTABLE_LEVELS >= 4 */

	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_mmu,
		.leave = xen_leave_lazy_mmu,
		.flush = paravirt_flush_lazy_mmu,
	},

	.set_fixmap = xen_set_fixmap,
};
2509 
2510 void __init xen_init_mmu_ops(void)
2511 {
2512 	x86_init.paging.pagetable_init = xen_pagetable_init;
2513 
2514 	if (xen_feature(XENFEAT_auto_translated_physmap))
2515 		return;
2516 
2517 	pv_mmu_ops = xen_mmu_ops;
2518 
2519 	memset(dummy_mapping, 0xff, PAGE_SIZE);
2520 }
2521 
/*
 * Largest order accepted by xen_create_contiguous_region() and
 * xen_destroy_contiguous_region(); both bail out for larger requests.
 */
#define MAX_CONTIG_ORDER 9 /* 2MB */
/*
 * Scratch frame list shared by the contiguous region helpers.
 * Protected by xen_reservation_lock.
 */
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2525 
2526 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2527 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2528 				unsigned long *in_frames,
2529 				unsigned long *out_frames)
2530 {
2531 	int i;
2532 	struct multicall_space mcs;
2533 
2534 	xen_mc_batch();
2535 	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2536 		mcs = __xen_mc_entry(0);
2537 
2538 		if (in_frames)
2539 			in_frames[i] = virt_to_mfn(vaddr);
2540 
2541 		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2542 		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2543 
2544 		if (out_frames)
2545 			out_frames[i] = virt_to_pfn(vaddr);
2546 	}
2547 	xen_mc_issue(0);
2548 }
2549 
2550 /*
2551  * Update the pfn-to-mfn mappings for a virtual address range, either to
2552  * point to an array of mfns, or contiguously from a single starting
2553  * mfn.
2554  */
2555 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2556 				     unsigned long *mfns,
2557 				     unsigned long first_mfn)
2558 {
2559 	unsigned i, limit;
2560 	unsigned long mfn;
2561 
2562 	xen_mc_batch();
2563 
2564 	limit = 1u << order;
2565 	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2566 		struct multicall_space mcs;
2567 		unsigned flags;
2568 
2569 		mcs = __xen_mc_entry(0);
2570 		if (mfns)
2571 			mfn = mfns[i];
2572 		else
2573 			mfn = first_mfn + i;
2574 
2575 		if (i < (limit - 1))
2576 			flags = 0;
2577 		else {
2578 			if (order == 0)
2579 				flags = UVMF_INVLPG | UVMF_ALL;
2580 			else
2581 				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2582 		}
2583 
2584 		MULTI_update_va_mapping(mcs.mc, vaddr,
2585 				mfn_pte(mfn, PAGE_KERNEL), flags);
2586 
2587 		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2588 	}
2589 
2590 	xen_mc_issue(0);
2591 }
2592 
2593 /*
2594  * Perform the hypercall to exchange a region of our pfns to point to
2595  * memory with the required contiguous alignment.  Takes the pfns as
2596  * input, and populates mfns as output.
2597  *
2598  * Returns a success code indicating whether the hypervisor was able to
2599  * satisfy the request or not.
2600  */
2601 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2602 			       unsigned long *pfns_in,
2603 			       unsigned long extents_out,
2604 			       unsigned int order_out,
2605 			       unsigned long *mfns_out,
2606 			       unsigned int address_bits)
2607 {
2608 	long rc;
2609 	int success;
2610 
2611 	struct xen_memory_exchange exchange = {
2612 		.in = {
2613 			.nr_extents   = extents_in,
2614 			.extent_order = order_in,
2615 			.extent_start = pfns_in,
2616 			.domid        = DOMID_SELF
2617 		},
2618 		.out = {
2619 			.nr_extents   = extents_out,
2620 			.extent_order = order_out,
2621 			.extent_start = mfns_out,
2622 			.address_bits = address_bits,
2623 			.domid        = DOMID_SELF
2624 		}
2625 	};
2626 
2627 	BUG_ON(extents_in << order_in != extents_out << order_out);
2628 
2629 	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2630 	success = (exchange.nr_exchanged == extents_in);
2631 
2632 	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2633 	BUG_ON(success && (rc != 0));
2634 
2635 	return success;
2636 }
2637 
2638 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2639 				 unsigned int address_bits,
2640 				 dma_addr_t *dma_handle)
2641 {
2642 	unsigned long *in_frames = discontig_frames, out_frame;
2643 	unsigned long  flags;
2644 	int            success;
2645 	unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2646 
2647 	/*
2648 	 * Currently an auto-translated guest will not perform I/O, nor will
2649 	 * it require PAE page directories below 4GB. Therefore any calls to
2650 	 * this function are redundant and can be ignored.
2651 	 */
2652 
2653 	if (xen_feature(XENFEAT_auto_translated_physmap))
2654 		return 0;
2655 
2656 	if (unlikely(order > MAX_CONTIG_ORDER))
2657 		return -ENOMEM;
2658 
2659 	memset((void *) vstart, 0, PAGE_SIZE << order);
2660 
2661 	spin_lock_irqsave(&xen_reservation_lock, flags);
2662 
2663 	/* 1. Zap current PTEs, remembering MFNs. */
2664 	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2665 
2666 	/* 2. Get a new contiguous memory extent. */
2667 	out_frame = virt_to_pfn(vstart);
2668 	success = xen_exchange_memory(1UL << order, 0, in_frames,
2669 				      1, order, &out_frame,
2670 				      address_bits);
2671 
2672 	/* 3. Map the new extent in place of old pages. */
2673 	if (success)
2674 		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2675 	else
2676 		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2677 
2678 	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2679 
2680 	*dma_handle = virt_to_machine(vstart).maddr;
2681 	return success ? 0 : -ENOMEM;
2682 }
2683 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2684 
2685 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2686 {
2687 	unsigned long *out_frames = discontig_frames, in_frame;
2688 	unsigned long  flags;
2689 	int success;
2690 	unsigned long vstart;
2691 
2692 	if (xen_feature(XENFEAT_auto_translated_physmap))
2693 		return;
2694 
2695 	if (unlikely(order > MAX_CONTIG_ORDER))
2696 		return;
2697 
2698 	vstart = (unsigned long)phys_to_virt(pstart);
2699 	memset((void *) vstart, 0, PAGE_SIZE << order);
2700 
2701 	spin_lock_irqsave(&xen_reservation_lock, flags);
2702 
2703 	/* 1. Find start MFN of contiguous extent. */
2704 	in_frame = virt_to_mfn(vstart);
2705 
2706 	/* 2. Zap current PTEs. */
2707 	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2708 
2709 	/* 3. Do the exchange for non-contiguous MFNs. */
2710 	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2711 					0, out_frames, 0);
2712 
2713 	/* 4. Map new pages in place of old pages. */
2714 	if (success)
2715 		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2716 	else
2717 		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2718 
2719 	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2720 }
2721 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2722 
2723 #ifdef CONFIG_KEXEC_CORE
2724 phys_addr_t paddr_vmcoreinfo_note(void)
2725 {
2726 	if (xen_pv_domain())
2727 		return virt_to_machine(&vmcoreinfo_note).maddr;
2728 	else
2729 		return __pa_symbol(&vmcoreinfo_note);
2730 }
2731 #endif /* CONFIG_KEXEC_CORE */
2732