xref: /openbmc/linux/arch/x86/xen/mmu.c (revision 2d8ad8719591fa803b0d589ed057fa46f49b7155)
1 /*
2  * Xen mmu operations
3  *
4  * This file contains the various mmu fetch and update operations.
5  * The most important job they must perform is the mapping between the
6  * domain's pfn and the overall machine mfns.
7  *
8  * Xen allows guests to directly update the pagetable, in a controlled
9  * fashion.  In other words, the guest modifies the same pagetable
10  * that the CPU actually uses, which eliminates the overhead of having
11  * a separate shadow pagetable.
12  *
13  * In order to allow this, it falls on the guest domain to map its
14  * notion of a "physical" pfn - which is just a domain-local linear
15  * address - into a real "machine address" which the CPU's MMU can
16  * use.
17  *
18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19  * inserted directly into the pagetable.  When creating a new
20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
22  * the mfn back into a pfn.
23  *
24  * The other constraint is that all pages which make up a pagetable
25  * must be mapped read-only in the guest.  This prevents uncontrolled
26  * guest updates to the pagetable.  Xen strictly enforces this, and
27  * will disallow any pagetable update which will end up mapping a
28  * pagetable page RW, and will disallow using any writable page as a
29  * pagetable.
30  *
31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
32  * would need to validate the whole pagetable before going on.
33  * Naturally, this is quite slow.  The solution is to "pin" a
34  * pagetable, which enforces all the constraints on the pagetable even
35  * when it is not actively in use.  This means that Xen can be assured
36  * that it is still valid when you do load it into %cr3, and doesn't
37  * need to revalidate it.
38  *
39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40  */
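/*
 * Illustrative sketch (not from the original file): conceptually, the guest
 * builds a machine pte by swapping a pfn for its mfn and keeping the flags,
 * e.g. assuming the pfn has a valid p2m entry:
 */
#if 0	/* example only -- the real conversions are pte_pfn_to_mfn() et al. below */
static inline pteval_t example_machine_pte(unsigned long pfn, pteval_t flags)
{
	return ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
}
#endif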
41 #include <linux/sched.h>
42 #include <linux/highmem.h>
43 #include <linux/debugfs.h>
44 #include <linux/bug.h>
45 #include <linux/vmalloc.h>
46 #include <linux/module.h>
47 #include <linux/gfp.h>
48 #include <linux/memblock.h>
49 
50 #include <asm/pgtable.h>
51 #include <asm/tlbflush.h>
52 #include <asm/fixmap.h>
53 #include <asm/mmu_context.h>
54 #include <asm/setup.h>
55 #include <asm/paravirt.h>
56 #include <asm/e820.h>
57 #include <asm/linkage.h>
58 #include <asm/page.h>
59 #include <asm/init.h>
60 #include <asm/pat.h>
61 
62 #include <asm/xen/hypercall.h>
63 #include <asm/xen/hypervisor.h>
64 
65 #include <xen/xen.h>
66 #include <xen/page.h>
67 #include <xen/interface/xen.h>
68 #include <xen/interface/hvm/hvm_op.h>
69 #include <xen/interface/version.h>
70 #include <xen/interface/memory.h>
71 #include <xen/hvc-console.h>
72 
73 #include "multicalls.h"
74 #include "mmu.h"
75 #include "debugfs.h"
76 
77 #define MMU_UPDATE_HISTO	30
78 
79 /*
80  * Protects atomic reservation decrease/increase against concurrent increases.
81  * Also protects non-atomic updates of current_pages and driver_pages, and
82  * balloon lists.
83  */
84 DEFINE_SPINLOCK(xen_reservation_lock);
85 
86 #ifdef CONFIG_XEN_DEBUG_FS
87 
88 static struct {
89 	u32 pgd_update;
90 	u32 pgd_update_pinned;
91 	u32 pgd_update_batched;
92 
93 	u32 pud_update;
94 	u32 pud_update_pinned;
95 	u32 pud_update_batched;
96 
97 	u32 pmd_update;
98 	u32 pmd_update_pinned;
99 	u32 pmd_update_batched;
100 
101 	u32 pte_update;
102 	u32 pte_update_pinned;
103 	u32 pte_update_batched;
104 
105 	u32 mmu_update;
106 	u32 mmu_update_extended;
107 	u32 mmu_update_histo[MMU_UPDATE_HISTO];
108 
109 	u32 prot_commit;
110 	u32 prot_commit_batched;
111 
112 	u32 set_pte_at;
113 	u32 set_pte_at_batched;
114 	u32 set_pte_at_pinned;
115 	u32 set_pte_at_current;
116 	u32 set_pte_at_kernel;
117 } mmu_stats;
118 
119 static u8 zero_stats;
120 
121 static inline void check_zero(void)
122 {
123 	if (unlikely(zero_stats)) {
124 		memset(&mmu_stats, 0, sizeof(mmu_stats));
125 		zero_stats = 0;
126 	}
127 }
128 
129 #define ADD_STATS(elem, val)			\
130 	do { check_zero(); mmu_stats.elem += (val); } while(0)
131 
132 #else  /* !CONFIG_XEN_DEBUG_FS */
133 
134 #define ADD_STATS(elem, val)	do { (void)(val); } while(0)
135 
136 #endif /* CONFIG_XEN_DEBUG_FS */
137 
138 
139 /*
140  * Identity map, in addition to plain kernel map.  This needs to be
141  * large enough to allocate the page table pages needed to map the rest.
142  * Each page can map 2MB.
143  */
144 #define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
145 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
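/*
 * Worked sizing (assuming 4K pages and 512 ptes per pte page, as on 64-bit
 * and 32-bit PAE): each pte page maps 512 * 4K = 2MB, so the four pte pages
 * reserved here cover 8MB of identity mapping.
 */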
146 
147 #ifdef CONFIG_X86_64
148 /* l3 pud for userspace vsyscall mapping */
149 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
150 #endif /* CONFIG_X86_64 */
151 
152 /*
153  * Note about cr3 (pagetable base) values:
154  *
155  * xen_cr3 contains the current logical cr3 value; it contains the
156  * last set cr3.  This may not be the current effective cr3, because
157  * its update may still be lazily deferred.  However, a vcpu looking
158  * at its own cr3 can use this value knowing that everything will
159  * be self-consistent.
160  *
161  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
162  * hypercall to set the vcpu cr3 is complete (so it may be a little
163  * out of date, but it will never be set early).  If one vcpu is
164  * looking at another vcpu's cr3 value, it should use this variable.
165  */
166 DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
167 DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
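/*
 * Hedged usage sketch (not from the original file): a vcpu's own logical cr3
 * is read with percpu_read(xen_cr3); checking whether some other vcpu is
 * still using a given pagetable must go through per_cpu(xen_current_cr3, cpu),
 * as xen_drop_mm_ref() does below.  The helper name here is hypothetical.
 */
#if 0	/* example only */
static inline bool example_cpu_uses_pgd(int cpu, pgd_t *pgd)
{
	return per_cpu(xen_current_cr3, cpu) == __pa(pgd);
}
#endif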
168 
169 
170 /*
171  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
172  * redzone above it, so round it up to a PGD boundary.
173  */
174 #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 
176 unsigned long arbitrary_virt_to_mfn(void *vaddr)
177 {
178 	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
179 
180 	return PFN_DOWN(maddr.maddr);
181 }
182 
183 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
184 {
185 	unsigned long address = (unsigned long)vaddr;
186 	unsigned int level;
187 	pte_t *pte;
188 	unsigned offset;
189 
190 	/*
191 	 * if the vaddr is in the linearly mapped range, we can just use
192 	 * the (quick) virt_to_machine() p2m lookup
193 	 */
194 	if (virt_addr_valid(vaddr))
195 		return virt_to_machine(vaddr);
196 
197 	/* otherwise we have to do a (slower) full page-table walk */
198 
199 	pte = lookup_address(address, &level);
200 	BUG_ON(pte == NULL);
201 	offset = address & ~PAGE_MASK;
202 	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
203 }
204 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
205 
206 void make_lowmem_page_readonly(void *vaddr)
207 {
208 	pte_t *pte, ptev;
209 	unsigned long address = (unsigned long)vaddr;
210 	unsigned int level;
211 
212 	pte = lookup_address(address, &level);
213 	if (pte == NULL)
214 		return;		/* vaddr missing */
215 
216 	ptev = pte_wrprotect(*pte);
217 
218 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
219 		BUG();
220 }
221 
222 void make_lowmem_page_readwrite(void *vaddr)
223 {
224 	pte_t *pte, ptev;
225 	unsigned long address = (unsigned long)vaddr;
226 	unsigned int level;
227 
228 	pte = lookup_address(address, &level);
229 	if (pte == NULL)
230 		return;		/* vaddr missing */
231 
232 	ptev = pte_mkwrite(*pte);
233 
234 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
235 		BUG();
236 }
237 
238 
239 static bool xen_page_pinned(void *ptr)
240 {
241 	struct page *page = virt_to_page(ptr);
242 
243 	return PagePinned(page);
244 }
245 
246 static bool xen_iomap_pte(pte_t pte)
247 {
248 	return pte_flags(pte) & _PAGE_IOMAP;
249 }
250 
251 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
252 {
253 	struct multicall_space mcs;
254 	struct mmu_update *u;
255 
256 	mcs = xen_mc_entry(sizeof(*u));
257 	u = mcs.args;
258 
259 	/* ptep might be kmapped when using 32-bit HIGHPTE */
260 	u->ptr = arbitrary_virt_to_machine(ptep).maddr;
261 	u->val = pte_val_ma(pteval);
262 
263 	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
264 
265 	xen_mc_issue(PARAVIRT_LAZY_MMU);
266 }
267 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
268 
269 static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
270 {
271 	xen_set_domain_pte(ptep, pteval, DOMID_IO);
272 }
273 
274 static void xen_extend_mmu_update(const struct mmu_update *update)
275 {
276 	struct multicall_space mcs;
277 	struct mmu_update *u;
278 
279 	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
280 
281 	if (mcs.mc != NULL) {
282 		ADD_STATS(mmu_update_extended, 1);
283 		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
284 
285 		mcs.mc->args[1]++;
286 
287 		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
288 			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
289 		else
290 			ADD_STATS(mmu_update_histo[0], 1);
291 	} else {
292 		ADD_STATS(mmu_update, 1);
293 		mcs = __xen_mc_entry(sizeof(*u));
294 		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
295 		ADD_STATS(mmu_update_histo[1], 1);
296 	}
297 
298 	u = mcs.args;
299 	*u = *update;
300 }
301 
302 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
303 {
304 	struct mmu_update u;
305 
306 	preempt_disable();
307 
308 	xen_mc_batch();
309 
310 	/* ptr may be ioremapped for 64-bit pagetable setup */
311 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
312 	u.val = pmd_val_ma(val);
313 	xen_extend_mmu_update(&u);
314 
315 	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
316 
317 	xen_mc_issue(PARAVIRT_LAZY_MMU);
318 
319 	preempt_enable();
320 }
321 
322 void xen_set_pmd(pmd_t *ptr, pmd_t val)
323 {
324 	ADD_STATS(pmd_update, 1);
325 
326 	/* If page is not pinned, we can just update the entry
327 	   directly */
328 	if (!xen_page_pinned(ptr)) {
329 		*ptr = val;
330 		return;
331 	}
332 
333 	ADD_STATS(pmd_update_pinned, 1);
334 
335 	xen_set_pmd_hyper(ptr, val);
336 }
337 
338 /*
339  * Associate a virtual page frame with a given physical page frame
340  * and protection flags for that frame.
341  */
342 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
343 {
344 	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
345 }
346 
347 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
348 		    pte_t *ptep, pte_t pteval)
349 {
350 	if (xen_iomap_pte(pteval)) {
351 		xen_set_iomap_pte(ptep, pteval);
352 		goto out;
353 	}
354 
355 	ADD_STATS(set_pte_at, 1);
356 //	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
357 	ADD_STATS(set_pte_at_current, mm == current->mm);
358 	ADD_STATS(set_pte_at_kernel, mm == &init_mm);
359 
360 	if (mm == current->mm || mm == &init_mm) {
361 		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
362 			struct multicall_space mcs;
363 			mcs = xen_mc_entry(0);
364 
365 			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
366 			ADD_STATS(set_pte_at_batched, 1);
367 			xen_mc_issue(PARAVIRT_LAZY_MMU);
368 			goto out;
369 		} else
370 			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
371 				goto out;
372 	}
373 	xen_set_pte(ptep, pteval);
374 
375 out:	return;
376 }
377 
378 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
379 				 unsigned long addr, pte_t *ptep)
380 {
381 	/* Just return the pte as-is.  We preserve the bits on commit */
382 	return *ptep;
383 }
384 
385 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
386 				 pte_t *ptep, pte_t pte)
387 {
388 	struct mmu_update u;
389 
390 	xen_mc_batch();
391 
392 	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
393 	u.val = pte_val_ma(pte);
394 	xen_extend_mmu_update(&u);
395 
396 	ADD_STATS(prot_commit, 1);
397 	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
398 
399 	xen_mc_issue(PARAVIRT_LAZY_MMU);
400 }
401 
402 /* Assume pteval_t is equivalent to all the other *val_t types. */
403 static pteval_t pte_mfn_to_pfn(pteval_t val)
404 {
405 	if (val & _PAGE_PRESENT) {
406 		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
407 		pteval_t flags = val & PTE_FLAGS_MASK;
408 		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
409 	}
410 
411 	return val;
412 }
413 
414 static pteval_t pte_pfn_to_mfn(pteval_t val)
415 {
416 	if (val & _PAGE_PRESENT) {
417 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
418 		pteval_t flags = val & PTE_FLAGS_MASK;
419 		unsigned long mfn = pfn_to_mfn(pfn);
420 
421 		/*
422 		 * If there's no mfn for the pfn, then just create an
423 		 * empty non-present pte.  Unfortunately this loses
424 		 * information about the original pfn, so
425 		 * pte_mfn_to_pfn is asymmetric (see the worked example below).
426 		 */
427 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
428 			mfn = 0;
429 			flags = 0;
430 		}
431 
432 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
433 	}
434 
435 	return val;
436 }
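/*
 * Worked example of the asymmetry noted above (values illustrative): a
 * present pte for pfn 0x1234 whose p2m slot is INVALID_P2M_ENTRY is turned
 * into the empty pte value 0; converting that back with pte_mfn_to_pfn()
 * gives a non-present pte for pfn 0, not the original 0x1234.
 */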
437 
438 static pteval_t iomap_pte(pteval_t val)
439 {
440 	if (val & _PAGE_PRESENT) {
441 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
442 		pteval_t flags = val & PTE_FLAGS_MASK;
443 
444 		/* We assume the pte frame number is an MFN, so
445 		   just use it as-is. */
446 		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
447 	}
448 
449 	return val;
450 }
451 
452 pteval_t xen_pte_val(pte_t pte)
453 {
454 	pteval_t pteval = pte.pte;
455 
456 	/* If this is a WC pte, convert back from Xen WC to Linux WC */
457 	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
458 		WARN_ON(!pat_enabled);
459 		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
460 	}
461 
462 	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
463 		return pteval;
464 
465 	return pte_mfn_to_pfn(pteval);
466 }
467 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
468 
469 pgdval_t xen_pgd_val(pgd_t pgd)
470 {
471 	return pte_mfn_to_pfn(pgd.pgd);
472 }
473 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
474 
475 /*
476  * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
477  * are reserved for now, to correspond to the Intel-reserved PAT
478  * types.
479  *
480  * We expect Linux's PAT set as follows:
481  *
482  * Idx  PTE flags        Linux    Xen    Default
483  * 0                     WB       WB     WB
484  * 1            PWT      WC       WT     WT
485  * 2        PCD          UC-      UC-    UC-
486  * 3        PCD PWT      UC       UC     UC
487  * 4    PAT              WB       WC     WB
488  * 5    PAT     PWT      WC       WP     WT
489  * 6    PAT PCD          UC-      UC     UC-
490  * 7    PAT PCD PWT      UC       UC     UC
491  */
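/*
 * Hedged sketch (not from the original file) of the WC fix-up done by
 * xen_pte_val() above and xen_make_pte() below: under the PAT layout in the
 * table above, Linux encodes WC at index 1 (PWT) while Xen has WC at index 4
 * (PAT), so the two flag bits are swapped on the way in and out.
 */
#if 0	/* example only */
static inline pteval_t example_linux_wc_to_xen_wc(pteval_t pte)
{
	return (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;	/* PWT -> PAT */
}

static inline pteval_t example_xen_wc_to_linux_wc(pteval_t pte)
{
	return (pte & ~_PAGE_PAT) | _PAGE_PWT;			/* PAT -> PWT */
}
#endif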
492 
493 void xen_set_pat(u64 pat)
494 {
495 	/* We expect Linux to use a PAT setting of
496 	 * UC UC- WC WB (ignoring the PAT flag) */
497 	WARN_ON(pat != 0x0007010600070106ull);
498 }
499 
500 pte_t xen_make_pte(pteval_t pte)
501 {
502 	phys_addr_t addr = (pte & PTE_PFN_MASK);
503 
504 	/* If Linux is trying to set a WC pte, then map to the Xen WC.
505 	 * If _PAGE_PAT is set, then it probably means it is really
506 	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
507 	 * things work out OK...
508 	 *
509 	 * (We should never see kernel mappings with _PAGE_PSE set,
510 	 * but we could see hugetlbfs mappings, I think.)
511 	 */
512 	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
513 		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
514 			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
515 	}
516 
517 	/*
518 	 * Unprivileged domains are allowed to do IOMAP mappings for
519 	 * PCI passthrough, but not map ISA space.  The ISA
520 	 * mappings are just dummy local mappings to keep other
521 	 * parts of the kernel happy.
522 	 */
523 	if (unlikely(pte & _PAGE_IOMAP) &&
524 	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
525 		pte = iomap_pte(pte);
526 	} else {
527 		pte &= ~_PAGE_IOMAP;
528 		pte = pte_pfn_to_mfn(pte);
529 	}
530 
531 	return native_make_pte(pte);
532 }
533 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
534 
535 pgd_t xen_make_pgd(pgdval_t pgd)
536 {
537 	pgd = pte_pfn_to_mfn(pgd);
538 	return native_make_pgd(pgd);
539 }
540 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
541 
542 pmdval_t xen_pmd_val(pmd_t pmd)
543 {
544 	return pte_mfn_to_pfn(pmd.pmd);
545 }
546 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
547 
548 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
549 {
550 	struct mmu_update u;
551 
552 	preempt_disable();
553 
554 	xen_mc_batch();
555 
556 	/* ptr may be ioremapped for 64-bit pagetable setup */
557 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
558 	u.val = pud_val_ma(val);
559 	xen_extend_mmu_update(&u);
560 
561 	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
562 
563 	xen_mc_issue(PARAVIRT_LAZY_MMU);
564 
565 	preempt_enable();
566 }
567 
568 void xen_set_pud(pud_t *ptr, pud_t val)
569 {
570 	ADD_STATS(pud_update, 1);
571 
572 	/* If page is not pinned, we can just update the entry
573 	   directly */
574 	if (!xen_page_pinned(ptr)) {
575 		*ptr = val;
576 		return;
577 	}
578 
579 	ADD_STATS(pud_update_pinned, 1);
580 
581 	xen_set_pud_hyper(ptr, val);
582 }
583 
584 void xen_set_pte(pte_t *ptep, pte_t pte)
585 {
586 	if (xen_iomap_pte(pte)) {
587 		xen_set_iomap_pte(ptep, pte);
588 		return;
589 	}
590 
591 	ADD_STATS(pte_update, 1);
592 //	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
593 	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
594 
595 #ifdef CONFIG_X86_PAE
596 	ptep->pte_high = pte.pte_high;
597 	smp_wmb();
598 	ptep->pte_low = pte.pte_low;
599 #else
600 	*ptep = pte;
601 #endif
602 }
603 
604 #ifdef CONFIG_X86_PAE
605 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
606 {
607 	if (xen_iomap_pte(pte)) {
608 		xen_set_iomap_pte(ptep, pte);
609 		return;
610 	}
611 
612 	set_64bit((u64 *)ptep, native_pte_val(pte));
613 }
614 
615 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
616 {
617 	ptep->pte_low = 0;
618 	smp_wmb();		/* make sure low gets written first */
619 	ptep->pte_high = 0;
620 }
621 
622 void xen_pmd_clear(pmd_t *pmdp)
623 {
624 	set_pmd(pmdp, __pmd(0));
625 }
626 #endif	/* CONFIG_X86_PAE */
627 
628 pmd_t xen_make_pmd(pmdval_t pmd)
629 {
630 	pmd = pte_pfn_to_mfn(pmd);
631 	return native_make_pmd(pmd);
632 }
633 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
634 
635 #if PAGETABLE_LEVELS == 4
636 pudval_t xen_pud_val(pud_t pud)
637 {
638 	return pte_mfn_to_pfn(pud.pud);
639 }
640 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
641 
642 pud_t xen_make_pud(pudval_t pud)
643 {
644 	pud = pte_pfn_to_mfn(pud);
645 
646 	return native_make_pud(pud);
647 }
648 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
649 
650 pgd_t *xen_get_user_pgd(pgd_t *pgd)
651 {
652 	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
653 	unsigned offset = pgd - pgd_page;
654 	pgd_t *user_ptr = NULL;
655 
656 	if (offset < pgd_index(USER_LIMIT)) {
657 		struct page *page = virt_to_page(pgd_page);
658 		user_ptr = (pgd_t *)page->private;
659 		if (user_ptr)
660 			user_ptr += offset;
661 	}
662 
663 	return user_ptr;
664 }
665 
666 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
667 {
668 	struct mmu_update u;
669 
670 	u.ptr = virt_to_machine(ptr).maddr;
671 	u.val = pgd_val_ma(val);
672 	xen_extend_mmu_update(&u);
673 }
674 
675 /*
676  * Raw hypercall-based set_pgd, intended for use in early boot before
677  * there's a page structure.  This implies:
678  *  1. The only existing pagetable is the kernel's
679  *  2. It is always pinned
680  *  3. It has no user pagetable attached to it
681  */
682 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
683 {
684 	preempt_disable();
685 
686 	xen_mc_batch();
687 
688 	__xen_set_pgd_hyper(ptr, val);
689 
690 	xen_mc_issue(PARAVIRT_LAZY_MMU);
691 
692 	preempt_enable();
693 }
694 
695 void xen_set_pgd(pgd_t *ptr, pgd_t val)
696 {
697 	pgd_t *user_ptr = xen_get_user_pgd(ptr);
698 
699 	ADD_STATS(pgd_update, 1);
700 
701 	/* If page is not pinned, we can just update the entry
702 	   directly */
703 	if (!xen_page_pinned(ptr)) {
704 		*ptr = val;
705 		if (user_ptr) {
706 			WARN_ON(xen_page_pinned(user_ptr));
707 			*user_ptr = val;
708 		}
709 		return;
710 	}
711 
712 	ADD_STATS(pgd_update_pinned, 1);
713 	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
714 
715 	/* If it's pinned, then we can at least batch the kernel and
716 	   user updates together. */
717 	xen_mc_batch();
718 
719 	__xen_set_pgd_hyper(ptr, val);
720 	if (user_ptr)
721 		__xen_set_pgd_hyper(user_ptr, val);
722 
723 	xen_mc_issue(PARAVIRT_LAZY_MMU);
724 }
725 #endif	/* PAGETABLE_LEVELS == 4 */
726 
727 /*
728  * (Yet another) pagetable walker.  This one is intended for pinning a
729  * pagetable.  This means that it walks a pagetable and calls the
730  * callback function on each page it finds making up the page table,
731  * at every level.  It walks the entire pagetable, but it only bothers
732  * pinning pte pages which are below limit.  In the normal case this
733  * will be STACK_TOP_MAX, but at boot we need to pin up to
734  * FIXADDR_TOP.
735  *
736  * For 32-bit the important bit is that we don't pin beyond there,
737  * because then we start getting into Xen's ptes.
738  *
739  * For 64-bit, we must skip the Xen hole in the middle of the address
740  * space, just after the big x86-64 virtual hole.
741  */
742 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
743 			  int (*func)(struct mm_struct *mm, struct page *,
744 				      enum pt_level),
745 			  unsigned long limit)
746 {
747 	int flush = 0;
748 	unsigned hole_low, hole_high;
749 	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
750 	unsigned pgdidx, pudidx, pmdidx;
751 
752 	/* The limit is the last byte to be touched */
753 	limit--;
754 	BUG_ON(limit >= FIXADDR_TOP);
755 
756 	if (xen_feature(XENFEAT_auto_translated_physmap))
757 		return 0;
758 
759 	/*
760 	 * 64-bit has a great big hole in the middle of the address
761 	 * space, which contains the Xen mappings.  On 32-bit these
762 	 * will end up making a zero-sized hole, so this is a no-op.
763 	 */
764 	hole_low = pgd_index(USER_LIMIT);
765 	hole_high = pgd_index(PAGE_OFFSET);
766 
767 	pgdidx_limit = pgd_index(limit);
768 #if PTRS_PER_PUD > 1
769 	pudidx_limit = pud_index(limit);
770 #else
771 	pudidx_limit = 0;
772 #endif
773 #if PTRS_PER_PMD > 1
774 	pmdidx_limit = pmd_index(limit);
775 #else
776 	pmdidx_limit = 0;
777 #endif
778 
779 	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
780 		pud_t *pud;
781 
782 		if (pgdidx >= hole_low && pgdidx < hole_high)
783 			continue;
784 
785 		if (!pgd_val(pgd[pgdidx]))
786 			continue;
787 
788 		pud = pud_offset(&pgd[pgdidx], 0);
789 
790 		if (PTRS_PER_PUD > 1) /* not folded */
791 			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
792 
793 		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
794 			pmd_t *pmd;
795 
796 			if (pgdidx == pgdidx_limit &&
797 			    pudidx > pudidx_limit)
798 				goto out;
799 
800 			if (pud_none(pud[pudidx]))
801 				continue;
802 
803 			pmd = pmd_offset(&pud[pudidx], 0);
804 
805 			if (PTRS_PER_PMD > 1) /* not folded */
806 				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
807 
808 			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
809 				struct page *pte;
810 
811 				if (pgdidx == pgdidx_limit &&
812 				    pudidx == pudidx_limit &&
813 				    pmdidx > pmdidx_limit)
814 					goto out;
815 
816 				if (pmd_none(pmd[pmdidx]))
817 					continue;
818 
819 				pte = pmd_page(pmd[pmdidx]);
820 				flush |= (*func)(mm, pte, PT_PTE);
821 			}
822 		}
823 	}
824 
825 out:
826 	/* Do the top level last, so that the callbacks can use it as
827 	   a cue to do final things like tlb flushes. */
828 	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
829 
830 	return flush;
831 }
832 
833 static int xen_pgd_walk(struct mm_struct *mm,
834 			int (*func)(struct mm_struct *mm, struct page *,
835 				    enum pt_level),
836 			unsigned long limit)
837 {
838 	return __xen_pgd_walk(mm, mm->pgd, func, limit);
839 }
840 
841 /* If we're using split pte locks, then take the page's lock and
842    return a pointer to it.  Otherwise return NULL. */
843 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
844 {
845 	spinlock_t *ptl = NULL;
846 
847 #if USE_SPLIT_PTLOCKS
848 	ptl = __pte_lockptr(page);
849 	spin_lock_nest_lock(ptl, &mm->page_table_lock);
850 #endif
851 
852 	return ptl;
853 }
854 
855 static void xen_pte_unlock(void *v)
856 {
857 	spinlock_t *ptl = v;
858 	spin_unlock(ptl);
859 }
860 
861 static void xen_do_pin(unsigned level, unsigned long pfn)
862 {
863 	struct mmuext_op *op;
864 	struct multicall_space mcs;
865 
866 	mcs = __xen_mc_entry(sizeof(*op));
867 	op = mcs.args;
868 	op->cmd = level;
869 	op->arg1.mfn = pfn_to_mfn(pfn);
870 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
871 }
872 
873 static int xen_pin_page(struct mm_struct *mm, struct page *page,
874 			enum pt_level level)
875 {
876 	unsigned pgfl = TestSetPagePinned(page);
877 	int flush;
878 
879 	if (pgfl)
880 		flush = 0;		/* already pinned */
881 	else if (PageHighMem(page))
882 		/* kmaps need flushing if we found an unpinned
883 		   highpage */
884 		flush = 1;
885 	else {
886 		void *pt = lowmem_page_address(page);
887 		unsigned long pfn = page_to_pfn(page);
888 		struct multicall_space mcs = __xen_mc_entry(0);
889 		spinlock_t *ptl;
890 
891 		flush = 0;
892 
893 		/*
894 		 * We need to hold the pagetable lock between the time
895 		 * we make the pagetable RO and when we actually pin
896 		 * it.  If we don't, then other users may come in and
897 		 * attempt to update the pagetable by writing it,
898 		 * which will fail because the memory is RO but not
899 		 * pinned, so Xen won't do the trap'n'emulate.
900 		 *
901 		 * If we're using split pte locks, we can't hold the
902 		 * entire pagetable's worth of locks during the
903 		 * traverse, because we may wrap the preempt count (8
904 		 * bits).  The solution is to mark RO and pin each PTE
905 		 * page while holding the lock.  This means the number
906 		 * of locks we end up holding is never more than a
907 		 * batch size (~32 entries, at present).
908 		 *
909 		 * If we're not using split pte locks, we needn't pin
910 		 * the PTE pages independently, because we're
911 		 * protected by the overall pagetable lock.
912 		 */
913 		ptl = NULL;
914 		if (level == PT_PTE)
915 			ptl = xen_pte_lock(page, mm);
916 
917 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
918 					pfn_pte(pfn, PAGE_KERNEL_RO),
919 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
920 
921 		if (ptl) {
922 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
923 
924 			/* Queue a deferred unlock for when this batch
925 			   is completed. */
926 			xen_mc_callback(xen_pte_unlock, ptl);
927 		}
928 	}
929 
930 	return flush;
931 }
932 
933 /* This is called just after a mm has been created, but it has not
934    been used yet.  We need to make sure that its pagetable is all
935    read-only, and can be pinned. */
936 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
937 {
938 	xen_mc_batch();
939 
940 	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
941 		/* re-enable interrupts for flushing */
942 		xen_mc_issue(0);
943 
944 		kmap_flush_unused();
945 
946 		xen_mc_batch();
947 	}
948 
949 #ifdef CONFIG_X86_64
950 	{
951 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
952 
953 		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
954 
955 		if (user_pgd) {
956 			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
957 			xen_do_pin(MMUEXT_PIN_L4_TABLE,
958 				   PFN_DOWN(__pa(user_pgd)));
959 		}
960 	}
961 #else /* CONFIG_X86_32 */
962 #ifdef CONFIG_X86_PAE
963 	/* Need to make sure unshared kernel PMD is pinnable */
964 	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
965 		     PT_PMD);
966 #endif
967 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
968 #endif /* CONFIG_X86_64 */
969 	xen_mc_issue(0);
970 }
971 
972 static void xen_pgd_pin(struct mm_struct *mm)
973 {
974 	__xen_pgd_pin(mm, mm->pgd);
975 }
976 
977 /*
978  * On save, we need to pin all pagetables to make sure they get their
979  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
980  * them (unpinned pgds are not currently in use, probably because the
981  * process is under construction or destruction).
982  *
983  * Expected to be called in stop_machine() ("equivalent to taking
984  * every spinlock in the system"), so the locking doesn't really
985  * matter all that much.
986  */
987 void xen_mm_pin_all(void)
988 {
989 	unsigned long flags;
990 	struct page *page;
991 
992 	spin_lock_irqsave(&pgd_lock, flags);
993 
994 	list_for_each_entry(page, &pgd_list, lru) {
995 		if (!PagePinned(page)) {
996 			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
997 			SetPageSavePinned(page);
998 		}
999 	}
1000 
1001 	spin_unlock_irqrestore(&pgd_lock, flags);
1002 }
1003 
1004 /*
1005  * The init_mm pagetable is really pinned as soon as it's created, but
1006  * that's before we have page structures to store the bits.  So do all
1007  * the book-keeping now.
1008  */
1009 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1010 				  enum pt_level level)
1011 {
1012 	SetPagePinned(page);
1013 	return 0;
1014 }
1015 
1016 static void __init xen_mark_init_mm_pinned(void)
1017 {
1018 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1019 }
1020 
1021 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1022 			  enum pt_level level)
1023 {
1024 	unsigned pgfl = TestClearPagePinned(page);
1025 
1026 	if (pgfl && !PageHighMem(page)) {
1027 		void *pt = lowmem_page_address(page);
1028 		unsigned long pfn = page_to_pfn(page);
1029 		spinlock_t *ptl = NULL;
1030 		struct multicall_space mcs;
1031 
1032 		/*
1033 		 * Do the converse to pin_page.  If we're using split
1034 		 * pte locks, we must be holding the lock while
1035 		 * the pte page is unpinned but still RO to prevent
1036 		 * concurrent updates from seeing it in this
1037 		 * partially-pinned state.
1038 		 */
1039 		if (level == PT_PTE) {
1040 			ptl = xen_pte_lock(page, mm);
1041 
1042 			if (ptl)
1043 				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1044 		}
1045 
1046 		mcs = __xen_mc_entry(0);
1047 
1048 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1049 					pfn_pte(pfn, PAGE_KERNEL),
1050 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1051 
1052 		if (ptl) {
1053 			/* unlock when batch completed */
1054 			xen_mc_callback(xen_pte_unlock, ptl);
1055 		}
1056 	}
1057 
1058 	return 0;		/* never need to flush on unpin */
1059 }
1060 
1061 /* Release a pagetable's pages back as normal RW */
1062 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1063 {
1064 	xen_mc_batch();
1065 
1066 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1067 
1068 #ifdef CONFIG_X86_64
1069 	{
1070 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
1071 
1072 		if (user_pgd) {
1073 			xen_do_pin(MMUEXT_UNPIN_TABLE,
1074 				   PFN_DOWN(__pa(user_pgd)));
1075 			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1076 		}
1077 	}
1078 #endif
1079 
1080 #ifdef CONFIG_X86_PAE
1081 	/* Need to make sure unshared kernel PMD is unpinned */
1082 	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1083 		       PT_PMD);
1084 #endif
1085 
1086 	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1087 
1088 	xen_mc_issue(0);
1089 }
1090 
1091 static void xen_pgd_unpin(struct mm_struct *mm)
1092 {
1093 	__xen_pgd_unpin(mm, mm->pgd);
1094 }
1095 
1096 /*
1097  * On resume, undo any pinning done at save, so that the rest of the
1098  * kernel doesn't see any unexpected pinned pagetables.
1099  */
1100 void xen_mm_unpin_all(void)
1101 {
1102 	unsigned long flags;
1103 	struct page *page;
1104 
1105 	spin_lock_irqsave(&pgd_lock, flags);
1106 
1107 	list_for_each_entry(page, &pgd_list, lru) {
1108 		if (PageSavePinned(page)) {
1109 			BUG_ON(!PagePinned(page));
1110 			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1111 			ClearPageSavePinned(page);
1112 		}
1113 	}
1114 
1115 	spin_unlock_irqrestore(&pgd_lock, flags);
1116 }
1117 
1118 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1119 {
1120 	spin_lock(&next->page_table_lock);
1121 	xen_pgd_pin(next);
1122 	spin_unlock(&next->page_table_lock);
1123 }
1124 
1125 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1126 {
1127 	spin_lock(&mm->page_table_lock);
1128 	xen_pgd_pin(mm);
1129 	spin_unlock(&mm->page_table_lock);
1130 }
1131 
1132 
1133 #ifdef CONFIG_SMP
1134 /* Another cpu may still have its %cr3 pointing at the pagetable, so
1135    we need to repoint it somewhere else before we can unpin it. */
1136 static void drop_other_mm_ref(void *info)
1137 {
1138 	struct mm_struct *mm = info;
1139 	struct mm_struct *active_mm;
1140 
1141 	active_mm = percpu_read(cpu_tlbstate.active_mm);
1142 
1143 	if (active_mm == mm)
1144 		leave_mm(smp_processor_id());
1145 
1146 	/* If this cpu still has a stale cr3 reference, then make sure
1147 	   it has been flushed. */
1148 	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1149 		load_cr3(swapper_pg_dir);
1150 }
1151 
1152 static void xen_drop_mm_ref(struct mm_struct *mm)
1153 {
1154 	cpumask_var_t mask;
1155 	unsigned cpu;
1156 
1157 	if (current->active_mm == mm) {
1158 		if (current->mm == mm)
1159 			load_cr3(swapper_pg_dir);
1160 		else
1161 			leave_mm(smp_processor_id());
1162 	}
1163 
1164 	/* Get the "official" set of cpus referring to our pagetable. */
1165 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1166 		for_each_online_cpu(cpu) {
1167 			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1168 			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1169 				continue;
1170 			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1171 		}
1172 		return;
1173 	}
1174 	cpumask_copy(mask, mm_cpumask(mm));
1175 
1176 	/* It's possible that a vcpu may have a stale reference to our
1177 	   cr3, because its in lazy mode, and it hasn't yet flushed
1178 	   its set of pending hypercalls yet.  In this case, we can
1179 	   look at its actual current cr3 value, and force it to flush
1180 	   if needed. */
1181 	for_each_online_cpu(cpu) {
1182 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1183 			cpumask_set_cpu(cpu, mask);
1184 	}
1185 
1186 	if (!cpumask_empty(mask))
1187 		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1188 	free_cpumask_var(mask);
1189 }
1190 #else
1191 static void xen_drop_mm_ref(struct mm_struct *mm)
1192 {
1193 	if (current->active_mm == mm)
1194 		load_cr3(swapper_pg_dir);
1195 }
1196 #endif
1197 
1198 /*
1199  * While a process runs, Xen pins its pagetables, which means that the
1200  * hypervisor forces it to be read-only, and it controls all updates
1201  * to it.  This means that all pagetable updates have to go via the
1202  * hypervisor, which is moderately expensive.
1203  *
1204  * Since we're pulling the pagetable down, we switch to init_mm,
1205  * unpin the old process's pagetable and mark it all read-write, which
1206  * allows further operations on it to be simple memory accesses.
1207  *
1208  * The only subtle point is that another CPU may be still using the
1209  * pagetable because of lazy tlb flushing.  This means we need to
1210  * switch all CPUs off this pagetable before we can unpin it.
1211  */
1212 void xen_exit_mmap(struct mm_struct *mm)
1213 {
1214 	get_cpu();		/* make sure we don't move around */
1215 	xen_drop_mm_ref(mm);
1216 	put_cpu();
1217 
1218 	spin_lock(&mm->page_table_lock);
1219 
1220 	/* pgd may not be pinned in the error exit path of execve */
1221 	if (xen_page_pinned(mm->pgd))
1222 		xen_pgd_unpin(mm);
1223 
1224 	spin_unlock(&mm->page_table_lock);
1225 }
1226 
1227 static __init void xen_pagetable_setup_start(pgd_t *base)
1228 {
1229 }
1230 
1231 static void xen_post_allocator_init(void);
1232 
1233 static __init void xen_pagetable_setup_done(pgd_t *base)
1234 {
1235 	xen_setup_shared_info();
1236 	xen_post_allocator_init();
1237 }
1238 
1239 static void xen_write_cr2(unsigned long cr2)
1240 {
1241 	percpu_read(xen_vcpu)->arch.cr2 = cr2;
1242 }
1243 
1244 static unsigned long xen_read_cr2(void)
1245 {
1246 	return percpu_read(xen_vcpu)->arch.cr2;
1247 }
1248 
1249 unsigned long xen_read_cr2_direct(void)
1250 {
1251 	return percpu_read(xen_vcpu_info.arch.cr2);
1252 }
1253 
1254 static void xen_flush_tlb(void)
1255 {
1256 	struct mmuext_op *op;
1257 	struct multicall_space mcs;
1258 
1259 	preempt_disable();
1260 
1261 	mcs = xen_mc_entry(sizeof(*op));
1262 
1263 	op = mcs.args;
1264 	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1265 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1266 
1267 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1268 
1269 	preempt_enable();
1270 }
1271 
1272 static void xen_flush_tlb_single(unsigned long addr)
1273 {
1274 	struct mmuext_op *op;
1275 	struct multicall_space mcs;
1276 
1277 	preempt_disable();
1278 
1279 	mcs = xen_mc_entry(sizeof(*op));
1280 	op = mcs.args;
1281 	op->cmd = MMUEXT_INVLPG_LOCAL;
1282 	op->arg1.linear_addr = addr & PAGE_MASK;
1283 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1284 
1285 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1286 
1287 	preempt_enable();
1288 }
1289 
1290 static void xen_flush_tlb_others(const struct cpumask *cpus,
1291 				 struct mm_struct *mm, unsigned long va)
1292 {
1293 	struct {
1294 		struct mmuext_op op;
1295 		DECLARE_BITMAP(mask, NR_CPUS);
1296 	} *args;
1297 	struct multicall_space mcs;
1298 
1299 	if (cpumask_empty(cpus))
1300 		return;		/* nothing to do */
1301 
1302 	mcs = xen_mc_entry(sizeof(*args));
1303 	args = mcs.args;
1304 	args->op.arg2.vcpumask = to_cpumask(args->mask);
1305 
1306 	/* Remove us, and any offline CPUS. */
1307 	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1308 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1309 
1310 	if (va == TLB_FLUSH_ALL) {
1311 		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1312 	} else {
1313 		args->op.cmd = MMUEXT_INVLPG_MULTI;
1314 		args->op.arg1.linear_addr = va;
1315 	}
1316 
1317 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1318 
1319 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1320 }
1321 
1322 static unsigned long xen_read_cr3(void)
1323 {
1324 	return percpu_read(xen_cr3);
1325 }
1326 
1327 static void set_current_cr3(void *v)
1328 {
1329 	percpu_write(xen_current_cr3, (unsigned long)v);
1330 }
1331 
1332 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1333 {
1334 	struct mmuext_op *op;
1335 	struct multicall_space mcs;
1336 	unsigned long mfn;
1337 
1338 	if (cr3)
1339 		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1340 	else
1341 		mfn = 0;
1342 
1343 	WARN_ON(mfn == 0 && kernel);
1344 
1345 	mcs = __xen_mc_entry(sizeof(*op));
1346 
1347 	op = mcs.args;
1348 	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1349 	op->arg1.mfn = mfn;
1350 
1351 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1352 
1353 	if (kernel) {
1354 		percpu_write(xen_cr3, cr3);
1355 
1356 		/* Update xen_current_cr3 once the batch has actually
1357 		   been submitted. */
1358 		xen_mc_callback(set_current_cr3, (void *)cr3);
1359 	}
1360 }
1361 
1362 static void xen_write_cr3(unsigned long cr3)
1363 {
1364 	BUG_ON(preemptible());
1365 
1366 	xen_mc_batch();  /* disables interrupts */
1367 
1368 	/* Update while interrupts are disabled, so it's atomic with
1369 	   respect to ipis */
1370 	percpu_write(xen_cr3, cr3);
1371 
1372 	__xen_write_cr3(true, cr3);
1373 
1374 #ifdef CONFIG_X86_64
1375 	{
1376 		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1377 		if (user_pgd)
1378 			__xen_write_cr3(false, __pa(user_pgd));
1379 		else
1380 			__xen_write_cr3(false, 0);
1381 	}
1382 #endif
1383 
1384 	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1385 }
1386 
1387 static int xen_pgd_alloc(struct mm_struct *mm)
1388 {
1389 	pgd_t *pgd = mm->pgd;
1390 	int ret = 0;
1391 
1392 	BUG_ON(PagePinned(virt_to_page(pgd)));
1393 
1394 #ifdef CONFIG_X86_64
1395 	{
1396 		struct page *page = virt_to_page(pgd);
1397 		pgd_t *user_pgd;
1398 
1399 		BUG_ON(page->private != 0);
1400 
1401 		ret = -ENOMEM;
1402 
1403 		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1404 		page->private = (unsigned long)user_pgd;
1405 
1406 		if (user_pgd != NULL) {
1407 			user_pgd[pgd_index(VSYSCALL_START)] =
1408 				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1409 			ret = 0;
1410 		}
1411 
1412 		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1413 	}
1414 #endif
1415 
1416 	return ret;
1417 }
1418 
1419 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1420 {
1421 #ifdef CONFIG_X86_64
1422 	pgd_t *user_pgd = xen_get_user_pgd(pgd);
1423 
1424 	if (user_pgd)
1425 		free_page((unsigned long)user_pgd);
1426 #endif
1427 }
1428 
1429 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1430 {
1431 	unsigned long pfn = pte_pfn(pte);
1432 
1433 #ifdef CONFIG_X86_32
1434 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
1435 	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1436 		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1437 			       pte_val_ma(pte));
1438 #endif
1439 
1440 	/*
1441 	 * If the new pfn is within the range of the newly allocated
1442 	 * kernel pagetable, and it isn't being mapped into an
1443 	 * early_ioremap fixmap slot, make sure it is RO.
1444 	 */
1445 	if (!is_early_ioremap_ptep(ptep) &&
1446 	    pfn >= e820_table_start && pfn < e820_table_end)
1447 		pte = pte_wrprotect(pte);
1448 
1449 	return pte;
1450 }
1451 
1452 /* Init-time set_pte while constructing initial pagetables, which
1453    doesn't allow RO pagetable pages to be remapped RW */
1454 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1455 {
1456 	pte = mask_rw_pte(ptep, pte);
1457 
1458 	xen_set_pte(ptep, pte);
1459 }
1460 
1461 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1462 {
1463 	struct mmuext_op op;
1464 	op.cmd = cmd;
1465 	op.arg1.mfn = pfn_to_mfn(pfn);
1466 	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1467 		BUG();
1468 }
1469 
1470 /* Early in boot, while setting up the initial pagetable, assume
1471    everything is pinned. */
1472 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1473 {
1474 #ifdef CONFIG_FLATMEM
1475 	BUG_ON(mem_map);	/* should only be used early */
1476 #endif
1477 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1478 	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1479 }
1480 
1481 /* Used for pmd and pud */
1482 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1483 {
1484 #ifdef CONFIG_FLATMEM
1485 	BUG_ON(mem_map);	/* should only be used early */
1486 #endif
1487 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1488 }
1489 
1490 /* Early release_pte assumes that all pts are pinned, since there's
1491    only init_mm and anything attached to that is pinned. */
1492 static __init void xen_release_pte_init(unsigned long pfn)
1493 {
1494 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1495 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1496 }
1497 
1498 static __init void xen_release_pmd_init(unsigned long pfn)
1499 {
1500 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1501 }
1502 
1503 /* This needs to make sure the new pte page is pinned iff it's being
1504    attached to a pinned pagetable. */
1505 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1506 {
1507 	struct page *page = pfn_to_page(pfn);
1508 
1509 	if (PagePinned(virt_to_page(mm->pgd))) {
1510 		SetPagePinned(page);
1511 
1512 		if (!PageHighMem(page)) {
1513 			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1514 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1515 				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1516 		} else {
1517 			/* make sure there are no stray mappings of
1518 			   this page */
1519 			kmap_flush_unused();
1520 		}
1521 	}
1522 }
1523 
1524 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1525 {
1526 	xen_alloc_ptpage(mm, pfn, PT_PTE);
1527 }
1528 
1529 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1530 {
1531 	xen_alloc_ptpage(mm, pfn, PT_PMD);
1532 }
1533 
1534 /* This should never happen until we're OK to use struct page */
1535 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1536 {
1537 	struct page *page = pfn_to_page(pfn);
1538 
1539 	if (PagePinned(page)) {
1540 		if (!PageHighMem(page)) {
1541 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1542 				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1543 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1544 		}
1545 		ClearPagePinned(page);
1546 	}
1547 }
1548 
1549 static void xen_release_pte(unsigned long pfn)
1550 {
1551 	xen_release_ptpage(pfn, PT_PTE);
1552 }
1553 
1554 static void xen_release_pmd(unsigned long pfn)
1555 {
1556 	xen_release_ptpage(pfn, PT_PMD);
1557 }
1558 
1559 #if PAGETABLE_LEVELS == 4
1560 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1561 {
1562 	xen_alloc_ptpage(mm, pfn, PT_PUD);
1563 }
1564 
1565 static void xen_release_pud(unsigned long pfn)
1566 {
1567 	xen_release_ptpage(pfn, PT_PUD);
1568 }
1569 #endif
1570 
1571 void __init xen_reserve_top(void)
1572 {
1573 #ifdef CONFIG_X86_32
1574 	unsigned long top = HYPERVISOR_VIRT_START;
1575 	struct xen_platform_parameters pp;
1576 
1577 	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1578 		top = pp.virt_start;
1579 
1580 	reserve_top_address(-top);
1581 #endif	/* CONFIG_X86_32 */
1582 }
1583 
1584 /*
1585  * Like __va(), but returns the address in the kernel mapping (which is
1586  * all we have until the physical memory mapping has been set up).
1587  */
1588 static void *__ka(phys_addr_t paddr)
1589 {
1590 #ifdef CONFIG_X86_64
1591 	return (void *)(paddr + __START_KERNEL_map);
1592 #else
1593 	return __va(paddr);
1594 #endif
1595 }
1596 
1597 /* Convert a machine address to physical address */
1598 static unsigned long m2p(phys_addr_t maddr)
1599 {
1600 	phys_addr_t paddr;
1601 
1602 	maddr &= PTE_PFN_MASK;
1603 	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1604 
1605 	return paddr;
1606 }
1607 
1608 /* Convert a machine address to kernel virtual */
1609 static void *m2v(phys_addr_t maddr)
1610 {
1611 	return __ka(m2p(maddr));
1612 }
1613 
1614 /* Set the page permissions on identity-mapped pages */
1615 static void set_page_prot(void *addr, pgprot_t prot)
1616 {
1617 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1618 	pte_t pte = pfn_pte(pfn, prot);
1619 
1620 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1621 		BUG();
1622 }
1623 
1624 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1625 {
1626 	unsigned pmdidx, pteidx;
1627 	unsigned ident_pte;
1628 	unsigned long pfn;
1629 
1630 	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1631 				      PAGE_SIZE);
1632 
1633 	ident_pte = 0;
1634 	pfn = 0;
1635 	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1636 		pte_t *pte_page;
1637 
1638 		/* Reuse or allocate a page of ptes */
1639 		if (pmd_present(pmd[pmdidx]))
1640 			pte_page = m2v(pmd[pmdidx].pmd);
1641 		else {
1642 			/* Check for free pte pages */
1643 			if (ident_pte == LEVEL1_IDENT_ENTRIES)
1644 				break;
1645 
1646 			pte_page = &level1_ident_pgt[ident_pte];
1647 			ident_pte += PTRS_PER_PTE;
1648 
1649 			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1650 		}
1651 
1652 		/* Install mappings */
1653 		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1654 			pte_t pte;
1655 
1656 			if (pfn > max_pfn_mapped)
1657 				max_pfn_mapped = pfn;
1658 
1659 			if (!pte_none(pte_page[pteidx]))
1660 				continue;
1661 
1662 			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1663 			pte_page[pteidx] = pte;
1664 		}
1665 	}
1666 
1667 	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1668 		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1669 
1670 	set_page_prot(pmd, PAGE_KERNEL_RO);
1671 }
1672 
1673 void __init xen_setup_machphys_mapping(void)
1674 {
1675 	struct xen_machphys_mapping mapping;
1676 	unsigned long machine_to_phys_nr_ents;
1677 
1678 	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1679 		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1680 		machine_to_phys_nr_ents = mapping.max_mfn + 1;
1681 	} else {
1682 		machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1683 	}
1684 	machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1685 }
1686 
1687 #ifdef CONFIG_X86_64
1688 static void convert_pfn_mfn(void *v)
1689 {
1690 	pte_t *pte = v;
1691 	int i;
1692 
1693 	/* All levels are converted the same way, so just treat them
1694 	   as ptes. */
1695 	for (i = 0; i < PTRS_PER_PTE; i++)
1696 		pte[i] = xen_make_pte(pte[i].pte);
1697 }
1698 
1699 /*
1700  * Set up the initial kernel pagetable.
1701  *
1702  * We can construct this by grafting the Xen provided pagetable into
1703  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1704  * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1705  * means that only the kernel has a physical mapping to start with -
1706  * but that's enough to get __va working.  We need to fill in the rest
1707  * of the physical mapping once some sort of allocator has been set
1708  * up.
1709  */
1710 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1711 					 unsigned long max_pfn)
1712 {
1713 	pud_t *l3;
1714 	pmd_t *l2;
1715 
1716 	/* Zap identity mapping */
1717 	init_level4_pgt[0] = __pgd(0);
1718 
1719 	/* Pre-constructed entries are in pfn, so convert to mfn */
1720 	convert_pfn_mfn(init_level4_pgt);
1721 	convert_pfn_mfn(level3_ident_pgt);
1722 	convert_pfn_mfn(level3_kernel_pgt);
1723 
1724 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1725 	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1726 
1727 	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1728 	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1729 
1730 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1731 	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1732 	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1733 
1734 	/* Set up identity map */
1735 	xen_map_identity_early(level2_ident_pgt, max_pfn);
1736 
1737 	/* Make pagetable pieces RO */
1738 	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1739 	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1740 	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1741 	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1742 	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1743 	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1744 
1745 	/* Pin down new L4 */
1746 	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1747 			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
1748 
1749 	/* Unpin Xen-provided one */
1750 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1751 
1752 	/* Switch over */
1753 	pgd = init_level4_pgt;
1754 
1755 	/*
1756 	 * At this stage there can be no user pgd, and no page
1757 	 * structure to attach it to, so make sure we just set kernel
1758 	 * pgd.
1759 	 */
1760 	xen_mc_batch();
1761 	__xen_write_cr3(true, __pa(pgd));
1762 	xen_mc_issue(PARAVIRT_LAZY_CPU);
1763 
1764 	memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1765 		      __pa(xen_start_info->pt_base +
1766 			   xen_start_info->nr_pt_frames * PAGE_SIZE),
1767 		      "XEN PAGETABLES");
1768 
1769 	return pgd;
1770 }
1771 #else	/* !CONFIG_X86_64 */
1772 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1773 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1774 
1775 static __init void xen_write_cr3_init(unsigned long cr3)
1776 {
1777 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1778 
1779 	BUG_ON(read_cr3() != __pa(initial_page_table));
1780 	BUG_ON(cr3 != __pa(swapper_pg_dir));
1781 
1782 	/*
1783 	 * We are switching to swapper_pg_dir for the first time (from
1784 	 * initial_page_table) and therefore need to mark that page
1785 	 * read-only and then pin it.
1786 	 *
1787 	 * Xen disallows sharing of kernel PMDs for PAE
1788 	 * guests. Therefore we must copy the kernel PMD from
1789 	 * initial_page_table into a new kernel PMD to be used in
1790 	 * swapper_pg_dir.
1791 	 */
1792 	swapper_kernel_pmd =
1793 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1794 	memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1795 	       sizeof(pmd_t) * PTRS_PER_PMD);
1796 	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1797 		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1798 	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1799 
1800 	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1801 	xen_write_cr3(cr3);
1802 	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1803 
1804 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1805 			  PFN_DOWN(__pa(initial_page_table)));
1806 	set_page_prot(initial_page_table, PAGE_KERNEL);
1807 	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1808 
1809 	pv_mmu_ops.write_cr3 = &xen_write_cr3;
1810 }
1811 
1812 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1813 					 unsigned long max_pfn)
1814 {
1815 	pmd_t *kernel_pmd;
1816 
1817 	initial_kernel_pmd =
1818 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1819 
1820 	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1821 				  xen_start_info->nr_pt_frames * PAGE_SIZE +
1822 				  512*1024);
1823 
1824 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1825 	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1826 
1827 	xen_map_identity_early(initial_kernel_pmd, max_pfn);
1828 
1829 	memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1830 	initial_page_table[KERNEL_PGD_BOUNDARY] =
1831 		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1832 
1833 	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1834 	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1835 	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1836 
1837 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1838 
1839 	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1840 			  PFN_DOWN(__pa(initial_page_table)));
1841 	xen_write_cr3(__pa(initial_page_table));
1842 
1843 	memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1844 		      __pa(xen_start_info->pt_base +
1845 			   xen_start_info->nr_pt_frames * PAGE_SIZE),
1846 		      "XEN PAGETABLES");
1847 
1848 	return initial_page_table;
1849 }
1850 #endif	/* CONFIG_X86_64 */
1851 
1852 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1853 
1854 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1855 {
1856 	pte_t pte;
1857 
1858 	phys >>= PAGE_SHIFT;
1859 
1860 	switch (idx) {
1861 	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1862 #ifdef CONFIG_X86_F00F_BUG
1863 	case FIX_F00F_IDT:
1864 #endif
1865 #ifdef CONFIG_X86_32
1866 	case FIX_WP_TEST:
1867 	case FIX_VDSO:
1868 # ifdef CONFIG_HIGHMEM
1869 	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1870 # endif
1871 #else
1872 	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1873 #endif
1874 	case FIX_TEXT_POKE0:
1875 	case FIX_TEXT_POKE1:
1876 		/* All local page mappings */
1877 		pte = pfn_pte(phys, prot);
1878 		break;
1879 
1880 #ifdef CONFIG_X86_LOCAL_APIC
1881 	case FIX_APIC_BASE:	/* maps dummy local APIC */
1882 		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1883 		break;
1884 #endif
1885 
1886 #ifdef CONFIG_X86_IO_APIC
1887 	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1888 		/*
1889 		 * We just don't map the IO APIC - all access is via
1890 		 * hypercalls.  Keep the address in the pte for reference.
1891 		 */
1892 		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1893 		break;
1894 #endif
1895 
1896 	case FIX_PARAVIRT_BOOTMAP:
1897 		/* This is an MFN, but it isn't an IO mapping from the
1898 		   IO domain */
1899 		pte = mfn_pte(phys, prot);
1900 		break;
1901 
1902 	default:
1903 		/* By default, set_fixmap is used for hardware mappings */
1904 		pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1905 		break;
1906 	}
1907 
1908 	__native_set_fixmap(idx, pte);
1909 
1910 #ifdef CONFIG_X86_64
1911 	/* Replicate changes to map the vsyscall page into the user
1912 	   pagetable vsyscall mapping. */
1913 	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1914 		unsigned long vaddr = __fix_to_virt(idx);
1915 		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1916 	}
1917 #endif
1918 }
1919 
1920 __init void xen_ident_map_ISA(void)
1921 {
1922 	unsigned long pa;
1923 
1924 	/*
1925 	 * If we're dom0, then linear map the ISA machine addresses into
1926 	 * the kernel's address space.
1927 	 */
1928 	if (!xen_initial_domain())
1929 		return;
1930 
1931 	xen_raw_printk("Xen: setup ISA identity maps\n");
1932 
1933 	for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1934 		pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1935 
1936 		if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1937 			BUG();
1938 	}
1939 
1940 	xen_flush_tlb();
1941 }
1942 
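/*
 * Once the kernel's page allocator is up, replace the boot-time (_init and
 * _hyper) variants installed in xen_mmu_ops below with their final
 * counterparts, which keep newly allocated pagetable pages read-only and
 * pinned as Xen requires.
 */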
1943 static __init void xen_post_allocator_init(void)
1944 {
1945 	pv_mmu_ops.set_pte = xen_set_pte;
1946 	pv_mmu_ops.set_pmd = xen_set_pmd;
1947 	pv_mmu_ops.set_pud = xen_set_pud;
1948 #if PAGETABLE_LEVELS == 4
1949 	pv_mmu_ops.set_pgd = xen_set_pgd;
1950 #endif
1951 
1952 	/* This will work as long as patching hasn't happened yet
1953 	   (which it hasn't) */
1954 	pv_mmu_ops.alloc_pte = xen_alloc_pte;
1955 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1956 	pv_mmu_ops.release_pte = xen_release_pte;
1957 	pv_mmu_ops.release_pmd = xen_release_pmd;
1958 #if PAGETABLE_LEVELS == 4
1959 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
1960 	pv_mmu_ops.release_pud = xen_release_pud;
1961 #endif
1962 
1963 #ifdef CONFIG_X86_64
1964 	SetPagePinned(virt_to_page(level3_user_vsyscall));
1965 #endif
1966 	xen_mark_init_mm_pinned();
1967 }
1968 
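/*
 * Leaving lazy MMU mode: flush any multicalls still queued for this batch
 * before returning to non-lazy operation, so that no pending updates are
 * lost.
 */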
1969 static void xen_leave_lazy_mmu(void)
1970 {
1971 	preempt_disable();
1972 	xen_mc_flush();
1973 	paravirt_leave_lazy_mmu();
1974 	preempt_enable();
1975 }
1976 
1977 static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1978 	.read_cr2 = xen_read_cr2,
1979 	.write_cr2 = xen_write_cr2,
1980 
1981 	.read_cr3 = xen_read_cr3,
1982 #ifdef CONFIG_X86_32
1983 	.write_cr3 = xen_write_cr3_init,
1984 #else
1985 	.write_cr3 = xen_write_cr3,
1986 #endif
1987 
1988 	.flush_tlb_user = xen_flush_tlb,
1989 	.flush_tlb_kernel = xen_flush_tlb,
1990 	.flush_tlb_single = xen_flush_tlb_single,
1991 	.flush_tlb_others = xen_flush_tlb_others,
1992 
1993 	.pte_update = paravirt_nop,
1994 	.pte_update_defer = paravirt_nop,
1995 
1996 	.pgd_alloc = xen_pgd_alloc,
1997 	.pgd_free = xen_pgd_free,
1998 
1999 	.alloc_pte = xen_alloc_pte_init,
2000 	.release_pte = xen_release_pte_init,
2001 	.alloc_pmd = xen_alloc_pmd_init,
2002 	.release_pmd = xen_release_pmd_init,
2003 
2004 	.set_pte = xen_set_pte_init,
2005 	.set_pte_at = xen_set_pte_at,
2006 	.set_pmd = xen_set_pmd_hyper,
2007 
2008 	.ptep_modify_prot_start = __ptep_modify_prot_start,
2009 	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
2010 
2011 	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
2012 	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2013 
2014 	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
2015 	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2016 
2017 #ifdef CONFIG_X86_PAE
2018 	.set_pte_atomic = xen_set_pte_atomic,
2019 	.pte_clear = xen_pte_clear,
2020 	.pmd_clear = xen_pmd_clear,
2021 #endif	/* CONFIG_X86_PAE */
2022 	.set_pud = xen_set_pud_hyper,
2023 
2024 	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2025 	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2026 
2027 #if PAGETABLE_LEVELS == 4
2028 	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
2029 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
2030 	.set_pgd = xen_set_pgd_hyper,
2031 
2032 	.alloc_pud = xen_alloc_pmd_init,
2033 	.release_pud = xen_release_pmd_init,
2034 #endif	/* PAGETABLE_LEVELS == 4 */
2035 
2036 	.activate_mm = xen_activate_mm,
2037 	.dup_mmap = xen_dup_mmap,
2038 	.exit_mmap = xen_exit_mmap,
2039 
2040 	.lazy_mode = {
2041 		.enter = paravirt_enter_lazy_mmu,
2042 		.leave = xen_leave_lazy_mmu,
2043 	},
2044 
2045 	.set_fixmap = xen_set_fixmap,
2046 };
2047 
2048 void __init xen_init_mmu_ops(void)
2049 {
2050 	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2051 	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2052 	pv_mmu_ops = xen_mmu_ops;
2053 
2054 	memset(dummy_mapping, 0xff, PAGE_SIZE);
2055 }
2056 
2057 /* Protected by xen_reservation_lock. */
2058 #define MAX_CONTIG_ORDER 9 /* 2MB */
2059 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2060 
2061 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
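/*
 * Replace the ptes covering 2^order pages starting at vaddr with VOID_PTE
 * and mark the underlying pfns as INVALID_P2M_ENTRY.  The old mfns backing
 * the range are recorded in in_frames and the pfns in out_frames (either
 * may be NULL), for use by a subsequent memory exchange.
 */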
2062 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2063 				unsigned long *in_frames,
2064 				unsigned long *out_frames)
2065 {
2066 	int i;
2067 	struct multicall_space mcs;
2068 
2069 	xen_mc_batch();
2070 	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2071 		mcs = __xen_mc_entry(0);
2072 
2073 		if (in_frames)
2074 			in_frames[i] = virt_to_mfn(vaddr);
2075 
2076 		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2077 		set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2078 
2079 		if (out_frames)
2080 			out_frames[i] = virt_to_pfn(vaddr);
2081 	}
2082 	xen_mc_issue(0);
2083 }
2084 
2085 /*
2086  * Update the pfn-to-mfn mappings for a virtual address range, either to
2087  * point to an array of mfns, or contiguously from a single starting
2088  * mfn.
2089  */
2090 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2091 				     unsigned long *mfns,
2092 				     unsigned long first_mfn)
2093 {
2094 	unsigned i, limit;
2095 	unsigned long mfn;
2096 
2097 	xen_mc_batch();
2098 
2099 	limit = 1u << order;
2100 	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2101 		struct multicall_space mcs;
2102 		unsigned flags;
2103 
2104 		mcs = __xen_mc_entry(0);
2105 		if (mfns)
2106 			mfn = mfns[i];
2107 		else
2108 			mfn = first_mfn + i;
2109 
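		/*
		 * Only the final update of the batch carries a flush: a
		 * single page needs just an INVLPG, larger regions get a
		 * full TLB flush, and UVMF_ALL applies it on all vcpus.
		 */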
2110 		if (i < (limit - 1))
2111 			flags = 0;
2112 		else {
2113 			if (order == 0)
2114 				flags = UVMF_INVLPG | UVMF_ALL;
2115 			else
2116 				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2117 		}
2118 
2119 		MULTI_update_va_mapping(mcs.mc, vaddr,
2120 				mfn_pte(mfn, PAGE_KERNEL), flags);
2121 
2122 		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2123 	}
2124 
2125 	xen_mc_issue(0);
2126 }
2127 
2128 /*
2129  * Perform the hypercall to exchange a region of our pfns to point to
2130  * memory with the required contiguous alignment.  Takes the pfns as
2131  * input, and populates mfns as output.
2132  *
2133  * Returns 1 if the hypervisor was able to satisfy the whole request,
2134  * 0 otherwise.
2135  */
2136 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2137 			       unsigned long *pfns_in,
2138 			       unsigned long extents_out,
2139 			       unsigned int order_out,
2140 			       unsigned long *mfns_out,
2141 			       unsigned int address_bits)
2142 {
2143 	long rc;
2144 	int success;
2145 
2146 	struct xen_memory_exchange exchange = {
2147 		.in = {
2148 			.nr_extents   = extents_in,
2149 			.extent_order = order_in,
2150 			.extent_start = pfns_in,
2151 			.domid        = DOMID_SELF
2152 		},
2153 		.out = {
2154 			.nr_extents   = extents_out,
2155 			.extent_order = order_out,
2156 			.extent_start = mfns_out,
2157 			.address_bits = address_bits,
2158 			.domid        = DOMID_SELF
2159 		}
2160 	};
2161 
2162 	BUG_ON(extents_in << order_in != extents_out << order_out);
2163 
2164 	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2165 	success = (exchange.nr_exchanged == extents_in);
2166 
2167 	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2168 	BUG_ON(success && (rc != 0));
2169 
2170 	return success;
2171 }
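
/*
 * A sketch of the two directions this is used in (values illustrative):
 * making an order-2 region machine-contiguous trades four single-page
 * extents for one order-2 extent below 4GB,
 *
 *	success = xen_exchange_memory(4, 0, pfns_in,	   in: 4 x 4kB extents
 *				      1, 2, &first_mfn,	   out: 1 x 16kB extent
 *				      32);
 *
 * while breaking a region up again reverses the extent counts and orders,
 * as the two callers below do.
 */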
2172 
2173 int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2174 				 unsigned int address_bits)
2175 {
2176 	unsigned long *in_frames = discontig_frames, out_frame;
2177 	unsigned long  flags;
2178 	int            success;
2179 
2180 	/*
2181 	 * Currently an auto-translated guest will not perform I/O, nor will
2182 	 * it require PAE page directories below 4GB. Therefore any calls to
2183 	 * this function are redundant and can be ignored.
2184 	 */
2185 
2186 	if (xen_feature(XENFEAT_auto_translated_physmap))
2187 		return 0;
2188 
2189 	if (unlikely(order > MAX_CONTIG_ORDER))
2190 		return -ENOMEM;
2191 
2192 	memset((void *) vstart, 0, PAGE_SIZE << order);
2193 
2194 	spin_lock_irqsave(&xen_reservation_lock, flags);
2195 
2196 	/* 1. Zap current PTEs, remembering MFNs. */
2197 	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2198 
2199 	/* 2. Get a new contiguous memory extent. */
2200 	out_frame = virt_to_pfn(vstart);
2201 	success = xen_exchange_memory(1UL << order, 0, in_frames,
2202 				      1, order, &out_frame,
2203 				      address_bits);
2204 
2205 	/* 3. Map the new extent in place of old pages. */
2206 	if (success)
2207 		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2208 	else
2209 		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2210 
2211 	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2212 
2213 	return success ? 0 : -ENOMEM;
2214 }
2215 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2216 
2217 void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2218 {
2219 	unsigned long *out_frames = discontig_frames, in_frame;
2220 	unsigned long  flags;
2221 	int success;
2222 
2223 	if (xen_feature(XENFEAT_auto_translated_physmap))
2224 		return;
2225 
2226 	if (unlikely(order > MAX_CONTIG_ORDER))
2227 		return;
2228 
2229 	memset((void *) vstart, 0, PAGE_SIZE << order);
2230 
2231 	spin_lock_irqsave(&xen_reservation_lock, flags);
2232 
2233 	/* 1. Find start MFN of contiguous extent. */
2234 	in_frame = virt_to_mfn(vstart);
2235 
2236 	/* 2. Zap current PTEs. */
2237 	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2238 
2239 	/* 3. Do the exchange for non-contiguous MFNs. */
2240 	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2241 					0, out_frames, 0);
2242 
2243 	/* 4. Map new pages in place of old pages. */
2244 	if (success)
2245 		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2246 	else
2247 		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2248 
2249 	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2250 }
2251 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
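
/*
 * A minimal usage sketch (error handling elided, values illustrative): a
 * caller needing a buffer that is contiguous in machine memory, e.g. for
 * DMA below 4GB, can allocate ordinary pages and exchange them.  Note that
 * both calls zero the region, so do this before filling the buffer.  On
 * failure the original frames are mapped back and -ENOMEM is returned.
 *
 *	unsigned long vstart = __get_free_pages(GFP_KERNEL, 2);
 *
 *	if (xen_create_contiguous_region(vstart, 2, 32))
 *		goto fail;
 *	... use the machine-contiguous buffer ...
 *	xen_destroy_contiguous_region(vstart, 2);
 *	free_pages(vstart, 2);
 */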
2252 
2253 #ifdef CONFIG_XEN_PVHVM
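/*
 * HVMOP_pagetable_dying tells the hypervisor that the pagetable rooted at
 * the given guest physical address is about to be torn down, so any shadow
 * state kept for it can be released eagerly rather than on demand.
 */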
2254 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2255 {
2256 	struct xen_hvm_pagetable_dying a;
2257 	int rc;
2258 
2259 	a.domid = DOMID_SELF;
2260 	a.gpa = __pa(mm->pgd);
2261 	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2262 	WARN_ON_ONCE(rc < 0);
2263 }
2264 
2265 static int is_pagetable_dying_supported(void)
2266 {
2267 	struct xen_hvm_pagetable_dying a;
2268 	int rc = 0;
2269 
2270 	a.domid = DOMID_SELF;
2271 	a.gpa = 0x00;
2272 	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2273 	if (rc < 0) {
2274 		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2275 		return 0;
2276 	}
2277 	return 1;
2278 }
2279 
2280 void __init xen_hvm_init_mmu_ops(void)
2281 {
2282 	if (is_pagetable_dying_supported())
2283 		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2284 }
2285 #endif
2286 
2287 #define REMAP_BATCH_SIZE 16
2288 
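/*
 * State shared with remap_area_mfn_pte_fn() while xen_remap_domain_mfn_range()
 * walks a range: the next machine frame to map, the protection to apply,
 * and the slot to fill in the current batch of mmu_update requests, which
 * is pushed to the hypervisor every REMAP_BATCH_SIZE pages.
 */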
2289 struct remap_data {
2290 	unsigned long mfn;
2291 	pgprot_t prot;
2292 	struct mmu_update *mmu_update;
2293 };
2294 
2295 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2296 				 unsigned long addr, void *data)
2297 {
2298 	struct remap_data *rmd = data;
2299 	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2300 
2301 	rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
2302 	rmd->mmu_update->val = pte_val_ma(pte);
2303 	rmd->mmu_update++;
2304 
2305 	return 0;
2306 }
2307 
2308 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2309 			       unsigned long addr,
2310 			       unsigned long mfn, int nr,
2311 			       pgprot_t prot, unsigned domid)
2312 {
2313 	struct remap_data rmd;
2314 	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2315 	int batch;
2316 	unsigned long range;
2317 	int err = 0;
2318 
2319 	prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2320 
2321 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2322 				(VM_PFNMAP | VM_RESERVED | VM_IO)));
2323 
2324 	rmd.mfn = mfn;
2325 	rmd.prot = prot;
2326 
2327 	while (nr) {
2328 		batch = min(REMAP_BATCH_SIZE, nr);
2329 		range = (unsigned long)batch << PAGE_SHIFT;
2330 
2331 		rmd.mmu_update = mmu_update;
2332 		err = apply_to_page_range(vma->vm_mm, addr, range,
2333 					  remap_area_mfn_pte_fn, &rmd);
2334 		if (err)
2335 			goto out;
2336 
2337 		err = -EFAULT;
2338 		if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2339 			goto out;
2340 
2341 		nr -= batch;
2342 		addr += range;
2343 	}
2344 
2345 	err = 0;
2346 out:
2347 
2348 	flush_tlb_all();
2349 
2350 	return err;
2351 }
2352 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
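
/*
 * A minimal usage sketch (all names illustrative, not from this file): a
 * driver's mmap() handler mapping nr machine frames of a foreign domain
 * into a userspace VMA.  The VMA must carry the VM_IO, VM_RESERVED and
 * VM_PFNMAP flags this function insists on.
 *
 *	static int example_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 *		return xen_remap_domain_mfn_range(vma, vma->vm_start,
 *						  first_mfn, nr,
 *						  vma->vm_page_prot,
 *						  foreign_domid);
 *	}
 */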
2353 
2354 #ifdef CONFIG_XEN_DEBUG_FS
2355 
2356 static struct dentry *d_mmu_debug;
2357 
2358 static int __init xen_mmu_debugfs(void)
2359 {
2360 	struct dentry *d_xen = xen_init_debugfs();
2361 
2362 	if (d_xen == NULL)
2363 		return -ENOMEM;
2364 
2365 	d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2366 
2367 	debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2368 
2369 	debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2370 	debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2371 			   &mmu_stats.pgd_update_pinned);
2372 	debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2373 			   &mmu_stats.pgd_update_batched);
2374 
2375 	debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2376 	debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2377 			   &mmu_stats.pud_update_pinned);
2378 	debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2379 			   &mmu_stats.pud_update_batched);
2380 
2381 	debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2382 	debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2383 			   &mmu_stats.pmd_update_pinned);
2384 	debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2385 			   &mmu_stats.pmd_update_batched);
2386 
2387 	debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2388 //	debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2389 //			   &mmu_stats.pte_update_pinned);
2390 	debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2391 			   &mmu_stats.pte_update_batched);
2392 
2393 	debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2394 	debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2395 			   &mmu_stats.mmu_update_extended);
2396 	xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2397 				     mmu_stats.mmu_update_histo, 20);
2398 
2399 	debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2400 	debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2401 			   &mmu_stats.set_pte_at_batched);
2402 	debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2403 			   &mmu_stats.set_pte_at_current);
2404 	debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2405 			   &mmu_stats.set_pte_at_kernel);
2406 
2407 	debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2408 	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2409 			   &mmu_stats.prot_commit_batched);
2410 
2411 	return 0;
2412 }
2413 fs_initcall(xen_mmu_debugfs);
2414 
2415 #endif	/* CONFIG_XEN_DEBUG_FS */
2416