xref: /openbmc/linux/arch/x86/xen/mmu.c (revision baa7eb025ab14f3cba2e35c0a8648f9c9f01d24f)
1 /*
2  * Xen mmu operations
3  *
4  * This file contains the various mmu fetch and update operations.
5  * The most important job they must perform is the mapping between the
6  * domain's pfn and the overall machine mfns.
7  *
8  * Xen allows guests to directly update the pagetable, in a controlled
9  * fashion.  In other words, the guest modifies the same pagetable
10  * that the CPU actually uses, which eliminates the overhead of having
11  * a separate shadow pagetable.
12  *
13  * In order to allow this, it falls on the guest domain to map its
14  * notion of a "physical" pfn - which is just a domain-local linear
15  * address - into a real "machine address" which the CPU's MMU can
16  * use.
17  *
18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19  * inserted directly into the pagetable.  When creating a new
20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
22  * the mfn back into a pfn.
23  *
24  * The other constraint is that all pages which make up a pagetable
25  * must be mapped read-only in the guest.  This prevents uncontrolled
26  * guest updates to the pagetable.  Xen strictly enforces this, and
27  * will disallow any pagetable update which will end up mapping a
28  * pagetable page RW, and will disallow using any writable page as a
29  * pagetable.
30  *
31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
32  * would need to validate the whole pagetable before going on.
33  * Naturally, this is quite slow.  The solution is to "pin" a
34  * pagetable, which enforces all the constraints on the pagetable even
35  * when it is not actively in use.  This means that Xen can be assured
36  * that it is still valid when you do load it into %cr3, and doesn't
37  * need to revalidate it.
38  *
39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40  */
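
/*
 * In code terms, the pfn<->mfn conversion described above is roughly
 *
 *	pte = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 *	pfn = mfn_to_pfn((pte & PTE_PFN_MASK) >> PAGE_SHIFT);
 *
 * This is only a sketch; see pte_pfn_to_mfn() and pte_mfn_to_pfn()
 * below for the real conversions, which also cope with non-present
 * entries and missing p2m slots.
 */
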
41 #include <linux/sched.h>
42 #include <linux/highmem.h>
43 #include <linux/debugfs.h>
44 #include <linux/bug.h>
45 #include <linux/vmalloc.h>
46 #include <linux/module.h>
47 #include <linux/gfp.h>
48 #include <linux/memblock.h>
49 
50 #include <asm/pgtable.h>
51 #include <asm/tlbflush.h>
52 #include <asm/fixmap.h>
53 #include <asm/mmu_context.h>
54 #include <asm/setup.h>
55 #include <asm/paravirt.h>
56 #include <asm/e820.h>
57 #include <asm/linkage.h>
58 #include <asm/page.h>
59 #include <asm/init.h>
60 #include <asm/pat.h>
61 
62 #include <asm/xen/hypercall.h>
63 #include <asm/xen/hypervisor.h>
64 
65 #include <xen/xen.h>
66 #include <xen/page.h>
67 #include <xen/interface/xen.h>
68 #include <xen/interface/hvm/hvm_op.h>
69 #include <xen/interface/version.h>
70 #include <xen/interface/memory.h>
71 #include <xen/hvc-console.h>
72 
73 #include "multicalls.h"
74 #include "mmu.h"
75 #include "debugfs.h"
76 
77 #define MMU_UPDATE_HISTO	30
78 
79 /*
80  * Protects atomic reservation decrease/increase against concurrent increases.
81  * Also protects non-atomic updates of current_pages and driver_pages, and
82  * balloon lists.
83  */
84 DEFINE_SPINLOCK(xen_reservation_lock);
85 
86 #ifdef CONFIG_XEN_DEBUG_FS
87 
88 static struct {
89 	u32 pgd_update;
90 	u32 pgd_update_pinned;
91 	u32 pgd_update_batched;
92 
93 	u32 pud_update;
94 	u32 pud_update_pinned;
95 	u32 pud_update_batched;
96 
97 	u32 pmd_update;
98 	u32 pmd_update_pinned;
99 	u32 pmd_update_batched;
100 
101 	u32 pte_update;
102 	u32 pte_update_pinned;
103 	u32 pte_update_batched;
104 
105 	u32 mmu_update;
106 	u32 mmu_update_extended;
107 	u32 mmu_update_histo[MMU_UPDATE_HISTO];
108 
109 	u32 prot_commit;
110 	u32 prot_commit_batched;
111 
112 	u32 set_pte_at;
113 	u32 set_pte_at_batched;
114 	u32 set_pte_at_pinned;
115 	u32 set_pte_at_current;
116 	u32 set_pte_at_kernel;
117 } mmu_stats;
118 
119 static u8 zero_stats;
120 
121 static inline void check_zero(void)
122 {
123 	if (unlikely(zero_stats)) {
124 		memset(&mmu_stats, 0, sizeof(mmu_stats));
125 		zero_stats = 0;
126 	}
127 }
128 
129 #define ADD_STATS(elem, val)			\
130 	do { check_zero(); mmu_stats.elem += (val); } while(0)
131 
132 #else  /* !CONFIG_XEN_DEBUG_FS */
133 
134 #define ADD_STATS(elem, val)	do { (void)(val); } while(0)
135 
136 #endif /* CONFIG_XEN_DEBUG_FS */
137 
138 
139 /*
140  * Identity map, in addition to plain kernel map.  This needs to be
141  * large enough to cover the page table pages needed to map the rest.
142  * Each page can map 2MB.
143  */
144 #define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
145 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
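
/*
 * Rough sizing sketch, assuming 4kB pages and 512 ptes per page
 * (64-bit or PAE): the four pte pages reserved above can identity-map
 * 4 * 512 * 4kB = 8MB.
 */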
146 
147 #ifdef CONFIG_X86_64
148 /* l3 pud for userspace vsyscall mapping */
149 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
150 #endif /* CONFIG_X86_64 */
151 
152 /*
153  * Note about cr3 (pagetable base) values:
154  *
155  * xen_cr3 contains the current logical cr3 value; it contains the
156  * last set cr3.  This may not be the current effective cr3, because
157  * its update may still be lazily deferred.  However, a vcpu looking
158  * at its own cr3 can use this value knowing that everything will
159  * be self-consistent.
160  *
161  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
162  * hypercall to set the vcpu cr3 is complete (so it may be a little
163  * out of date, but it will never be set early).  If one vcpu is
164  * looking at another vcpu's cr3 value, it should use this variable.
165  */
166 DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
167 DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
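
/*
 * For example, xen_drop_mm_ref() below follows this rule when it
 * checks whether some other vcpu might still be using a pagetable:
 *
 *	if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 *		cpumask_set_cpu(cpu, mask);
 */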
168 
169 
170 /*
171  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
172  * redzone above it, so round it up to a PGD boundary.
173  */
174 #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 
176 /*
177  * Xen leaves the responsibility for maintaining p2m mappings to the
178  * guests themselves, but it must also access and update the p2m array
179  * during suspend/resume when all the pages are reallocated.
180  *
181  * The p2m table is logically a flat array, but we implement it as a
182  * three-level tree to allow the address space to be sparse.
183  *
184  *                               Xen
185  *                                |
186  *     p2m_top              p2m_top_mfn
187  *       /  \                   /   \
188  * p2m_mid p2m_mid	p2m_mid_mfn p2m_mid_mfn
189  *    / \      / \         /           /
190  *  p2m p2m p2m p2m p2m p2m p2m ...
191  *
192  * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193  *
194  * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195  * maximum representable pseudo-physical address space is:
196  *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197  *
198  * P2M_PER_PAGE depends on the architecture, as an mfn is always
199  * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200  * 512 and 1024 entries respectively.
201  */
202 
203 unsigned long xen_max_p2m_pfn __read_mostly;
204 
205 #define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
206 #define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
207 #define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
208 
209 #define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
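
/*
 * Concretely, assuming 4kB pages: on 64-bit each level holds 512
 * entries, so MAX_P2M_PFN = 512^3 = 2^27 pfns (512GB of
 * pseudo-physical address space); on 32-bit each level holds 1024
 * entries, giving 2^30 pfns (4TB).
 */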
210 
211 /* Placeholders for holes in the address space */
212 static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
213 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214 static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
215 
216 static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
217 static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218 static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
219 
220 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
221 RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
222 
223 static inline unsigned p2m_top_index(unsigned long pfn)
224 {
225 	BUG_ON(pfn >= MAX_P2M_PFN);
226 	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227 }
228 
229 static inline unsigned p2m_mid_index(unsigned long pfn)
230 {
231 	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
232 }
233 
234 static inline unsigned p2m_index(unsigned long pfn)
235 {
236 	return pfn % P2M_PER_PAGE;
237 }
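
/*
 * Worked example of the index helpers above (64-bit, 512 entries per
 * level): for pfn 1000000 (~3.8GB with 4kB pages),
 *
 *	p2m_top_index(pfn) = 1000000 / (512 * 512) = 3
 *	p2m_mid_index(pfn) = (1000000 / 512) % 512 = 417
 *	p2m_index(pfn)     = 1000000 % 512         = 64
 *
 * so get_phys_to_machine() ends up reading p2m_top[3][417][64].
 */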
238 
239 static void p2m_top_init(unsigned long ***top)
240 {
241 	unsigned i;
242 
243 	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 		top[i] = p2m_mid_missing;
245 }
246 
247 static void p2m_top_mfn_init(unsigned long *top)
248 {
249 	unsigned i;
250 
251 	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253 }
254 
255 static void p2m_top_mfn_p_init(unsigned long **top)
256 {
257 	unsigned i;
258 
259 	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 		top[i] = p2m_mid_missing_mfn;
261 }
262 
263 static void p2m_mid_init(unsigned long **mid)
264 {
265 	unsigned i;
266 
267 	for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 		mid[i] = p2m_missing;
269 }
270 
271 static void p2m_mid_mfn_init(unsigned long *mid)
272 {
273 	unsigned i;
274 
275 	for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 		mid[i] = virt_to_mfn(p2m_missing);
277 }
278 
279 static void p2m_init(unsigned long *p2m)
280 {
281 	unsigned i;
282 
283 	for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 		p2m[i] = INVALID_P2M_ENTRY;
285 }
286 
287 /*
288  * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289  *
290  * This is called both at boot time, and after resuming from suspend:
291  * - At boot time we're called very early, and must use extend_brk()
292  *   to allocate memory.
293  *
294  * - After resume we're called from within stop_machine, but the mfn
295  *   tree should already be completely allocated.
296  */
297 void xen_build_mfn_list_list(void)
298 {
299 	unsigned long pfn;
300 
301 	/* Pre-initialize p2m_top_mfn to be completely missing */
302 	if (p2m_top_mfn == NULL) {
303 		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 		p2m_mid_mfn_init(p2m_mid_missing_mfn);
305 
306 		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 		p2m_top_mfn_p_init(p2m_top_mfn_p);
308 
309 		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 		p2m_top_mfn_init(p2m_top_mfn);
311 	} else {
312 		/* Reinitialise, mfns all change after migration */
313 		p2m_mid_mfn_init(p2m_mid_missing_mfn);
314 	}
315 
316 	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
317 		unsigned topidx = p2m_top_index(pfn);
318 		unsigned mididx = p2m_mid_index(pfn);
319 		unsigned long **mid;
320 		unsigned long *mid_mfn_p;
321 
322 		mid = p2m_top[topidx];
323 		mid_mfn_p = p2m_top_mfn_p[topidx];
324 
325 		/* Don't bother allocating any mfn mid levels if
326 		 * they're just missing; just update the stored mfn,
327 		 * since all of them could have changed over a migrate.
328 		 */
329 		if (mid == p2m_mid_missing) {
330 			BUG_ON(mididx);
331 			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 			continue;
335 		}
336 
337 		if (mid_mfn_p == p2m_mid_missing_mfn) {
338 			/*
339 			 * XXX boot-time only!  We should never find
340 			 * missing parts of the mfn tree at
341 			 * runtime.  extend_brk() will BUG if we call
342 			 * it too late.
343 			 */
344 			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 			p2m_mid_mfn_init(mid_mfn_p);
346 
347 			p2m_top_mfn_p[topidx] = mid_mfn_p;
348 		}
349 
350 		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
352 	}
353 }
354 
355 void xen_setup_mfn_list_list(void)
356 {
357 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
358 
359 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
360 		virt_to_mfn(p2m_top_mfn);
361 	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
362 }
363 
364 /* Set up p2m_top to point to the domain-builder provided p2m pages */
365 void __init xen_build_dynamic_phys_to_machine(void)
366 {
367 	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
368 	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
369 	unsigned long pfn;
370 
371 	xen_max_p2m_pfn = max_pfn;
372 
373 	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 	p2m_init(p2m_missing);
375 
376 	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 	p2m_mid_init(p2m_mid_missing);
378 
379 	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 	p2m_top_init(p2m_top);
381 
382 	/*
383 	 * The domain builder gives us a pre-constructed p2m array in
384 	 * mfn_list for all the pages initially given to us, so we just
385 	 * need to graft that into our tree structure.
386 	 */
387 	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
388 		unsigned topidx = p2m_top_index(pfn);
389 		unsigned mididx = p2m_mid_index(pfn);
390 
391 		if (p2m_top[topidx] == p2m_mid_missing) {
392 			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 			p2m_mid_init(mid);
394 
395 			p2m_top[topidx] = mid;
396 		}
397 
398 		p2m_top[topidx][mididx] = &mfn_list[pfn];
399 	}
400 }
401 
402 unsigned long get_phys_to_machine(unsigned long pfn)
403 {
404 	unsigned topidx, mididx, idx;
405 
406 	if (unlikely(pfn >= MAX_P2M_PFN))
407 		return INVALID_P2M_ENTRY;
408 
409 	topidx = p2m_top_index(pfn);
410 	mididx = p2m_mid_index(pfn);
411 	idx = p2m_index(pfn);
412 
413 	return p2m_top[topidx][mididx][idx];
414 }
415 EXPORT_SYMBOL_GPL(get_phys_to_machine);
416 
417 static void *alloc_p2m_page(void)
418 {
419 	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
420 }
421 
422 static void free_p2m_page(void *p)
423 {
424 	free_page((unsigned long)p);
425 }
426 
427 /*
428  * Fully allocate the p2m structure for a given pfn.  We need to check
429  * that both the top and mid levels are allocated, and make sure the
430  * parallel mfn tree is kept in sync.  We may race with other cpus, so
431  * the new pages are installed with cmpxchg; if we lose the race then
432  * simply free the page we allocated and use the one that's there.
433  */
434 static bool alloc_p2m(unsigned long pfn)
435 {
436 	unsigned topidx, mididx;
437 	unsigned long ***top_p, **mid;
438 	unsigned long *top_mfn_p, *mid_mfn;
439 
440 	topidx = p2m_top_index(pfn);
441 	mididx = p2m_mid_index(pfn);
442 
443 	top_p = &p2m_top[topidx];
444 	mid = *top_p;
445 
446 	if (mid == p2m_mid_missing) {
447 		/* Mid level is missing, allocate a new one */
448 		mid = alloc_p2m_page();
449 		if (!mid)
450 			return false;
451 
452 		p2m_mid_init(mid);
453 
454 		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 			free_p2m_page(mid);
456 	}
457 
458 	top_mfn_p = &p2m_top_mfn[topidx];
459 	mid_mfn = p2m_top_mfn_p[topidx];
460 
461 	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
462 
463 	if (mid_mfn == p2m_mid_missing_mfn) {
464 		/* Separately check the mid mfn level */
465 		unsigned long missing_mfn;
466 		unsigned long mid_mfn_mfn;
467 
468 		mid_mfn = alloc_p2m_page();
469 		if (!mid_mfn)
470 			return false;
471 
472 		p2m_mid_mfn_init(mid_mfn);
473 
474 		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 		mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 			free_p2m_page(mid_mfn);
478 		else
479 			p2m_top_mfn_p[topidx] = mid_mfn;
480 	}
481 
482 	if (p2m_top[topidx][mididx] == p2m_missing) {
483 		/* p2m leaf page is missing */
484 		unsigned long *p2m;
485 
486 		p2m = alloc_p2m_page();
487 		if (!p2m)
488 			return false;
489 
490 		p2m_init(p2m);
491 
492 		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 			free_p2m_page(p2m);
494 		else
495 			mid_mfn[mididx] = virt_to_mfn(p2m);
496 	}
497 
498 	return true;
499 }
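
/*
 * The pattern above, repeated once per level, is the usual lock-free
 * "install or discard" idiom:
 *
 *	new = alloc_p2m_page();
 *	p2m_init(new);				(or the mid/mid_mfn variant)
 *	if (cmpxchg(slot, missing, new) != missing)
 *		free_p2m_page(new);		(another cpu beat us to it)
 *
 * so two cpus racing on the same hole never both install a page.
 */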
500 
501 /* Try to install p2m mapping; fail if intermediate bits missing */
502 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
503 {
504 	unsigned topidx, mididx, idx;
505 
506 	if (unlikely(pfn >= MAX_P2M_PFN)) {
507 		BUG_ON(mfn != INVALID_P2M_ENTRY);
508 		return true;
509 	}
510 
511 	topidx = p2m_top_index(pfn);
512 	mididx = p2m_mid_index(pfn);
513 	idx = p2m_index(pfn);
514 
515 	if (p2m_top[topidx][mididx] == p2m_missing)
516 		return mfn == INVALID_P2M_ENTRY;
517 
518 	p2m_top[topidx][mididx][idx] = mfn;
519 
520 	return true;
521 }
522 
523 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
524 {
525 	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
526 		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
527 		return true;
528 	}
529 
530 	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
531 		if (!alloc_p2m(pfn))
532 			return false;
533 
534 		if (!__set_phys_to_machine(pfn, mfn))
535 			return false;
536 	}
537 
538 	return true;
539 }
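
/*
 * Illustrative caller pattern (not taken from this file): code which
 * hands a page back to Xen, or repopulates one, typically does
 *
 *	set_phys_to_machine(pfn, INVALID_P2M_ENTRY);	(page released)
 *	set_phys_to_machine(pfn, new_mfn);		(page repopulated)
 *
 * where the second call may allocate missing p2m levels via alloc_p2m().
 * (new_mfn here is just a placeholder name.)
 */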
540 
541 unsigned long arbitrary_virt_to_mfn(void *vaddr)
542 {
543 	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
544 
545 	return PFN_DOWN(maddr.maddr);
546 }
547 
548 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
549 {
550 	unsigned long address = (unsigned long)vaddr;
551 	unsigned int level;
552 	pte_t *pte;
553 	unsigned offset;
554 
555 	/*
556 	 * if the PFN is in the linear mapped vaddr range, we can just use
557 	 * the (quick) virt_to_machine() p2m lookup
558 	 */
559 	if (virt_addr_valid(vaddr))
560 		return virt_to_machine(vaddr);
561 
562 	/* otherwise we have to do a (slower) full page-table walk */
563 
564 	pte = lookup_address(address, &level);
565 	BUG_ON(pte == NULL);
566 	offset = address & ~PAGE_MASK;
567 	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
568 }
569 
570 void make_lowmem_page_readonly(void *vaddr)
571 {
572 	pte_t *pte, ptev;
573 	unsigned long address = (unsigned long)vaddr;
574 	unsigned int level;
575 
576 	pte = lookup_address(address, &level);
577 	if (pte == NULL)
578 		return;		/* vaddr missing */
579 
580 	ptev = pte_wrprotect(*pte);
581 
582 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
583 		BUG();
584 }
585 
586 void make_lowmem_page_readwrite(void *vaddr)
587 {
588 	pte_t *pte, ptev;
589 	unsigned long address = (unsigned long)vaddr;
590 	unsigned int level;
591 
592 	pte = lookup_address(address, &level);
593 	if (pte == NULL)
594 		return;		/* vaddr missing */
595 
596 	ptev = pte_mkwrite(*pte);
597 
598 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
599 		BUG();
600 }
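
/*
 * These two helpers are used by the early allocation hooks further
 * down to flip pagetable pages between RO and RW, for example:
 *
 *	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 *	...
 *	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 *
 * (see xen_alloc_pte_init() and xen_release_pte_init()).
 */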
601 
602 
603 static bool xen_page_pinned(void *ptr)
604 {
605 	struct page *page = virt_to_page(ptr);
606 
607 	return PagePinned(page);
608 }
609 
610 static bool xen_iomap_pte(pte_t pte)
611 {
612 	return pte_flags(pte) & _PAGE_IOMAP;
613 }
614 
615 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
616 {
617 	struct multicall_space mcs;
618 	struct mmu_update *u;
619 
620 	mcs = xen_mc_entry(sizeof(*u));
621 	u = mcs.args;
622 
623 	/* ptep might be kmapped when using 32-bit HIGHPTE */
624 	u->ptr = arbitrary_virt_to_machine(ptep).maddr;
625 	u->val = pte_val_ma(pteval);
626 
627 	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
628 
629 	xen_mc_issue(PARAVIRT_LAZY_MMU);
630 }
631 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
632 
633 static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
634 {
635 	xen_set_domain_pte(ptep, pteval, DOMID_IO);
636 }
637 
638 static void xen_extend_mmu_update(const struct mmu_update *update)
639 {
640 	struct multicall_space mcs;
641 	struct mmu_update *u;
642 
643 	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
644 
645 	if (mcs.mc != NULL) {
646 		ADD_STATS(mmu_update_extended, 1);
647 		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
648 
649 		mcs.mc->args[1]++;
650 
651 		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
652 			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
653 		else
654 			ADD_STATS(mmu_update_histo[0], 1);
655 	} else {
656 		ADD_STATS(mmu_update, 1);
657 		mcs = __xen_mc_entry(sizeof(*u));
658 		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
659 		ADD_STATS(mmu_update_histo[1], 1);
660 	}
661 
662 	u = mcs.args;
663 	*u = *update;
664 }
665 
666 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
667 {
668 	struct mmu_update u;
669 
670 	preempt_disable();
671 
672 	xen_mc_batch();
673 
674 	/* ptr may be ioremapped for 64-bit pagetable setup */
675 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
676 	u.val = pmd_val_ma(val);
677 	xen_extend_mmu_update(&u);
678 
679 	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
680 
681 	xen_mc_issue(PARAVIRT_LAZY_MMU);
682 
683 	preempt_enable();
684 }
685 
686 void xen_set_pmd(pmd_t *ptr, pmd_t val)
687 {
688 	ADD_STATS(pmd_update, 1);
689 
690 	/* If page is not pinned, we can just update the entry
691 	   directly */
692 	if (!xen_page_pinned(ptr)) {
693 		*ptr = val;
694 		return;
695 	}
696 
697 	ADD_STATS(pmd_update_pinned, 1);
698 
699 	xen_set_pmd_hyper(ptr, val);
700 }
701 
702 /*
703  * Associate a virtual page frame with a given physical page frame
704  * and protection flags for that frame.
705  */
706 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
707 {
708 	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
709 }
710 
711 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
712 		    pte_t *ptep, pte_t pteval)
713 {
714 	if (xen_iomap_pte(pteval)) {
715 		xen_set_iomap_pte(ptep, pteval);
716 		goto out;
717 	}
718 
719 	ADD_STATS(set_pte_at, 1);
720 //	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
721 	ADD_STATS(set_pte_at_current, mm == current->mm);
722 	ADD_STATS(set_pte_at_kernel, mm == &init_mm);
723 
724 	if (mm == current->mm || mm == &init_mm) {
725 		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
726 			struct multicall_space mcs;
727 			mcs = xen_mc_entry(0);
728 
729 			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
730 			ADD_STATS(set_pte_at_batched, 1);
731 			xen_mc_issue(PARAVIRT_LAZY_MMU);
732 			goto out;
733 		} else
734 			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
735 				goto out;
736 	}
737 	xen_set_pte(ptep, pteval);
738 
739 out:	return;
740 }
741 
742 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
743 				 unsigned long addr, pte_t *ptep)
744 {
745 	/* Just return the pte as-is.  We preserve the bits on commit */
746 	return *ptep;
747 }
748 
749 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
750 				 pte_t *ptep, pte_t pte)
751 {
752 	struct mmu_update u;
753 
754 	xen_mc_batch();
755 
756 	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
757 	u.val = pte_val_ma(pte);
758 	xen_extend_mmu_update(&u);
759 
760 	ADD_STATS(prot_commit, 1);
761 	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
762 
763 	xen_mc_issue(PARAVIRT_LAZY_MMU);
764 }
765 
766 /* Assume pteval_t is equivalent to all the other *val_t types. */
767 static pteval_t pte_mfn_to_pfn(pteval_t val)
768 {
769 	if (val & _PAGE_PRESENT) {
770 		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
771 		pteval_t flags = val & PTE_FLAGS_MASK;
772 		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
773 	}
774 
775 	return val;
776 }
777 
778 static pteval_t pte_pfn_to_mfn(pteval_t val)
779 {
780 	if (val & _PAGE_PRESENT) {
781 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
782 		pteval_t flags = val & PTE_FLAGS_MASK;
783 		unsigned long mfn = pfn_to_mfn(pfn);
784 
785 		/*
786 		 * If there's no mfn for the pfn, then just create an
787 		 * empty non-present pte.  Unfortunately this loses
788 		 * information about the original pfn, so
789 		 * pte_mfn_to_pfn is asymmetric.
790 		 */
791 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
792 			mfn = 0;
793 			flags = 0;
794 		}
795 
796 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
797 	}
798 
799 	return val;
800 }
801 
802 static pteval_t iomap_pte(pteval_t val)
803 {
804 	if (val & _PAGE_PRESENT) {
805 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
806 		pteval_t flags = val & PTE_FLAGS_MASK;
807 
808 		/* We assume the pte frame number is an MFN, so
809 		   just use it as-is. */
810 		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
811 	}
812 
813 	return val;
814 }
815 
816 pteval_t xen_pte_val(pte_t pte)
817 {
818 	pteval_t pteval = pte.pte;
819 
820 	/* If this is a WC pte, convert back from Xen WC to Linux WC */
821 	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
822 		WARN_ON(!pat_enabled);
823 		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
824 	}
825 
826 	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
827 		return pteval;
828 
829 	return pte_mfn_to_pfn(pteval);
830 }
831 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
832 
833 pgdval_t xen_pgd_val(pgd_t pgd)
834 {
835 	return pte_mfn_to_pfn(pgd.pgd);
836 }
837 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
838 
839 /*
840  * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
841  * are reserved for now, to correspond to the Intel-reserved PAT
842  * types.
843  *
844  * We expect Linux's PAT set as follows:
845  *
846  * Idx  PTE flags        Linux    Xen    Default
847  * 0                     WB       WB     WB
848  * 1            PWT      WC       WT     WT
849  * 2        PCD          UC-      UC-    UC-
850  * 3        PCD PWT      UC       UC     UC
851  * 4    PAT              WB       WC     WB
852  * 5    PAT     PWT      WC       WP     WT
853  * 6    PAT PCD          UC-      UC     UC-
854  * 7    PAT PCD PWT      UC       UC     UC
855  */
856 
857 void xen_set_pat(u64 pat)
858 {
859 	/* We expect Linux to use a PAT setting of
860 	 * UC UC- WC WB (ignoring the PAT flag) */
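	/*
	 * Byte-by-byte (entry 0 in the low byte, using the PAT type
	 * encodings 0=UC, 1=WC, 6=WB, 7=UC-), 0x0007010600070106
	 * decodes to WB WC UC- UC WB WC UC- UC, i.e. the Linux column
	 * of the table above with the PAT bit ignored.
	 */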
861 	WARN_ON(pat != 0x0007010600070106ull);
862 }
863 
864 pte_t xen_make_pte(pteval_t pte)
865 {
866 	phys_addr_t addr = (pte & PTE_PFN_MASK);
867 
868 	/* If Linux is trying to set a WC pte, then map to the Xen WC.
869 	 * If _PAGE_PAT is set, then it probably means it is really
870 	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
871 	 * things work out OK...
872 	 *
873 	 * (We should never see kernel mappings with _PAGE_PSE set,
874 	 * but we could see hugetlbfs mappings, I think.).
875 	 */
876 	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
877 		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
878 			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
879 	}
880 
881 	/*
882 	 * Unprivileged domains are allowed to do IOMAPpings for
883 	 * PCI passthrough, but not map ISA space.  The ISA
884 	 * mappings are just dummy local mappings to keep other
885 	 * parts of the kernel happy.
886 	 */
887 	if (unlikely(pte & _PAGE_IOMAP) &&
888 	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
889 		pte = iomap_pte(pte);
890 	} else {
891 		pte &= ~_PAGE_IOMAP;
892 		pte = pte_pfn_to_mfn(pte);
893 	}
894 
895 	return native_make_pte(pte);
896 }
897 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
898 
899 pgd_t xen_make_pgd(pgdval_t pgd)
900 {
901 	pgd = pte_pfn_to_mfn(pgd);
902 	return native_make_pgd(pgd);
903 }
904 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
905 
906 pmdval_t xen_pmd_val(pmd_t pmd)
907 {
908 	return pte_mfn_to_pfn(pmd.pmd);
909 }
910 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
911 
912 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
913 {
914 	struct mmu_update u;
915 
916 	preempt_disable();
917 
918 	xen_mc_batch();
919 
920 	/* ptr may be ioremapped for 64-bit pagetable setup */
921 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
922 	u.val = pud_val_ma(val);
923 	xen_extend_mmu_update(&u);
924 
925 	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
926 
927 	xen_mc_issue(PARAVIRT_LAZY_MMU);
928 
929 	preempt_enable();
930 }
931 
932 void xen_set_pud(pud_t *ptr, pud_t val)
933 {
934 	ADD_STATS(pud_update, 1);
935 
936 	/* If page is not pinned, we can just update the entry
937 	   directly */
938 	if (!xen_page_pinned(ptr)) {
939 		*ptr = val;
940 		return;
941 	}
942 
943 	ADD_STATS(pud_update_pinned, 1);
944 
945 	xen_set_pud_hyper(ptr, val);
946 }
947 
948 void xen_set_pte(pte_t *ptep, pte_t pte)
949 {
950 	if (xen_iomap_pte(pte)) {
951 		xen_set_iomap_pte(ptep, pte);
952 		return;
953 	}
954 
955 	ADD_STATS(pte_update, 1);
956 //	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
957 	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
958 
959 #ifdef CONFIG_X86_PAE
960 	ptep->pte_high = pte.pte_high;
961 	smp_wmb();
962 	ptep->pte_low = pte.pte_low;
963 #else
964 	*ptep = pte;
965 #endif
966 }
967 
968 #ifdef CONFIG_X86_PAE
969 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
970 {
971 	if (xen_iomap_pte(pte)) {
972 		xen_set_iomap_pte(ptep, pte);
973 		return;
974 	}
975 
976 	set_64bit((u64 *)ptep, native_pte_val(pte));
977 }
978 
979 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
980 {
981 	ptep->pte_low = 0;
982 	smp_wmb();		/* make sure low gets written first */
983 	ptep->pte_high = 0;
984 }
985 
986 void xen_pmd_clear(pmd_t *pmdp)
987 {
988 	set_pmd(pmdp, __pmd(0));
989 }
990 #endif	/* CONFIG_X86_PAE */
991 
992 pmd_t xen_make_pmd(pmdval_t pmd)
993 {
994 	pmd = pte_pfn_to_mfn(pmd);
995 	return native_make_pmd(pmd);
996 }
997 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
998 
999 #if PAGETABLE_LEVELS == 4
1000 pudval_t xen_pud_val(pud_t pud)
1001 {
1002 	return pte_mfn_to_pfn(pud.pud);
1003 }
1004 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
1005 
1006 pud_t xen_make_pud(pudval_t pud)
1007 {
1008 	pud = pte_pfn_to_mfn(pud);
1009 
1010 	return native_make_pud(pud);
1011 }
1012 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
1013 
1014 pgd_t *xen_get_user_pgd(pgd_t *pgd)
1015 {
1016 	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
1017 	unsigned offset = pgd - pgd_page;
1018 	pgd_t *user_ptr = NULL;
1019 
1020 	if (offset < pgd_index(USER_LIMIT)) {
1021 		struct page *page = virt_to_page(pgd_page);
1022 		user_ptr = (pgd_t *)page->private;
1023 		if (user_ptr)
1024 			user_ptr += offset;
1025 	}
1026 
1027 	return user_ptr;
1028 }
1029 
1030 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
1031 {
1032 	struct mmu_update u;
1033 
1034 	u.ptr = virt_to_machine(ptr).maddr;
1035 	u.val = pgd_val_ma(val);
1036 	xen_extend_mmu_update(&u);
1037 }
1038 
1039 /*
1040  * Raw hypercall-based set_pgd, intended for use in early boot before
1041  * there's a page structure.  This implies:
1042  *  1. The only existing pagetable is the kernel's
1043  *  2. It is always pinned
1044  *  3. It has no user pagetable attached to it
1045  */
1046 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
1047 {
1048 	preempt_disable();
1049 
1050 	xen_mc_batch();
1051 
1052 	__xen_set_pgd_hyper(ptr, val);
1053 
1054 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1055 
1056 	preempt_enable();
1057 }
1058 
1059 void xen_set_pgd(pgd_t *ptr, pgd_t val)
1060 {
1061 	pgd_t *user_ptr = xen_get_user_pgd(ptr);
1062 
1063 	ADD_STATS(pgd_update, 1);
1064 
1065 	/* If page is not pinned, we can just update the entry
1066 	   directly */
1067 	if (!xen_page_pinned(ptr)) {
1068 		*ptr = val;
1069 		if (user_ptr) {
1070 			WARN_ON(xen_page_pinned(user_ptr));
1071 			*user_ptr = val;
1072 		}
1073 		return;
1074 	}
1075 
1076 	ADD_STATS(pgd_update_pinned, 1);
1077 	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
1078 
1079 	/* If it's pinned, then we can at least batch the kernel and
1080 	   user updates together. */
1081 	xen_mc_batch();
1082 
1083 	__xen_set_pgd_hyper(ptr, val);
1084 	if (user_ptr)
1085 		__xen_set_pgd_hyper(user_ptr, val);
1086 
1087 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1088 }
1089 #endif	/* PAGETABLE_LEVELS == 4 */
1090 
1091 /*
1092  * (Yet another) pagetable walker.  This one is intended for pinning a
1093  * pagetable.  This means that it walks a pagetable and calls the
1094  * callback function on each page it finds making up the page table,
1095  * at every level.  It walks the entire pagetable, but it only bothers
1096  * pinning pte pages which are below the limit.  In the normal case this
1097  * will be STACK_TOP_MAX, but at boot we need to pin up to
1098  * FIXADDR_TOP.
1099  *
1100  * For 32-bit the important bit is that we don't pin beyond there,
1101  * because then we start getting into Xen's ptes.
1102  *
1103  * For 64-bit, we must skip the Xen hole in the middle of the address
1104  * space, just after the big x86-64 virtual hole.
1105  */
1106 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
1107 			  int (*func)(struct mm_struct *mm, struct page *,
1108 				      enum pt_level),
1109 			  unsigned long limit)
1110 {
1111 	int flush = 0;
1112 	unsigned hole_low, hole_high;
1113 	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
1114 	unsigned pgdidx, pudidx, pmdidx;
1115 
1116 	/* The limit is the last byte to be touched */
1117 	limit--;
1118 	BUG_ON(limit >= FIXADDR_TOP);
1119 
1120 	if (xen_feature(XENFEAT_auto_translated_physmap))
1121 		return 0;
1122 
1123 	/*
1124 	 * 64-bit has a great big hole in the middle of the address
1125 	 * space, which contains the Xen mappings.  On 32-bit these
1126  * will end up making a zero-sized hole, so this is a no-op.
1127 	 */
1128 	hole_low = pgd_index(USER_LIMIT);
1129 	hole_high = pgd_index(PAGE_OFFSET);
1130 
1131 	pgdidx_limit = pgd_index(limit);
1132 #if PTRS_PER_PUD > 1
1133 	pudidx_limit = pud_index(limit);
1134 #else
1135 	pudidx_limit = 0;
1136 #endif
1137 #if PTRS_PER_PMD > 1
1138 	pmdidx_limit = pmd_index(limit);
1139 #else
1140 	pmdidx_limit = 0;
1141 #endif
1142 
1143 	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
1144 		pud_t *pud;
1145 
1146 		if (pgdidx >= hole_low && pgdidx < hole_high)
1147 			continue;
1148 
1149 		if (!pgd_val(pgd[pgdidx]))
1150 			continue;
1151 
1152 		pud = pud_offset(&pgd[pgdidx], 0);
1153 
1154 		if (PTRS_PER_PUD > 1) /* not folded */
1155 			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
1156 
1157 		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
1158 			pmd_t *pmd;
1159 
1160 			if (pgdidx == pgdidx_limit &&
1161 			    pudidx > pudidx_limit)
1162 				goto out;
1163 
1164 			if (pud_none(pud[pudidx]))
1165 				continue;
1166 
1167 			pmd = pmd_offset(&pud[pudidx], 0);
1168 
1169 			if (PTRS_PER_PMD > 1) /* not folded */
1170 				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
1171 
1172 			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
1173 				struct page *pte;
1174 
1175 				if (pgdidx == pgdidx_limit &&
1176 				    pudidx == pudidx_limit &&
1177 				    pmdidx > pmdidx_limit)
1178 					goto out;
1179 
1180 				if (pmd_none(pmd[pmdidx]))
1181 					continue;
1182 
1183 				pte = pmd_page(pmd[pmdidx]);
1184 				flush |= (*func)(mm, pte, PT_PTE);
1185 			}
1186 		}
1187 	}
1188 
1189 out:
1190 	/* Do the top level last, so that the callbacks can use it as
1191 	   a cue to do final things like tlb flushes. */
1192 	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
1193 
1194 	return flush;
1195 }
1196 
1197 static int xen_pgd_walk(struct mm_struct *mm,
1198 			int (*func)(struct mm_struct *mm, struct page *,
1199 				    enum pt_level),
1200 			unsigned long limit)
1201 {
1202 	return __xen_pgd_walk(mm, mm->pgd, func, limit);
1203 }
1204 
1205 /* If we're using split pte locks, then take the page's lock and
1206    return a pointer to it.  Otherwise return NULL. */
1207 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
1208 {
1209 	spinlock_t *ptl = NULL;
1210 
1211 #if USE_SPLIT_PTLOCKS
1212 	ptl = __pte_lockptr(page);
1213 	spin_lock_nest_lock(ptl, &mm->page_table_lock);
1214 #endif
1215 
1216 	return ptl;
1217 }
1218 
1219 static void xen_pte_unlock(void *v)
1220 {
1221 	spinlock_t *ptl = v;
1222 	spin_unlock(ptl);
1223 }
1224 
1225 static void xen_do_pin(unsigned level, unsigned long pfn)
1226 {
1227 	struct mmuext_op *op;
1228 	struct multicall_space mcs;
1229 
1230 	mcs = __xen_mc_entry(sizeof(*op));
1231 	op = mcs.args;
1232 	op->cmd = level;
1233 	op->arg1.mfn = pfn_to_mfn(pfn);
1234 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1235 }
1236 
1237 static int xen_pin_page(struct mm_struct *mm, struct page *page,
1238 			enum pt_level level)
1239 {
1240 	unsigned pgfl = TestSetPagePinned(page);
1241 	int flush;
1242 
1243 	if (pgfl)
1244 		flush = 0;		/* already pinned */
1245 	else if (PageHighMem(page))
1246 		/* kmaps need flushing if we found an unpinned
1247 		   highpage */
1248 		flush = 1;
1249 	else {
1250 		void *pt = lowmem_page_address(page);
1251 		unsigned long pfn = page_to_pfn(page);
1252 		struct multicall_space mcs = __xen_mc_entry(0);
1253 		spinlock_t *ptl;
1254 
1255 		flush = 0;
1256 
1257 		/*
1258 		 * We need to hold the pagetable lock between the time
1259 		 * we make the pagetable RO and when we actually pin
1260 		 * it.  If we don't, then other users may come in and
1261 		 * attempt to update the pagetable by writing it,
1262 		 * which will fail because the memory is RO but not
1263 		 * pinned, so Xen won't do the trap'n'emulate.
1264 		 *
1265 		 * If we're using split pte locks, we can't hold the
1266 		 * entire pagetable's worth of locks during the
1267 		 * traverse, because we may wrap the preempt count (8
1268 		 * bits).  The solution is to mark RO and pin each PTE
1269 		 * page while holding the lock.  This means the number
1270 		 * of locks we end up holding is never more than a
1271 		 * batch size (~32 entries, at present).
1272 		 *
1273 		 * If we're not using split pte locks, we needn't pin
1274 		 * the PTE pages independently, because we're
1275 		 * protected by the overall pagetable lock.
1276 		 */
1277 		ptl = NULL;
1278 		if (level == PT_PTE)
1279 			ptl = xen_pte_lock(page, mm);
1280 
1281 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1282 					pfn_pte(pfn, PAGE_KERNEL_RO),
1283 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1284 
1285 		if (ptl) {
1286 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
1287 
1288 			/* Queue a deferred unlock for when this batch
1289 			   is completed. */
1290 			xen_mc_callback(xen_pte_unlock, ptl);
1291 		}
1292 	}
1293 
1294 	return flush;
1295 }
1296 
1297 /* This is called just after a mm has been created, but it has not
1298    been used yet.  We need to make sure that its pagetable is all
1299    read-only, and can be pinned. */
1300 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
1301 {
1302 	xen_mc_batch();
1303 
1304 	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
1305 		/* re-enable interrupts for flushing */
1306 		xen_mc_issue(0);
1307 
1308 		kmap_flush_unused();
1309 
1310 		xen_mc_batch();
1311 	}
1312 
1313 #ifdef CONFIG_X86_64
1314 	{
1315 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
1316 
1317 		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
1318 
1319 		if (user_pgd) {
1320 			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
1321 			xen_do_pin(MMUEXT_PIN_L4_TABLE,
1322 				   PFN_DOWN(__pa(user_pgd)));
1323 		}
1324 	}
1325 #else /* CONFIG_X86_32 */
1326 #ifdef CONFIG_X86_PAE
1327 	/* Need to make sure unshared kernel PMD is pinnable */
1328 	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1329 		     PT_PMD);
1330 #endif
1331 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
1332 #endif /* CONFIG_X86_64 */
1333 	xen_mc_issue(0);
1334 }
1335 
1336 static void xen_pgd_pin(struct mm_struct *mm)
1337 {
1338 	__xen_pgd_pin(mm, mm->pgd);
1339 }
1340 
1341 /*
1342  * On save, we need to pin all pagetables to make sure they get their
1343  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
1344  * them (unpinned pgds are not currently in use, probably because the
1345  * process is under construction or destruction).
1346  *
1347  * Expected to be called in stop_machine() ("equivalent to taking
1348  * every spinlock in the system"), so the locking doesn't really
1349  * matter all that much.
1350  */
1351 void xen_mm_pin_all(void)
1352 {
1353 	unsigned long flags;
1354 	struct page *page;
1355 
1356 	spin_lock_irqsave(&pgd_lock, flags);
1357 
1358 	list_for_each_entry(page, &pgd_list, lru) {
1359 		if (!PagePinned(page)) {
1360 			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
1361 			SetPageSavePinned(page);
1362 		}
1363 	}
1364 
1365 	spin_unlock_irqrestore(&pgd_lock, flags);
1366 }
1367 
1368 /*
1369  * The init_mm pagetable is really pinned as soon as it's created, but
1370  * that's before we have page structures to store the bits.  So do all
1371  * the book-keeping now.
1372  */
1373 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1374 				  enum pt_level level)
1375 {
1376 	SetPagePinned(page);
1377 	return 0;
1378 }
1379 
1380 static void __init xen_mark_init_mm_pinned(void)
1381 {
1382 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1383 }
1384 
1385 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1386 			  enum pt_level level)
1387 {
1388 	unsigned pgfl = TestClearPagePinned(page);
1389 
1390 	if (pgfl && !PageHighMem(page)) {
1391 		void *pt = lowmem_page_address(page);
1392 		unsigned long pfn = page_to_pfn(page);
1393 		spinlock_t *ptl = NULL;
1394 		struct multicall_space mcs;
1395 
1396 		/*
1397 		 * Do the converse to pin_page.  If we're using split
1398 		 * pte locks, we must be holding the lock while
1399 		 * the pte page is unpinned but still RO to prevent
1400 		 * concurrent updates from seeing it in this
1401 		 * partially-pinned state.
1402 		 */
1403 		if (level == PT_PTE) {
1404 			ptl = xen_pte_lock(page, mm);
1405 
1406 			if (ptl)
1407 				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1408 		}
1409 
1410 		mcs = __xen_mc_entry(0);
1411 
1412 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1413 					pfn_pte(pfn, PAGE_KERNEL),
1414 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1415 
1416 		if (ptl) {
1417 			/* unlock when batch completed */
1418 			xen_mc_callback(xen_pte_unlock, ptl);
1419 		}
1420 	}
1421 
1422 	return 0;		/* never need to flush on unpin */
1423 }
1424 
1425 /* Release a pagetable's pages back as normal RW */
1426 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1427 {
1428 	xen_mc_batch();
1429 
1430 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1431 
1432 #ifdef CONFIG_X86_64
1433 	{
1434 		pgd_t *user_pgd = xen_get_user_pgd(pgd);
1435 
1436 		if (user_pgd) {
1437 			xen_do_pin(MMUEXT_UNPIN_TABLE,
1438 				   PFN_DOWN(__pa(user_pgd)));
1439 			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1440 		}
1441 	}
1442 #endif
1443 
1444 #ifdef CONFIG_X86_PAE
1445 	/* Need to make sure unshared kernel PMD is unpinned */
1446 	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1447 		       PT_PMD);
1448 #endif
1449 
1450 	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1451 
1452 	xen_mc_issue(0);
1453 }
1454 
1455 static void xen_pgd_unpin(struct mm_struct *mm)
1456 {
1457 	__xen_pgd_unpin(mm, mm->pgd);
1458 }
1459 
1460 /*
1461  * On resume, undo any pinning done at save, so that the rest of the
1462  * kernel doesn't see any unexpected pinned pagetables.
1463  */
1464 void xen_mm_unpin_all(void)
1465 {
1466 	unsigned long flags;
1467 	struct page *page;
1468 
1469 	spin_lock_irqsave(&pgd_lock, flags);
1470 
1471 	list_for_each_entry(page, &pgd_list, lru) {
1472 		if (PageSavePinned(page)) {
1473 			BUG_ON(!PagePinned(page));
1474 			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1475 			ClearPageSavePinned(page);
1476 		}
1477 	}
1478 
1479 	spin_unlock_irqrestore(&pgd_lock, flags);
1480 }
1481 
1482 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1483 {
1484 	spin_lock(&next->page_table_lock);
1485 	xen_pgd_pin(next);
1486 	spin_unlock(&next->page_table_lock);
1487 }
1488 
1489 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1490 {
1491 	spin_lock(&mm->page_table_lock);
1492 	xen_pgd_pin(mm);
1493 	spin_unlock(&mm->page_table_lock);
1494 }
1495 
1496 
1497 #ifdef CONFIG_SMP
1498 /* Another cpu may still have its %cr3 pointing at the pagetable, so
1499    we need to repoint it somewhere else before we can unpin it. */
1500 static void drop_other_mm_ref(void *info)
1501 {
1502 	struct mm_struct *mm = info;
1503 	struct mm_struct *active_mm;
1504 
1505 	active_mm = percpu_read(cpu_tlbstate.active_mm);
1506 
1507 	if (active_mm == mm)
1508 		leave_mm(smp_processor_id());
1509 
1510 	/* If this cpu still has a stale cr3 reference, then make sure
1511 	   it has been flushed. */
1512 	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1513 		load_cr3(swapper_pg_dir);
1514 }
1515 
1516 static void xen_drop_mm_ref(struct mm_struct *mm)
1517 {
1518 	cpumask_var_t mask;
1519 	unsigned cpu;
1520 
1521 	if (current->active_mm == mm) {
1522 		if (current->mm == mm)
1523 			load_cr3(swapper_pg_dir);
1524 		else
1525 			leave_mm(smp_processor_id());
1526 	}
1527 
1528 	/* Get the "official" set of cpus referring to our pagetable. */
1529 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1530 		for_each_online_cpu(cpu) {
1531 			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1532 			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1533 				continue;
1534 			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1535 		}
1536 		return;
1537 	}
1538 	cpumask_copy(mask, mm_cpumask(mm));
1539 
1540 	/* It's possible that a vcpu may have a stale reference to our
1541 	   cr3, because it's in lazy mode and hasn't yet flushed
1542 	   its set of pending hypercalls.  In this case, we can
1543 	   look at its actual current cr3 value, and force it to flush
1544 	   if needed. */
1545 	for_each_online_cpu(cpu) {
1546 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1547 			cpumask_set_cpu(cpu, mask);
1548 	}
1549 
1550 	if (!cpumask_empty(mask))
1551 		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1552 	free_cpumask_var(mask);
1553 }
1554 #else
1555 static void xen_drop_mm_ref(struct mm_struct *mm)
1556 {
1557 	if (current->active_mm == mm)
1558 		load_cr3(swapper_pg_dir);
1559 }
1560 #endif
1561 
1562 /*
1563  * While a process runs, Xen pins its pagetables, which means that the
1564  * hypervisor forces it to be read-only, and it controls all updates
1565  * to it.  This means that all pagetable updates have to go via the
1566  * hypervisor, which is moderately expensive.
1567  *
1568  * Since we're pulling the pagetable down, we switch to init_mm,
1569  * unpin the old process's pagetable and mark it all read-write, which
1570  * allows further operations on it to be simple memory accesses.
1571  *
1572  * The only subtle point is that another CPU may be still using the
1573  * pagetable because of lazy tlb flushing.  This means we need to
1574  * switch all CPUs off this pagetable before we can unpin it.
1575  */
1576 void xen_exit_mmap(struct mm_struct *mm)
1577 {
1578 	get_cpu();		/* make sure we don't move around */
1579 	xen_drop_mm_ref(mm);
1580 	put_cpu();
1581 
1582 	spin_lock(&mm->page_table_lock);
1583 
1584 	/* pgd may not be pinned in the error exit path of execve */
1585 	if (xen_page_pinned(mm->pgd))
1586 		xen_pgd_unpin(mm);
1587 
1588 	spin_unlock(&mm->page_table_lock);
1589 }
1590 
1591 static __init void xen_pagetable_setup_start(pgd_t *base)
1592 {
1593 }
1594 
1595 static void xen_post_allocator_init(void);
1596 
1597 static __init void xen_pagetable_setup_done(pgd_t *base)
1598 {
1599 	xen_setup_shared_info();
1600 	xen_post_allocator_init();
1601 }
1602 
1603 static void xen_write_cr2(unsigned long cr2)
1604 {
1605 	percpu_read(xen_vcpu)->arch.cr2 = cr2;
1606 }
1607 
1608 static unsigned long xen_read_cr2(void)
1609 {
1610 	return percpu_read(xen_vcpu)->arch.cr2;
1611 }
1612 
1613 unsigned long xen_read_cr2_direct(void)
1614 {
1615 	return percpu_read(xen_vcpu_info.arch.cr2);
1616 }
1617 
1618 static void xen_flush_tlb(void)
1619 {
1620 	struct mmuext_op *op;
1621 	struct multicall_space mcs;
1622 
1623 	preempt_disable();
1624 
1625 	mcs = xen_mc_entry(sizeof(*op));
1626 
1627 	op = mcs.args;
1628 	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1629 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1630 
1631 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1632 
1633 	preempt_enable();
1634 }
1635 
1636 static void xen_flush_tlb_single(unsigned long addr)
1637 {
1638 	struct mmuext_op *op;
1639 	struct multicall_space mcs;
1640 
1641 	preempt_disable();
1642 
1643 	mcs = xen_mc_entry(sizeof(*op));
1644 	op = mcs.args;
1645 	op->cmd = MMUEXT_INVLPG_LOCAL;
1646 	op->arg1.linear_addr = addr & PAGE_MASK;
1647 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1648 
1649 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1650 
1651 	preempt_enable();
1652 }
1653 
1654 static void xen_flush_tlb_others(const struct cpumask *cpus,
1655 				 struct mm_struct *mm, unsigned long va)
1656 {
1657 	struct {
1658 		struct mmuext_op op;
1659 		DECLARE_BITMAP(mask, NR_CPUS);
1660 	} *args;
1661 	struct multicall_space mcs;
1662 
1663 	if (cpumask_empty(cpus))
1664 		return;		/* nothing to do */
1665 
1666 	mcs = xen_mc_entry(sizeof(*args));
1667 	args = mcs.args;
1668 	args->op.arg2.vcpumask = to_cpumask(args->mask);
1669 
1670 	/* Remove us, and any offline CPUs. */
1671 	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1672 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1673 
1674 	if (va == TLB_FLUSH_ALL) {
1675 		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1676 	} else {
1677 		args->op.cmd = MMUEXT_INVLPG_MULTI;
1678 		args->op.arg1.linear_addr = va;
1679 	}
1680 
1681 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1682 
1683 	xen_mc_issue(PARAVIRT_LAZY_MMU);
1684 }
1685 
1686 static unsigned long xen_read_cr3(void)
1687 {
1688 	return percpu_read(xen_cr3);
1689 }
1690 
1691 static void set_current_cr3(void *v)
1692 {
1693 	percpu_write(xen_current_cr3, (unsigned long)v);
1694 }
1695 
1696 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1697 {
1698 	struct mmuext_op *op;
1699 	struct multicall_space mcs;
1700 	unsigned long mfn;
1701 
1702 	if (cr3)
1703 		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1704 	else
1705 		mfn = 0;
1706 
1707 	WARN_ON(mfn == 0 && kernel);
1708 
1709 	mcs = __xen_mc_entry(sizeof(*op));
1710 
1711 	op = mcs.args;
1712 	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1713 	op->arg1.mfn = mfn;
1714 
1715 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1716 
1717 	if (kernel) {
1718 		percpu_write(xen_cr3, cr3);
1719 
1720 		/* Update xen_current_cr3 once the batch has actually
1721 		   been submitted. */
1722 		xen_mc_callback(set_current_cr3, (void *)cr3);
1723 	}
1724 }
1725 
1726 static void xen_write_cr3(unsigned long cr3)
1727 {
1728 	BUG_ON(preemptible());
1729 
1730 	xen_mc_batch();  /* disables interrupts */
1731 
1732 	/* Update while interrupts are disabled, so it's atomic with
1733 	   respect to ipis */
1734 	percpu_write(xen_cr3, cr3);
1735 
1736 	__xen_write_cr3(true, cr3);
1737 
1738 #ifdef CONFIG_X86_64
1739 	{
1740 		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1741 		if (user_pgd)
1742 			__xen_write_cr3(false, __pa(user_pgd));
1743 		else
1744 			__xen_write_cr3(false, 0);
1745 	}
1746 #endif
1747 
1748 	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1749 }
1750 
1751 static int xen_pgd_alloc(struct mm_struct *mm)
1752 {
1753 	pgd_t *pgd = mm->pgd;
1754 	int ret = 0;
1755 
1756 	BUG_ON(PagePinned(virt_to_page(pgd)));
1757 
1758 #ifdef CONFIG_X86_64
1759 	{
1760 		struct page *page = virt_to_page(pgd);
1761 		pgd_t *user_pgd;
1762 
1763 		BUG_ON(page->private != 0);
1764 
1765 		ret = -ENOMEM;
1766 
1767 		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1768 		page->private = (unsigned long)user_pgd;
1769 
1770 		if (user_pgd != NULL) {
1771 			user_pgd[pgd_index(VSYSCALL_START)] =
1772 				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1773 			ret = 0;
1774 		}
1775 
1776 		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1777 	}
1778 #endif
1779 
1780 	return ret;
1781 }
1782 
1783 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1784 {
1785 #ifdef CONFIG_X86_64
1786 	pgd_t *user_pgd = xen_get_user_pgd(pgd);
1787 
1788 	if (user_pgd)
1789 		free_page((unsigned long)user_pgd);
1790 #endif
1791 }
1792 
1793 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1794 {
1795 	unsigned long pfn = pte_pfn(pte);
1796 
1797 #ifdef CONFIG_X86_32
1798 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
1799 	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1800 		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1801 			       pte_val_ma(pte));
1802 #endif
1803 
1804 	/*
1805 	 * If the new pfn is within the range of the newly allocated
1806 	 * kernel pagetable, and it isn't being mapped into an
1807 	 * early_ioremap fixmap slot, make sure it is RO.
1808 	 */
1809 	if (!is_early_ioremap_ptep(ptep) &&
1810 	    pfn >= e820_table_start && pfn < e820_table_end)
1811 		pte = pte_wrprotect(pte);
1812 
1813 	return pte;
1814 }
1815 
1816 /* Init-time set_pte while constructing initial pagetables, which
1817    doesn't allow RO pagetable pages to be remapped RW */
1818 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1819 {
1820 	pte = mask_rw_pte(ptep, pte);
1821 
1822 	xen_set_pte(ptep, pte);
1823 }
1824 
1825 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1826 {
1827 	struct mmuext_op op;
1828 	op.cmd = cmd;
1829 	op.arg1.mfn = pfn_to_mfn(pfn);
1830 	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1831 		BUG();
1832 }
1833 
1834 /* Early in boot, while setting up the initial pagetable, assume
1835    everything is pinned. */
1836 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1837 {
1838 #ifdef CONFIG_FLATMEM
1839 	BUG_ON(mem_map);	/* should only be used early */
1840 #endif
1841 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1842 	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1843 }
1844 
1845 /* Used for pmd and pud */
1846 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1847 {
1848 #ifdef CONFIG_FLATMEM
1849 	BUG_ON(mem_map);	/* should only be used early */
1850 #endif
1851 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1852 }
1853 
1854 /* Early release_pte assumes that all pts are pinned, since there's
1855    only init_mm and anything attached to that is pinned. */
1856 static __init void xen_release_pte_init(unsigned long pfn)
1857 {
1858 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1859 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1860 }
1861 
1862 static __init void xen_release_pmd_init(unsigned long pfn)
1863 {
1864 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1865 }
1866 
1867 /* This needs to make sure the new pte page is pinned iff it's being
1868    attached to a pinned pagetable. */
1869 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1870 {
1871 	struct page *page = pfn_to_page(pfn);
1872 
1873 	if (PagePinned(virt_to_page(mm->pgd))) {
1874 		SetPagePinned(page);
1875 
1876 		if (!PageHighMem(page)) {
1877 			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1878 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1879 				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1880 		} else {
1881 			/* make sure there are no stray mappings of
1882 			   this page */
1883 			kmap_flush_unused();
1884 		}
1885 	}
1886 }
1887 
1888 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1889 {
1890 	xen_alloc_ptpage(mm, pfn, PT_PTE);
1891 }
1892 
1893 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1894 {
1895 	xen_alloc_ptpage(mm, pfn, PT_PMD);
1896 }
1897 
1898 /* This should never happen until we're OK to use struct page */
1899 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1900 {
1901 	struct page *page = pfn_to_page(pfn);
1902 
1903 	if (PagePinned(page)) {
1904 		if (!PageHighMem(page)) {
1905 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1906 				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1907 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1908 		}
1909 		ClearPagePinned(page);
1910 	}
1911 }
1912 
1913 static void xen_release_pte(unsigned long pfn)
1914 {
1915 	xen_release_ptpage(pfn, PT_PTE);
1916 }
1917 
1918 static void xen_release_pmd(unsigned long pfn)
1919 {
1920 	xen_release_ptpage(pfn, PT_PMD);
1921 }
1922 
1923 #if PAGETABLE_LEVELS == 4
1924 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1925 {
1926 	xen_alloc_ptpage(mm, pfn, PT_PUD);
1927 }
1928 
1929 static void xen_release_pud(unsigned long pfn)
1930 {
1931 	xen_release_ptpage(pfn, PT_PUD);
1932 }
1933 #endif
1934 
1935 void __init xen_reserve_top(void)
1936 {
1937 #ifdef CONFIG_X86_32
1938 	unsigned long top = HYPERVISOR_VIRT_START;
1939 	struct xen_platform_parameters pp;
1940 
1941 	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1942 		top = pp.virt_start;
1943 
1944 	reserve_top_address(-top);
1945 #endif	/* CONFIG_X86_32 */
1946 }
1947 
1948 /*
1949  * Like __va(), but returns the address in the kernel mapping (which is
1950  * all we have until the physical memory mapping has been set up).
1951  */
1952 static void *__ka(phys_addr_t paddr)
1953 {
1954 #ifdef CONFIG_X86_64
1955 	return (void *)(paddr + __START_KERNEL_map);
1956 #else
1957 	return __va(paddr);
1958 #endif
1959 }
1960 
1961 /* Convert a machine address to physical address */
1962 static unsigned long m2p(phys_addr_t maddr)
1963 {
1964 	phys_addr_t paddr;
1965 
1966 	maddr &= PTE_PFN_MASK;
1967 	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1968 
1969 	return paddr;
1970 }
1971 
1972 /* Convert a machine address to kernel virtual */
1973 static void *m2v(phys_addr_t maddr)
1974 {
1975 	return __ka(m2p(maddr));
1976 }
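
/*
 * Worked example (editorial, hypothetical value): for a machine address of
 * 0x12345067, PTE_PFN_MASK strips the low flag bits, mfn 0x12345 is looked
 * up via mfn_to_pfn(), and the resulting pfn is shifted back up by
 * PAGE_SHIFT to give the pseudo-physical address; m2v() then hands that
 * to __ka() to get a kernel-mapping virtual address.
 */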
1977 
1978 /* Set the page permissions on an identity-mapped page */
1979 static void set_page_prot(void *addr, pgprot_t prot)
1980 {
1981 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1982 	pte_t pte = pfn_pte(pfn, prot);
1983 
1984 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1985 		BUG();
1986 }
1987 
1988 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1989 {
1990 	unsigned pmdidx, pteidx;
1991 	unsigned ident_pte;
1992 	unsigned long pfn;
1993 
1994 	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1995 				      PAGE_SIZE);
1996 
1997 	ident_pte = 0;
1998 	pfn = 0;
1999 	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
2000 		pte_t *pte_page;
2001 
2002 		/* Reuse or allocate a page of ptes */
2003 		if (pmd_present(pmd[pmdidx]))
2004 			pte_page = m2v(pmd[pmdidx].pmd);
2005 		else {
2006 			/* Check for free pte pages */
2007 			if (ident_pte == LEVEL1_IDENT_ENTRIES)
2008 				break;
2009 
2010 			pte_page = &level1_ident_pgt[ident_pte];
2011 			ident_pte += PTRS_PER_PTE;
2012 
2013 			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
2014 		}
2015 
2016 		/* Install mappings */
2017 		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
2018 			pte_t pte;
2019 
2020 			if (pfn > max_pfn_mapped)
2021 				max_pfn_mapped = pfn;
2022 
2023 			if (!pte_none(pte_page[pteidx]))
2024 				continue;
2025 
2026 			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
2027 			pte_page[pteidx] = pte;
2028 		}
2029 	}
2030 
2031 	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
2032 		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
2033 
2034 	set_page_prot(pmd, PAGE_KERNEL_RO);
2035 }
2036 
2037 void __init xen_setup_machphys_mapping(void)
2038 {
2039 	struct xen_machphys_mapping mapping;
2040 	unsigned long machine_to_phys_nr_ents;
2041 
2042 	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
2043 		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
2044 		machine_to_phys_nr_ents = mapping.max_mfn + 1;
2045 	} else {
2046 		machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
2047 	}
2048 	machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
2049 }
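
/*
 * Editorial sketch (not part of the original file): machine_to_phys_order
 * is the log2 of the rounded-up number of M2P entries, so 1 << order is a
 * power-of-two upper bound on the table size.  The hypothetical helper
 * below shows how the mapping set up above is consumed; real code should
 * go through mfn_to_pfn() instead.
 */
#if 0
static unsigned long example_m2p_lookup(unsigned long mfn)
{
	if (mfn >= (1UL << machine_to_phys_order))
		return ~0UL;	/* clearly beyond the table */
	return machine_to_phys_mapping[mfn];
}
#endif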
2050 
2051 #ifdef CONFIG_X86_64
2052 static void convert_pfn_mfn(void *v)
2053 {
2054 	pte_t *pte = v;
2055 	int i;
2056 
2057 	/* All levels are converted the same way, so just treat them
2058 	   as ptes. */
2059 	for (i = 0; i < PTRS_PER_PTE; i++)
2060 		pte[i] = xen_make_pte(pte[i].pte);
2061 }
2062 
2063 /*
2064  * Set up the initial kernel pagetable.
2065  *
2066  * We can construct this by grafting the Xen-provided pagetable into
2067  * head_64.S's preconstructed pagetables.  We copy the Xen L2s into
2068  * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
2069  * means that only the kernel has a physical mapping to start with -
2070  * but that's enough to get __va working.  We need to fill in the rest
2071  * of the physical mapping once some sort of allocator has been set
2072  * up.
2073  */
2074 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
2075 					 unsigned long max_pfn)
2076 {
2077 	pud_t *l3;
2078 	pmd_t *l2;
2079 
2080 	/* Zap identity mapping */
2081 	init_level4_pgt[0] = __pgd(0);
2082 
2083 	/* Pre-constructed entries are in pfn, so convert to mfn */
2084 	convert_pfn_mfn(init_level4_pgt);
2085 	convert_pfn_mfn(level3_ident_pgt);
2086 	convert_pfn_mfn(level3_kernel_pgt);
2087 
2088 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
2089 	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
2090 
2091 	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
2092 	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
2093 
2094 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
2095 	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
2096 	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
2097 
2098 	/* Set up identity map */
2099 	xen_map_identity_early(level2_ident_pgt, max_pfn);
2100 
2101 	/* Make pagetable pieces RO */
2102 	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
2103 	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
2104 	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
2105 	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
2106 	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
2107 	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
2108 
2109 	/* Pin down new L4 */
2110 	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
2111 			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
2112 
2113 	/* Unpin Xen-provided one */
2114 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2115 
2116 	/* Switch over */
2117 	pgd = init_level4_pgt;
2118 
2119 	/*
2120 	 * At this stage there can be no user pgd, and no page
2121 	 * structure to attach it to, so make sure we just set the
2122 	 * kernel pgd.
2123 	 */
2124 	xen_mc_batch();
2125 	__xen_write_cr3(true, __pa(pgd));
2126 	xen_mc_issue(PARAVIRT_LAZY_CPU);
2127 
2128 	memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
2129 		      __pa(xen_start_info->pt_base +
2130 			   xen_start_info->nr_pt_frames * PAGE_SIZE),
2131 		      "XEN PAGETABLES");
2132 
2133 	return pgd;
2134 }
2135 #else	/* !CONFIG_X86_64 */
2136 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2137 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2138 
2139 static __init void xen_write_cr3_init(unsigned long cr3)
2140 {
2141 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2142 
2143 	BUG_ON(read_cr3() != __pa(initial_page_table));
2144 	BUG_ON(cr3 != __pa(swapper_pg_dir));
2145 
2146 	/*
2147 	 * We are switching to swapper_pg_dir for the first time (from
2148 	 * initial_page_table) and therefore need to mark that page
2149 	 * read-only and then pin it.
2150 	 *
2151 	 * Xen disallows sharing of kernel PMDs for PAE
2152 	 * guests. Therefore we must copy the kernel PMD from
2153 	 * initial_page_table into a new kernel PMD to be used in
2154 	 * swapper_pg_dir.
2155 	 */
2156 	swapper_kernel_pmd =
2157 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2158 	memcpy(swapper_kernel_pmd, initial_kernel_pmd,
2159 	       sizeof(pmd_t) * PTRS_PER_PMD);
2160 	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2161 		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2162 	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2163 
2164 	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2165 	xen_write_cr3(cr3);
2166 	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2167 
2168 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2169 			  PFN_DOWN(__pa(initial_page_table)));
2170 	set_page_prot(initial_page_table, PAGE_KERNEL);
2171 	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2172 
2173 	pv_mmu_ops.write_cr3 = &xen_write_cr3;
2174 }
2175 
2176 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
2177 					 unsigned long max_pfn)
2178 {
2179 	pmd_t *kernel_pmd;
2180 
2181 	initial_kernel_pmd =
2182 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2183 
2184 	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
2185 				  xen_start_info->nr_pt_frames * PAGE_SIZE +
2186 				  512*1024);
2187 
2188 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2189 	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
2190 
2191 	xen_map_identity_early(initial_kernel_pmd, max_pfn);
2192 
2193 	memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
2194 	initial_page_table[KERNEL_PGD_BOUNDARY] =
2195 		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2196 
2197 	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2198 	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2199 	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2200 
2201 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2202 
2203 	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2204 			  PFN_DOWN(__pa(initial_page_table)));
2205 	xen_write_cr3(__pa(initial_page_table));
2206 
2207 	memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
2208 		      __pa(xen_start_info->pt_base +
2209 			   xen_start_info->nr_pt_frames * PAGE_SIZE),
2210 		      "XEN PAGETABLES");
2211 
2212 	return initial_page_table;
2213 }
2214 #endif	/* CONFIG_X86_64 */
2215 
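/*
 * Backing page for fixmap slots that must not touch real hardware under
 * Xen: xen_set_fixmap() below points the dummy local APIC and the IO-APIC
 * ranges at it, and xen_init_mmu_ops() fills it with 0xff so stray reads
 * see all-ones rather than hitting a machine device.
 */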
2216 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2217 
2218 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2219 {
2220 	pte_t pte;
2221 
2222 	phys >>= PAGE_SHIFT;
2223 
2224 	switch (idx) {
2225 	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2226 #ifdef CONFIG_X86_F00F_BUG
2227 	case FIX_F00F_IDT:
2228 #endif
2229 #ifdef CONFIG_X86_32
2230 	case FIX_WP_TEST:
2231 	case FIX_VDSO:
2232 # ifdef CONFIG_HIGHMEM
2233 	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2234 # endif
2235 #else
2236 	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
2237 #endif
2238 	case FIX_TEXT_POKE0:
2239 	case FIX_TEXT_POKE1:
2240 		/* All local page mappings */
2241 		pte = pfn_pte(phys, prot);
2242 		break;
2243 
2244 #ifdef CONFIG_X86_LOCAL_APIC
2245 	case FIX_APIC_BASE:	/* maps dummy local APIC */
2246 		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2247 		break;
2248 #endif
2249 
2250 #ifdef CONFIG_X86_IO_APIC
2251 	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2252 		/*
2253 		 * We just don't map the IO APIC - all access is via
2254 		 * hypercalls, so point this fixmap slot at the harmless
2255 		 * dummy page instead of the real device.
2256 		 */
2256 		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2257 		break;
2258 #endif
2259 
2260 	case FIX_PARAVIRT_BOOTMAP:
2261 		/* This is an MFN, but it isn't an IO mapping from the
2262 		   IO domain */
2263 		pte = mfn_pte(phys, prot);
2264 		break;
2265 
2266 	default:
2267 		/* By default, set_fixmap is used for hardware mappings */
2268 		pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
2269 		break;
2270 	}
2271 
2272 	__native_set_fixmap(idx, pte);
2273 
2274 #ifdef CONFIG_X86_64
2275 	/* Replicate changes to map the vsyscall page into the user
2276 	   pagetable vsyscall mapping. */
2277 	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
2278 		unsigned long vaddr = __fix_to_virt(idx);
2279 		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2280 	}
2281 #endif
2282 }
2283 
2284 __init void xen_ident_map_ISA(void)
2285 {
2286 	unsigned long pa;
2287 
2288 	/*
2289 	 * If we're dom0, then linearly map the ISA machine addresses into
2290 	 * the kernel's address space.
2291 	 */
2292 	if (!xen_initial_domain())
2293 		return;
2294 
2295 	xen_raw_printk("Xen: setup ISA identity maps\n");
2296 
2297 	for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
2298 		pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
2299 
2300 		if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
2301 			BUG();
2302 	}
2303 
2304 	xen_flush_tlb();
2305 }
2306 
2307 static __init void xen_post_allocator_init(void)
2308 {
2309 	pv_mmu_ops.set_pte = xen_set_pte;
2310 	pv_mmu_ops.set_pmd = xen_set_pmd;
2311 	pv_mmu_ops.set_pud = xen_set_pud;
2312 #if PAGETABLE_LEVELS == 4
2313 	pv_mmu_ops.set_pgd = xen_set_pgd;
2314 #endif
2315 
2316 	/* This will work as long as patching hasn't happened yet
2317 	   (which it hasn't) */
2318 	pv_mmu_ops.alloc_pte = xen_alloc_pte;
2319 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2320 	pv_mmu_ops.release_pte = xen_release_pte;
2321 	pv_mmu_ops.release_pmd = xen_release_pmd;
2322 #if PAGETABLE_LEVELS == 4
2323 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
2324 	pv_mmu_ops.release_pud = xen_release_pud;
2325 #endif
2326 
2327 #ifdef CONFIG_X86_64
2328 	SetPagePinned(virt_to_page(level3_user_vsyscall));
2329 #endif
2330 	xen_mark_init_mm_pinned();
2331 }
2332 
2333 static void xen_leave_lazy_mmu(void)
2334 {
2335 	preempt_disable();
2336 	xen_mc_flush();
2337 	paravirt_leave_lazy_mmu();
2338 	preempt_enable();
2339 }
2340 
2341 static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2342 	.read_cr2 = xen_read_cr2,
2343 	.write_cr2 = xen_write_cr2,
2344 
2345 	.read_cr3 = xen_read_cr3,
2346 #ifdef CONFIG_X86_32
2347 	.write_cr3 = xen_write_cr3_init,
2348 #else
2349 	.write_cr3 = xen_write_cr3,
2350 #endif
2351 
2352 	.flush_tlb_user = xen_flush_tlb,
2353 	.flush_tlb_kernel = xen_flush_tlb,
2354 	.flush_tlb_single = xen_flush_tlb_single,
2355 	.flush_tlb_others = xen_flush_tlb_others,
2356 
2357 	.pte_update = paravirt_nop,
2358 	.pte_update_defer = paravirt_nop,
2359 
2360 	.pgd_alloc = xen_pgd_alloc,
2361 	.pgd_free = xen_pgd_free,
2362 
2363 	.alloc_pte = xen_alloc_pte_init,
2364 	.release_pte = xen_release_pte_init,
2365 	.alloc_pmd = xen_alloc_pmd_init,
2366 	.release_pmd = xen_release_pmd_init,
2367 
2368 	.set_pte = xen_set_pte_init,
2369 	.set_pte_at = xen_set_pte_at,
2370 	.set_pmd = xen_set_pmd_hyper,
2371 
2372 	.ptep_modify_prot_start = __ptep_modify_prot_start,
2373 	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
2374 
2375 	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
2376 	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2377 
2378 	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
2379 	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2380 
2381 #ifdef CONFIG_X86_PAE
2382 	.set_pte_atomic = xen_set_pte_atomic,
2383 	.pte_clear = xen_pte_clear,
2384 	.pmd_clear = xen_pmd_clear,
2385 #endif	/* CONFIG_X86_PAE */
2386 	.set_pud = xen_set_pud_hyper,
2387 
2388 	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2389 	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2390 
2391 #if PAGETABLE_LEVELS == 4
2392 	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
2393 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
2394 	.set_pgd = xen_set_pgd_hyper,
2395 
2396 	.alloc_pud = xen_alloc_pmd_init,
2397 	.release_pud = xen_release_pmd_init,
2398 #endif	/* PAGETABLE_LEVELS == 4 */
2399 
2400 	.activate_mm = xen_activate_mm,
2401 	.dup_mmap = xen_dup_mmap,
2402 	.exit_mmap = xen_exit_mmap,
2403 
2404 	.lazy_mode = {
2405 		.enter = paravirt_enter_lazy_mmu,
2406 		.leave = xen_leave_lazy_mmu,
2407 	},
2408 
2409 	.set_fixmap = xen_set_fixmap,
2410 };
2411 
2412 void __init xen_init_mmu_ops(void)
2413 {
2414 	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2415 	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2416 	pv_mmu_ops = xen_mmu_ops;
2417 
2418 	memset(dummy_mapping, 0xff, PAGE_SIZE);
2419 }
2420 
2421 /* Protected by xen_reservation_lock. */
2422 #define MAX_CONTIG_ORDER 9 /* 2MB */
2423 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2424 
2425 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2426 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2427 				unsigned long *in_frames,
2428 				unsigned long *out_frames)
2429 {
2430 	int i;
2431 	struct multicall_space mcs;
2432 
2433 	xen_mc_batch();
2434 	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2435 		mcs = __xen_mc_entry(0);
2436 
2437 		if (in_frames)
2438 			in_frames[i] = virt_to_mfn(vaddr);
2439 
2440 		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2441 		set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2442 
2443 		if (out_frames)
2444 			out_frames[i] = virt_to_pfn(vaddr);
2445 	}
2446 	xen_mc_issue(0);
2447 }
2448 
2449 /*
2450  * Update the pfn-to-mfn mappings for a virtual address range, either to
2451  * point to an array of mfns, or contiguously from a single starting
2452  * mfn.
2453  */
2454 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2455 				     unsigned long *mfns,
2456 				     unsigned long first_mfn)
2457 {
2458 	unsigned i, limit;
2459 	unsigned long mfn;
2460 
2461 	xen_mc_batch();
2462 
2463 	limit = 1u << order;
2464 	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2465 		struct multicall_space mcs;
2466 		unsigned flags;
2467 
2468 		mcs = __xen_mc_entry(0);
2469 		if (mfns)
2470 			mfn = mfns[i];
2471 		else
2472 			mfn = first_mfn + i;
2473 
		/*
		 * Defer the TLB flush to the final update: a single page
		 * needs only an INVLPG, a larger region a full flush, and
		 * in both cases every vcpu is flushed (UVMF_ALL).
		 */
2474 		if (i < (limit - 1))
2475 			flags = 0;
2476 		else {
2477 			if (order == 0)
2478 				flags = UVMF_INVLPG | UVMF_ALL;
2479 			else
2480 				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2481 		}
2482 
2483 		MULTI_update_va_mapping(mcs.mc, vaddr,
2484 				mfn_pte(mfn, PAGE_KERNEL), flags);
2485 
2486 		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2487 	}
2488 
2489 	xen_mc_issue(0);
2490 }
2491 
2492 /*
2493  * Perform the hypercall to exchange a region of our pfns to point to
2494  * memory with the required contiguous alignment.  Takes the pfns as
2495  * input, and populates mfns as output.
2496  *
2497  * Returns a success code indicating whether the hypervisor was able to
2498  * satisfy the request or not.
2499  */
2500 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2501 			       unsigned long *pfns_in,
2502 			       unsigned long extents_out,
2503 			       unsigned int order_out,
2504 			       unsigned long *mfns_out,
2505 			       unsigned int address_bits)
2506 {
2507 	long rc;
2508 	int success;
2509 
2510 	struct xen_memory_exchange exchange = {
2511 		.in = {
2512 			.nr_extents   = extents_in,
2513 			.extent_order = order_in,
2514 			.extent_start = pfns_in,
2515 			.domid        = DOMID_SELF
2516 		},
2517 		.out = {
2518 			.nr_extents   = extents_out,
2519 			.extent_order = order_out,
2520 			.extent_start = mfns_out,
2521 			.address_bits = address_bits,
2522 			.domid        = DOMID_SELF
2523 		}
2524 	};
2525 
2526 	BUG_ON(extents_in << order_in != extents_out << order_out);
2527 
2528 	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2529 	success = (exchange.nr_exchanged == extents_in);
2530 
2531 	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2532 	BUG_ON(success && (rc != 0));
2533 
2534 	return success;
2535 }
2536 
2537 int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2538 				 unsigned int address_bits)
2539 {
2540 	unsigned long *in_frames = discontig_frames, out_frame;
2541 	unsigned long  flags;
2542 	int            success;
2543 
2544 	/*
2545 	 * Currently an auto-translated guest will not perform I/O, nor will
2546 	 * it require PAE page directories below 4GB. Therefore any calls to
2547 	 * this function are redundant and can be ignored.
2548 	 */
2549 
2550 	if (xen_feature(XENFEAT_auto_translated_physmap))
2551 		return 0;
2552 
2553 	if (unlikely(order > MAX_CONTIG_ORDER))
2554 		return -ENOMEM;
2555 
2556 	memset((void *) vstart, 0, PAGE_SIZE << order);
2557 
2558 	spin_lock_irqsave(&xen_reservation_lock, flags);
2559 
2560 	/* 1. Zap current PTEs, remembering MFNs. */
2561 	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2562 
2563 	/* 2. Get a new contiguous memory extent. */
2564 	out_frame = virt_to_pfn(vstart);
2565 	success = xen_exchange_memory(1UL << order, 0, in_frames,
2566 				      1, order, &out_frame,
2567 				      address_bits);
2568 
2569 	/* 3. Map the new extent in place of old pages. */
2570 	if (success)
2571 		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2572 	else
2573 		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2574 
2575 	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2576 
2577 	return success ? 0 : -ENOMEM;
2578 }
2579 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2580 
2581 void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2582 {
2583 	unsigned long *out_frames = discontig_frames, in_frame;
2584 	unsigned long  flags;
2585 	int success;
2586 
2587 	if (xen_feature(XENFEAT_auto_translated_physmap))
2588 		return;
2589 
2590 	if (unlikely(order > MAX_CONTIG_ORDER))
2591 		return;
2592 
2593 	memset((void *) vstart, 0, PAGE_SIZE << order);
2594 
2595 	spin_lock_irqsave(&xen_reservation_lock, flags);
2596 
2597 	/* 1. Find start MFN of contiguous extent. */
2598 	in_frame = virt_to_mfn(vstart);
2599 
2600 	/* 2. Zap current PTEs. */
2601 	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2602 
2603 	/* 3. Do the exchange for non-contiguous MFNs. */
2604 	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2605 					0, out_frames, 0);
2606 
2607 	/* 4. Map new pages in place of old pages. */
2608 	if (success)
2609 		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2610 	else
2611 		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2612 
2613 	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2614 }
2615 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
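
/*
 * Editorial sketch (not part of the original file): a typical caller pairs
 * the two exports above to obtain, and later release, a machine-contiguous
 * buffer suitable for DMA below a given address width.  The helper names
 * and flow here are hypothetical; the Xen swiotlb code is the real
 * in-tree user of this interface.
 */
#if 0
static void *example_alloc_machine_contig(unsigned int order,
					  unsigned int address_bits)
{
	unsigned long vstart = __get_free_pages(GFP_KERNEL, order);

	if (!vstart)
		return NULL;

	/* Exchange the backing frames for one contiguous machine extent. */
	if (xen_create_contiguous_region(vstart, order, address_bits)) {
		free_pages(vstart, order);
		return NULL;
	}
	return (void *)vstart;
}

static void example_free_machine_contig(void *vaddr, unsigned int order)
{
	/* Give the contiguous extent back before freeing the pages. */
	xen_destroy_contiguous_region((unsigned long)vaddr, order);
	free_pages((unsigned long)vaddr, order);
}
#endif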
2616 
2617 #ifdef CONFIG_XEN_PVHVM
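/*
 * HVMOP_pagetable_dying lets the guest tell the hypervisor that a
 * pagetable is about to be torn down, so the shadow pagetable code can
 * drop the corresponding shadows eagerly instead of unshadowing them one
 * update at a time.
 */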
2618 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2619 {
2620 	struct xen_hvm_pagetable_dying a;
2621 	int rc;
2622 
2623 	a.domid = DOMID_SELF;
2624 	a.gpa = __pa(mm->pgd);
2625 	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2626 	WARN_ON_ONCE(rc < 0);
2627 }
2628 
2629 static int is_pagetable_dying_supported(void)
2630 {
2631 	struct xen_hvm_pagetable_dying a;
2632 	int rc = 0;
2633 
2634 	a.domid = DOMID_SELF;
2635 	a.gpa = 0x00;
2636 	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2637 	if (rc < 0) {
2638 		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2639 		return 0;
2640 	}
2641 	return 1;
2642 }
2643 
2644 void __init xen_hvm_init_mmu_ops(void)
2645 {
2646 	if (is_pagetable_dying_supported())
2647 		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2648 }
2649 #endif
2650 
2651 #define REMAP_BATCH_SIZE 16
2652 
2653 struct remap_data {
2654 	unsigned long mfn;
2655 	pgprot_t prot;
2656 	struct mmu_update *mmu_update;
2657 };
2658 
2659 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2660 				 unsigned long addr, void *data)
2661 {
2662 	struct remap_data *rmd = data;
2663 	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2664 
2665 	rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
2666 	rmd->mmu_update->val = pte_val_ma(pte);
2667 	rmd->mmu_update++;
2668 
2669 	return 0;
2670 }
2671 
2672 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2673 			       unsigned long addr,
2674 			       unsigned long mfn, int nr,
2675 			       pgprot_t prot, unsigned domid)
2676 {
2677 	struct remap_data rmd;
2678 	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2679 	int batch;
2680 	unsigned long range;
2681 	int err = 0;
2682 
2683 	prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2684 
2685 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2686 				(VM_PFNMAP | VM_RESERVED | VM_IO)));
2687 
2688 	rmd.mfn = mfn;
2689 	rmd.prot = prot;
2690 
2691 	while (nr) {
2692 		batch = min(REMAP_BATCH_SIZE, nr);
2693 		range = (unsigned long)batch << PAGE_SHIFT;
2694 
2695 		rmd.mmu_update = mmu_update;
2696 		err = apply_to_page_range(vma->vm_mm, addr, range,
2697 					  remap_area_mfn_pte_fn, &rmd);
2698 		if (err)
2699 			goto out;
2700 
2701 		err = -EFAULT;
2702 		if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2703 			goto out;
2704 
2705 		nr -= batch;
2706 		addr += range;
2707 	}
2708 
2709 	err = 0;
2710 out:
2711 
2712 	flush_tlb_all();
2713 
2714 	return err;
2715 }
2716 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
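
/*
 * Editorial sketch (not part of the original file): how a character-device
 * mmap() handler might use the export above to map a foreign domain's
 * frames into userspace.  The way the mfn and domid are obtained and the
 * helper name are hypothetical; privcmd is the real in-tree user.
 */
#if 0
static int example_mmap_foreign(struct vm_area_struct *vma,
				unsigned long first_mfn, domid_t domid)
{
	int nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	/* The BUG_ON() in xen_remap_domain_mfn_range() expects these. */
	vma->vm_flags |= VM_PFNMAP | VM_RESERVED | VM_IO;

	return xen_remap_domain_mfn_range(vma, vma->vm_start,
					  first_mfn, nr,
					  vma->vm_page_prot, domid);
}
#endif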
2717 
2718 #ifdef CONFIG_XEN_DEBUG_FS
2719 
2720 static struct dentry *d_mmu_debug;
2721 
2722 static int __init xen_mmu_debugfs(void)
2723 {
2724 	struct dentry *d_xen = xen_init_debugfs();
2725 
2726 	if (d_xen == NULL)
2727 		return -ENOMEM;
2728 
2729 	d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2730 
2731 	debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2732 
2733 	debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2734 	debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2735 			   &mmu_stats.pgd_update_pinned);
2736 	debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2737 			   &mmu_stats.pgd_update_batched);
2738 
2739 	debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2740 	debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2741 			   &mmu_stats.pud_update_pinned);
2742 	debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2743 			   &mmu_stats.pud_update_batched);
2744 
2745 	debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2746 	debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2747 			   &mmu_stats.pmd_update_pinned);
2748 	debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2749 			   &mmu_stats.pmd_update_batched);
2750 
2751 	debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2752 //	debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2753 //			   &mmu_stats.pte_update_pinned);
2754 	debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2755 			   &mmu_stats.pte_update_batched);
2756 
2757 	debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2758 	debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2759 			   &mmu_stats.mmu_update_extended);
2760 	xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2761 				     mmu_stats.mmu_update_histo, 20);
2762 
2763 	debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2764 	debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2765 			   &mmu_stats.set_pte_at_batched);
2766 	debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2767 			   &mmu_stats.set_pte_at_current);
2768 	debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2769 			   &mmu_stats.set_pte_at_kernel);
2770 
2771 	debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2772 	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2773 			   &mmu_stats.prot_commit_batched);
2774 
2775 	return 0;
2776 }
2777 fs_initcall(xen_mmu_debugfs);
2778 
2779 #endif	/* CONFIG_XEN_DEBUG_FS */
2780