xref: /openbmc/linux/arch/x86/xen/mmu.c (revision 22246614)
1 /*
2  * Xen mmu operations
3  *
4  * This file contains the various mmu fetch and update operations.
5  * The most important job they must perform is the mapping between the
6  * domain's pfns and the overall machine mfns.
7  *
8  * Xen allows guests to directly update the pagetable, in a controlled
9  * fashion.  In other words, the guest modifies the same pagetable
10  * that the CPU actually uses, which eliminates the overhead of having
11  * a separate shadow pagetable.
12  *
13  * In order to allow this, it falls on the guest domain to map its
14  * notion of a "physical" pfn - which is just a domain-local frame
15  * number - into a real "machine address" which the CPU's MMU can
16  * use.
17  *
18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19  * inserted directly into the pagetable.  When creating a new
20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
22  * the mfn back into a pfn.
23  *
24  * The other constraint is that all pages which make up a pagetable
25  * must be mapped read-only in the guest.  This prevents uncontrolled
26  * guest updates to the pagetable.  Xen strictly enforces this, and
27  * will disallow any pagetable update which will end up mapping a
28  * pagetable page RW, and will disallow using any writable page as a
29  * pagetable.
30  *
31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
32  * would need to validate the whole pagetable before going on.
33  * Naturally, this is quite slow.  The solution is to "pin" a
34  * pagetable, which enforces all the constraints on the pagetable even
35  * when it is not actively in use.  This means that Xen can be assured
36  * that it is still valid when you do load it into %cr3, and doesn't
37  * need to revalidate it.
38  *
39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40  */
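/*
 * Illustration (a sketch; 'pfn' here stands for any valid guest frame
 * number): the accessor functions below perform the pfn<->mfn
 * conversion on pagetable entries.  Roughly:
 *
 *	pteval_t v  = (pfn << PAGE_SHIFT) | pgprot_val(PAGE_KERNEL);
 *	pte_t pte   = xen_make_pte(v);	<- now mfn-based, ready for Xen
 *	pteval_t v2 = xen_pte_val(pte);	<- back to a pfn-based value
 */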
41 #include <linux/sched.h>
42 #include <linux/highmem.h>
43 #include <linux/bug.h>
44 
45 #include <asm/pgtable.h>
46 #include <asm/tlbflush.h>
47 #include <asm/mmu_context.h>
48 #include <asm/paravirt.h>
49 
50 #include <asm/xen/hypercall.h>
51 #include <asm/xen/hypervisor.h>
52 
53 #include <xen/page.h>
54 #include <xen/interface/xen.h>
55 
56 #include "multicalls.h"
57 #include "mmu.h"
58 
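/*
 * Translate a kernel virtual address into a machine (bus) address by
 * looking up its pte and combining the mfn found there with the offset
 * within the page.
 */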
59 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
60 {
61 	unsigned int level;
62 	pte_t *pte = lookup_address(address, &level);
63 	unsigned offset = address & ~PAGE_MASK;	/* offset within the page */
64 
65 	BUG_ON(pte == NULL);
66 
67 	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
68 }
69 
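/*
 * Change the protection of a single lowmem kernel page by installing a
 * modified pte with an update_va_mapping hypercall.
 */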
70 void make_lowmem_page_readonly(void *vaddr)
71 {
72 	pte_t *pte, ptev;
73 	unsigned long address = (unsigned long)vaddr;
74 	unsigned int level;
75 
76 	pte = lookup_address(address, &level);
77 	BUG_ON(pte == NULL);
78 
79 	ptev = pte_wrprotect(*pte);
80 
81 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
82 		BUG();
83 }
84 
85 void make_lowmem_page_readwrite(void *vaddr)
86 {
87 	pte_t *pte, ptev;
88 	unsigned long address = (unsigned long)vaddr;
89 	unsigned int level;
90 
91 	pte = lookup_address(address, &level);
92 	BUG_ON(pte == NULL);
93 
94 	ptev = pte_mkwrite(*pte);
95 
96 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
97 		BUG();
98 }
99 
100 
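/*
 * Set a pmd entry using a (possibly batched) mmu_update hypercall.  The
 * multicall is flushed immediately unless we are inside a lazy MMU
 * section, in which case it stays queued with the rest of the batch.
 */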
101 void xen_set_pmd(pmd_t *ptr, pmd_t val)
102 {
103 	struct multicall_space mcs;
104 	struct mmu_update *u;
105 
106 	preempt_disable();
107 
108 	mcs = xen_mc_entry(sizeof(*u));
109 	u = mcs.args;
110 	u->ptr = virt_to_machine(ptr).maddr;
111 	u->val = pmd_val_ma(val);
112 	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
113 
114 	xen_mc_issue(PARAVIRT_LAZY_MMU);
115 
116 	preempt_enable();
117 }
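
/*
 * Sketch (hypothetical caller, for illustration only): within a lazy
 * MMU section several of the updates above are queued and then issued
 * as a single hypercall:
 *
 *	arch_enter_lazy_mmu_mode();
 *	xen_set_pmd(pmd0, val0);	<- queued
 *	xen_set_pmd(pmd1, val1);	<- queued
 *	arch_leave_lazy_mmu_mode();	<- batch flushed here
 */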
118 
119 /*
120  * Associate a virtual page frame with a given physical page frame
121  * and protection flags for that frame.
122  */
123 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
124 {
125 	pgd_t *pgd;
126 	pud_t *pud;
127 	pmd_t *pmd;
128 	pte_t *pte;
129 
130 	pgd = swapper_pg_dir + pgd_index(vaddr);
131 	if (pgd_none(*pgd)) {
132 		BUG();
133 		return;
134 	}
135 	pud = pud_offset(pgd, vaddr);
136 	if (pud_none(*pud)) {
137 		BUG();
138 		return;
139 	}
140 	pmd = pmd_offset(pud, vaddr);
141 	if (pmd_none(*pmd)) {
142 		BUG();
143 		return;
144 	}
145 	pte = pte_offset_kernel(pmd, vaddr);
146 	/* <mfn,flags> stored as-is, to permit clearing entries */
147 	xen_set_pte(pte, mfn_pte(mfn, flags));
148 
149 	/*
150 	 * It's enough to flush this one mapping.
151 	 * (PGE mappings get flushed as well)
152 	 */
153 	__flush_tlb_one(vaddr);
154 }
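
/*
 * Example (names are hypothetical, for illustration only):
 *
 *	set_pte_mfn(vaddr, mfn, PAGE_KERNEL);
 *
 * maps the machine frame 'mfn' at kernel virtual address 'vaddr' in
 * swapper_pg_dir; the mfn is installed as-is, as noted above.
 */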
155 
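/*
 * Set a pte in a (potentially) live pagetable.  For the current mm and
 * init_mm we can use update_va_mapping - batched if we are in lazy MMU
 * mode, as a direct hypercall otherwise.  For a foreign mm (or if the
 * hypercall fails) fall back to writing the pte entry itself.
 */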
156 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
157 		    pte_t *ptep, pte_t pteval)
158 {
159 	/* updates to init_mm may be done without lock */
160 	if (mm == &init_mm)
161 		preempt_disable();
162 
163 	if (mm == current->mm || mm == &init_mm) {
164 		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
165 			struct multicall_space mcs;
166 			mcs = xen_mc_entry(0);
167 
168 			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
169 			xen_mc_issue(PARAVIRT_LAZY_MMU);
170 			goto out;
171 		} else
172 			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
173 				goto out;
174 	}
175 	xen_set_pte(ptep, pteval);
176 
177 out:
178 	if (mm == &init_mm)
179 		preempt_enable();
180 }
181 
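/*
 * pte/pgd/pmd value readers: convert the mfn in a present entry back
 * into a pfn-based physical address; non-present entries pass through
 * unchanged.
 */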
182 pteval_t xen_pte_val(pte_t pte)
183 {
184 	pteval_t ret = pte.pte;
185 
186 	if (ret & _PAGE_PRESENT)
187 		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
188 
189 	return ret;
190 }
191 
192 pgdval_t xen_pgd_val(pgd_t pgd)
193 {
194 	pgdval_t ret = pgd.pgd;
195 	if (ret & _PAGE_PRESENT)
196 		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
197 	return ret;
198 }
199 
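/*
 * pte/pgd/pmd constructors: convert a pfn-based value into an mfn-based
 * one before it can be installed.  For ptes the PCD/PWT cache-control
 * bits are also cleared.
 */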
200 pte_t xen_make_pte(pteval_t pte)
201 {
202 	if (pte & _PAGE_PRESENT) {
203 		pte = phys_to_machine(XPADDR(pte)).maddr;
204 		pte &= ~(_PAGE_PCD | _PAGE_PWT);
205 	}
206 
207 	return (pte_t){ .pte = pte };
208 }
209 
210 pgd_t xen_make_pgd(pgdval_t pgd)
211 {
212 	if (pgd & _PAGE_PRESENT)
213 		pgd = phys_to_machine(XPADDR(pgd)).maddr;
214 
215 	return (pgd_t){ pgd };
216 }
217 
218 pmdval_t xen_pmd_val(pmd_t pmd)
219 {
220 	pmdval_t ret = native_pmd_val(pmd);
221 	if (ret & _PAGE_PRESENT)
222 		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
223 	return ret;
224 }
225 #ifdef CONFIG_X86_PAE
226 void xen_set_pud(pud_t *ptr, pud_t val)
227 {
228 	struct multicall_space mcs;
229 	struct mmu_update *u;
230 
231 	preempt_disable();
232 
233 	mcs = xen_mc_entry(sizeof(*u));
234 	u = mcs.args;
235 	u->ptr = virt_to_machine(ptr).maddr;
236 	u->val = pud_val_ma(val);
237 	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
238 
239 	xen_mc_issue(PARAVIRT_LAZY_MMU);
240 
241 	preempt_enable();
242 }
243 
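/*
 * On PAE a pte is two 32-bit words.  Write the high word first so that
 * another CPU never sees the new low word (with its present bit) paired
 * with the stale high word.
 */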
244 void xen_set_pte(pte_t *ptep, pte_t pte)
245 {
246 	ptep->pte_high = pte.pte_high;
247 	smp_wmb();
248 	ptep->pte_low = pte.pte_low;
249 }
250 
251 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
252 {
253 	set_64bit((u64 *)ptep, pte_val_ma(pte));
254 }
255 
256 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
257 {
258 	ptep->pte_low = 0;
259 	smp_wmb();		/* make sure low gets written first */
260 	ptep->pte_high = 0;
261 }
262 
263 void xen_pmd_clear(pmd_t *pmdp)
264 {
265 	xen_set_pmd(pmdp, __pmd(0));
266 }
267 
268 pmd_t xen_make_pmd(pmdval_t pmd)
269 {
270 	if (pmd & _PAGE_PRESENT)
271 		pmd = phys_to_machine(XPADDR(pmd)).maddr;
272 
273 	return native_make_pmd(pmd);
274 }
275 #else  /* !PAE */
276 void xen_set_pte(pte_t *ptep, pte_t pte)
277 {
278 	*ptep = pte;
279 }
280 #endif	/* CONFIG_X86_PAE */
281 
282 /*
283   (Yet another) pagetable walker.  This one is intended for pinning a
284   pagetable.  This means that it walks a pagetable and calls the
285   callback function on each page it finds making up the page table,
286   at every level.  It walks the entire pagetable, but it only bothers
287   pinning pte pages which are below the limit.  In the normal case
288   this will be TASK_SIZE, but at boot we need to pin up to
289   FIXADDR_TOP.  But the important bit is that we don't pin beyond
290   there, because then we start getting into Xen's ptes.
291 */
292 static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
293 		    unsigned long limit)
294 {
295 	pgd_t *pgd = pgd_base;
296 	int flush = 0;
297 	unsigned long addr = 0;
298 	unsigned long pgd_next;
299 
300 	BUG_ON(limit > FIXADDR_TOP);
301 
302 	if (xen_feature(XENFEAT_auto_translated_physmap))
303 		return 0;
304 
305 	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
306 		pud_t *pud;
307 		unsigned long pud_limit, pud_next;
308 
309 		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
310 
311 		if (!pgd_val(*pgd))
312 			continue;
313 
314 		pud = pud_offset(pgd, 0);
315 
316 		if (PTRS_PER_PUD > 1) /* not folded */
317 			flush |= (*func)(virt_to_page(pud), PT_PUD);
318 
319 		for (; addr != pud_limit; pud++, addr = pud_next) {
320 			pmd_t *pmd;
321 			unsigned long pmd_limit;
322 
323 			pud_next = pud_addr_end(addr, pud_limit);
324 
325 			if (pud_next < limit)
326 				pmd_limit = pud_next;
327 			else
328 				pmd_limit = limit;
329 
330 			if (pud_none(*pud))
331 				continue;
332 
333 			pmd = pmd_offset(pud, 0);
334 
335 			if (PTRS_PER_PMD > 1) /* not folded */
336 				flush |= (*func)(virt_to_page(pmd), PT_PMD);
337 
338 			for (; addr != pmd_limit; pmd++) {
339 				addr += (PAGE_SIZE * PTRS_PER_PTE);
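				/*
				 * The -1 bias makes 0 act as "top of the
				 * address space", so the comparison is
				 * still right if addr has wrapped past 4GB.
				 */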
340 				if ((pmd_limit-1) < (addr-1)) {
341 					addr = pmd_limit;
342 					break;
343 				}
344 
345 				if (pmd_none(*pmd))
346 					continue;
347 
348 				flush |= (*func)(pmd_page(*pmd), PT_PTE);
349 			}
350 		}
351 	}
352 
353 	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
354 
355 	return flush;
356 }
357 
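/*
 * Take the split pte lock covering a pagetable page, if split pte locks
 * are configured.  The caller releases it via do_unlock() once the
 * current multicall batch has completed.
 */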
358 static spinlock_t *lock_pte(struct page *page)
359 {
360 	spinlock_t *ptl = NULL;
361 
362 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
363 	ptl = __pte_lockptr(page);
364 	spin_lock(ptl);
365 #endif
366 
367 	return ptl;
368 }
369 
370 static void do_unlock(void *v)
371 {
372 	spinlock_t *ptl = v;
373 	spin_unlock(ptl);
374 }
375 
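/*
 * Queue a pin/unpin mmuext operation for the pagetable page at 'pfn' on
 * the current multicall batch.
 */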
376 static void xen_do_pin(unsigned level, unsigned long pfn)
377 {
378 	struct mmuext_op *op;
379 	struct multicall_space mcs;
380 
381 	mcs = __xen_mc_entry(sizeof(*op));
382 	op = mcs.args;
383 	op->cmd = level;
384 	op->arg1.mfn = pfn_to_mfn(pfn);
385 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
386 }
387 
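/*
 * Called for every page of a pagetable being pinned: mark it pinned and
 * queue an update remapping it read-only.  Highmem pages can't be
 * remapped here, so we just report that kmaps need flushing.  pte pages
 * additionally get an explicit L1 pin.
 */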
388 static int pin_page(struct page *page, enum pt_level level)
389 {
390 	unsigned pgfl = TestSetPagePinned(page);
391 	int flush;
392 
393 	if (pgfl)
394 		flush = 0;		/* already pinned */
395 	else if (PageHighMem(page))
396 		/* kmaps need flushing if we found an unpinned
397 		   highpage */
398 		flush = 1;
399 	else {
400 		void *pt = lowmem_page_address(page);
401 		unsigned long pfn = page_to_pfn(page);
402 		struct multicall_space mcs = __xen_mc_entry(0);
403 		spinlock_t *ptl;
404 
405 		flush = 0;
406 
407 		ptl = NULL;
408 		if (level == PT_PTE)
409 			ptl = lock_pte(page);
410 
411 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
412 					pfn_pte(pfn, PAGE_KERNEL_RO),
413 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
414 
415 		if (level == PT_PTE)
416 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
417 
418 		if (ptl) {
419 			/* Queue a deferred unlock for when this batch
420 			   is completed. */
421 			xen_mc_callback(do_unlock, ptl);
422 		}
423 	}
424 
425 	return flush;
426 }
427 
428 /* This is called just after a mm has been created, but it has not
429    been used yet.  We need to make sure that its pagetable is all
430    read-only, and can be pinned. */
431 void xen_pgd_pin(pgd_t *pgd)
432 {
433 	unsigned level;
434 
435 	xen_mc_batch();
436 
437 	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
438 		/* re-enable interrupts for kmap_flush_unused */
439 		xen_mc_issue(0);
440 		kmap_flush_unused();
441 		xen_mc_batch();
442 	}
443 
444 #ifdef CONFIG_X86_PAE
445 	level = MMUEXT_PIN_L3_TABLE;
446 #else
447 	level = MMUEXT_PIN_L2_TABLE;
448 #endif
449 
450 	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
451 
452 	xen_mc_issue(0);
453 }
454 
455 /* The init_mm pagetable is really pinned as soon as it's created, but
456    that's before we have page structures to store the bits.  So do all
457    the book-keeping now. */
458 static __init int mark_pinned(struct page *page, enum pt_level level)
459 {
460 	SetPagePinned(page);
461 	return 0;
462 }
463 
464 void __init xen_mark_init_mm_pinned(void)
465 {
466 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
467 }
468 
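/*
 * The reverse of pin_page(): clear the pinned flag and queue an update
 * remapping the page read-write again; pte pages also get their L1 pin
 * dropped.  Highmem pages never needed remapping in the first place.
 */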
469 static int unpin_page(struct page *page, enum pt_level level)
470 {
471 	unsigned pgfl = TestClearPagePinned(page);
472 
473 	if (pgfl && !PageHighMem(page)) {
474 		void *pt = lowmem_page_address(page);
475 		unsigned long pfn = page_to_pfn(page);
476 		spinlock_t *ptl = NULL;
477 		struct multicall_space mcs;
478 
479 		if (level == PT_PTE) {
480 			ptl = lock_pte(page);
481 
482 			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
483 		}
484 
485 		mcs = __xen_mc_entry(0);
486 
487 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
488 					pfn_pte(pfn, PAGE_KERNEL),
489 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
490 
491 		if (ptl) {
492 			/* unlock when batch completed */
493 			xen_mc_callback(do_unlock, ptl);
494 		}
495 	}
496 
497 	return 0;		/* never need to flush on unpin */
498 }
499 
500 /* Release a pagetable's pages back as normal RW */
501 static void xen_pgd_unpin(pgd_t *pgd)
502 {
503 	xen_mc_batch();
504 
505 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
506 
507 	pgd_walk(pgd, unpin_page, TASK_SIZE);
508 
509 	xen_mc_issue(0);
510 }
511 
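/*
 * A pagetable is pinned when its mm is first activated (and, for fork,
 * when the new mm is duplicated); from then on every update to it must
 * go through the hypervisor.
 */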
512 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
513 {
514 	spin_lock(&next->page_table_lock);
515 	xen_pgd_pin(next->pgd);
516 	spin_unlock(&next->page_table_lock);
517 }
518 
519 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
520 {
521 	spin_lock(&mm->page_table_lock);
522 	xen_pgd_pin(mm->pgd);
523 	spin_unlock(&mm->page_table_lock);
524 }
525 
526 
527 #ifdef CONFIG_SMP
528 /* Another cpu may still have its %cr3 pointing at the pagetable, so
529    we need to repoint it somewhere else before we can unpin it. */
530 static void drop_other_mm_ref(void *info)
531 {
532 	struct mm_struct *mm = info;
533 
534 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
535 		leave_mm(smp_processor_id());
536 
537 	/* If this cpu still has a stale cr3 reference, then make sure
538 	   it has been flushed. */
539 	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
540 		load_cr3(swapper_pg_dir);
541 		arch_flush_lazy_cpu_mode();
542 	}
543 }
544 
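/*
 * Make sure no CPU - including this one - still has mm's pagetable
 * loaded in %cr3, either directly or via a not-yet-flushed lazy
 * hypercall batch, before we try to unpin it.
 */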
545 static void drop_mm_ref(struct mm_struct *mm)
546 {
547 	cpumask_t mask;
548 	unsigned cpu;
549 
550 	if (current->active_mm == mm) {
551 		if (current->mm == mm)
552 			load_cr3(swapper_pg_dir);
553 		else
554 			leave_mm(smp_processor_id());
555 		arch_flush_lazy_cpu_mode();
556 	}
557 
558 	/* Get the "official" set of cpus referring to our pagetable. */
559 	mask = mm->cpu_vm_mask;
560 
561 	/* It's possible that a vcpu may have a stale reference to our
562 	   cr3, because it's in lazy mode and hasn't yet flushed its
563 	   set of pending hypercalls.  In this case, we can
564 	   look at its actual current cr3 value, and force it to flush
565 	   if needed. */
566 	for_each_online_cpu(cpu) {
567 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
568 			cpu_set(cpu, mask);
569 	}
570 
571 	if (!cpus_empty(mask))
572 		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
573 }
574 #else
575 static void drop_mm_ref(struct mm_struct *mm)
576 {
577 	if (current->active_mm == mm)
578 		load_cr3(swapper_pg_dir);
579 }
580 #endif
581 
582 /*
583  * While a process runs, Xen pins its pagetables, which means that the
584  * hypervisor forces it to be read-only, and it controls all updates
585  * to it.  This means that all pagetable updates have to go via the
586  * hypervisor, which is moderately expensive.
587  *
588  * Since we're pulling the pagetable down, we switch to use init_mm,
589  * unpin the old process's pagetable and mark it all read-write, which
590  * allows further operations on it to be simple memory accesses.
591  *
592  * The only subtle point is that another CPU may still be using the
593  * pagetable because of lazy tlb flushing.  This means we need to
594  * switch all CPUs off this pagetable before we can unpin it.
595  */
596 void xen_exit_mmap(struct mm_struct *mm)
597 {
598 	get_cpu();		/* make sure we don't move around */
599 	drop_mm_ref(mm);
600 	put_cpu();
601 
602 	spin_lock(&mm->page_table_lock);
603 
604 	/* pgd may not be pinned in the error exit path of execve */
605 	if (PagePinned(virt_to_page(mm->pgd)))
606 		xen_pgd_unpin(mm->pgd);
607 
608 	spin_unlock(&mm->page_table_lock);
609 }
610