/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, the passed pfn is converted into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, the mfn is
 * converted back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which would end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
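/*
 * Concretely, the conversions described above show up in this file as,
 * for example:
 *
 *	mfn   = pfn_to_mfn(pfn);			   pfn   -> mfn
 *	maddr = phys_to_machine(XPADDR(paddr)).maddr;	   paddr -> maddr
 *	paddr = machine_to_phys(XMADDR(maddr)).paddr;	   maddr -> paddr
 *
 * with pte_mfn()/mfn_pte() doing the same translation at the pte level.
 */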
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
	int level;
	pte_t *pte = lookup_address(address, &level);
	unsigned offset = address & ~PAGE_MASK;	/* offset within the page */

	BUG_ON(pte == NULL);

	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

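/*
 * Set a pmd entry.  The write is queued as an mmu_update op on the
 * current multicall batch; xen_mc_issue() flushes it immediately unless
 * we are in lazy MMU mode, in which case it can be batched with later
 * updates and issued in a single hypercall.
 */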
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pmd_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <mfn,flags> stored as-is, to permit clearing entries */
	xen_set_pte(pte, mfn_pte(mfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

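/*
 * Set a pte in a (possibly live) pagetable.  For the current mm and for
 * init_mm we can ask Xen to update the virtual mapping directly via
 * update_va_mapping (batched when in lazy MMU mode); otherwise fall
 * back to writing the pte entry itself.
 */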
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	if (mm == current->mm || mm == &init_mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs;
			mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			return;
		} else
			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
				return;
	}
	xen_set_pte(ptep, pteval);
}

#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pud_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
}

void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, pte_val_ma(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
	xen_set_pmd(pmdp, __pmd(0));
}

unsigned long long xen_pte_val(pte_t pte)
{
	unsigned long long ret = 0;

	if (pte.pte_low) {
		ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	}

	return ret;
}

unsigned long long xen_pmd_val(pmd_t pmd)
{
	unsigned long long ret = pmd.pmd;
	if (ret)
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	return ret;
}

unsigned long long xen_pgd_val(pgd_t pgd)
{
	unsigned long long ret = pgd.pgd;
	if (ret)
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	return ret;
}

pte_t xen_make_pte(unsigned long long pte)
{
	if (pte & _PAGE_PRESENT) {
		pte = phys_to_machine(XPADDR(pte)).maddr;
		pte &= ~(_PAGE_PCD | _PAGE_PWT);
	}

	return (pte_t){ .pte = pte };
}

pmd_t xen_make_pmd(unsigned long long pmd)
{
	if (pmd & 1)
		pmd = phys_to_machine(XPADDR(pmd)).maddr;

	return (pmd_t){ pmd };
}

pgd_t xen_make_pgd(unsigned long long pgd)
{
	if (pgd & _PAGE_PRESENT)
		pgd = phys_to_machine(XPADDR(pgd)).maddr;

	return (pgd_t){ pgd };
}
#else  /* !PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	*ptep = pte;
}

unsigned long xen_pte_val(pte_t pte)
{
	unsigned long ret = pte.pte_low;

	if (ret & _PAGE_PRESENT)
		ret = machine_to_phys(XMADDR(ret)).paddr;

	return ret;
}

unsigned long xen_pgd_val(pgd_t pgd)
{
	unsigned long ret = pgd.pgd;
	if (ret)
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	return ret;
}

pte_t xen_make_pte(unsigned long pte)
{
	if (pte & _PAGE_PRESENT) {
		pte = phys_to_machine(XPADDR(pte)).maddr;
		pte &= ~(_PAGE_PCD | _PAGE_PWT);
	}

	return (pte_t){ pte };
}

pgd_t xen_make_pgd(unsigned long pgd)
{
	if (pgd & _PAGE_PRESENT)
		pgd = phys_to_machine(XPADDR(pgd)).maddr;

	return (pgd_t){ pgd };
}
#endif	/* CONFIG_X86_PAE */

enum pt_level {
	PT_PGD,
	PT_PUD,
	PT_PMD,
	PT_PTE
};

/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  This means that it walks a pagetable and calls the
  callback function on each page it finds making up the page table,
  at every level.  It walks the entire pagetable, but it only bothers
  pinning pte pages which are below limit.  In the normal case this
  will be TASK_SIZE, but at boot we need to pin up to FIXADDR_TOP.
  But the important bit is that we don't pin beyond there, because
  then we start getting into Xen's ptes.
*/
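/*
 * The callers below use this as, for instance,
 * pgd_walk(pgd, pin_page, TASK_SIZE) when pinning a process pagetable,
 * and pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP) at boot.
 */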
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
		    unsigned long limit)
{
	pgd_t *pgd = pgd_base;
	int flush = 0;
	unsigned long addr = 0;
	unsigned long pgd_next;

	BUG_ON(limit > FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
		unsigned long pud_limit, pud_next;

		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(virt_to_page(pud), PT_PUD);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(virt_to_page(pmd), PT_PMD);

			for (; addr != pmd_limit; pmd++) {
				addr += (PAGE_SIZE * PTRS_PER_PTE);
				/* the -1s cope with addr wrapping to 0 at
				   the top of the address space */
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

				if (pmd_none(*pmd))
					continue;

				flush |= (*func)(pmd_page(*pmd), PT_PTE);
			}
		}
	}

	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

	return flush;
}

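/*
 * With split pte locks configured, take the pte lock for this pte page
 * so it can't change under us while it is being pinned or unpinned; the
 * lock is returned so it can be dropped once the batch completes.
 * Returns NULL when pte locks are not split.
 */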
static spinlock_t *lock_pte(struct page *page)
{
	spinlock_t *ptl = NULL;

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	ptl = __pte_lockptr(page);
	spin_lock(ptl);
#endif

	return ptl;
}

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

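/*
 * Queue an mmuext pin/unpin operation for the pagetable page at pfn on
 * the current multicall batch; level is one of the MMUEXT_PIN_*_TABLE
 * or MMUEXT_UNPIN_TABLE commands.
 */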
static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

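/*
 * pgd_walk() callback for pinning: remap the pagetable page read-only
 * (batched via update_va_mapping) and, for pte pages, queue the L1 pin.
 * Returns nonzero if the caller needs to flush unused kmaps, which
 * happens when we find an unpinned highmem page we can't remap here.
 */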
static int pin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		ptl = NULL;
		if (level == PT_PTE)
			ptl = lock_pte(page);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (level == PT_PTE)
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

		if (ptl) {
			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
	unsigned level;

	xen_mc_batch();

	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

#ifdef CONFIG_X86_PAE
	level = MMUEXT_PIN_L3_TABLE;
#else
	level = MMUEXT_PIN_L2_TABLE;
#endif

	xen_do_pin(level, PFN_DOWN(__pa(pgd)));

	xen_mc_issue(0);
}

/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now. */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

static int unpin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		if (level == PT_PTE) {
			ptl = lock_pte(page);

			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}

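/*
 * Pin the pagetable of a newly activated or duplicated mm, holding its
 * page_table_lock so the pagetable can't change underneath us while it
 * is being made read-only and pinned.
 */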
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;

	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
		load_cr3(swapper_pg_dir);
		arch_flush_lazy_cpu_mode();
	}
}

static void drop_mm_ref(struct mm_struct *mm)
{
	cpumask_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
		arch_flush_lazy_cpu_mode();
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	mask = mm->cpu_vm_mask;

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its set
	   of pending hypercalls.  In this case, we can look at its
	   actual current cr3 value, and force it to flush if
	   needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpu_set(cpu, mask);
	}

	if (!cpus_empty(mask))
		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces them to be read-only, and it controls all updates
 * to them.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch over to init_mm,
 * unpin the old process's pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (PagePinned(virt_to_page(mm->pgd)))
		xen_pgd_unpin(mm->pgd);

	spin_unlock(&mm->page_table_lock);
}
648