xref: /openbmc/linux/arch/x86/xen/mmu.c (revision 96de0e252cedffad61b3cb5e05662c591898e69a)
1 /*
2  * Xen mmu operations
3  *
4  * This file contains the various mmu fetch and update operations.
5  * The most important job they must perform is the mapping between the
6  * domain's pfn and the overall machine mfns.
7  *
8  * Xen allows guests to directly update the pagetable, in a controlled
9  * fashion.  In other words, the guest modifies the same pagetable
10  * that the CPU actually uses, which eliminates the overhead of having
11  * a separate shadow pagetable.
12  *
13  * In order to allow this, it falls on the guest domain to map its
14  * notion of a "physical" pfn - which is just a domain-local linear
15  * address - into a real "machine address" which the CPU's MMU can
16  * use.
17  *
18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19  * inserted directly into the pagetable.  When creating a new
20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
22  * the mfn back into a pfn.
23  *
24  * The other constraint is that all pages which make up a pagetable
25  * must be mapped read-only in the guest.  This prevents uncontrolled
26  * guest updates to the pagetable.  Xen strictly enforces this, and
27  * will disallow any pagetable update which will end up mapping a
28  * pagetable page RW, and will disallow using any writable page as a
29  * pagetable.
30  *
31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
32  * would need to validate the whole pagetable before going on.
33  * Naturally, this is quite slow.  The solution is to "pin" a
34  * pagetable, which enforces all the constraints on the pagetable even
35  * when it is not actively in use.  This means that Xen can be assured
36  * that it is still valid when you do load it into %cr3, and doesn't
37  * need to revalidate it.
38  *
39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40  */
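/*
 * As a simplified illustration of the pfn <-> mfn translation described
 * above (these helpers are the ones used throughout this file; the
 * surrounding caller and values are hypothetical):
 *
 *	mfn = pfn_to_mfn(pfn);			consult the p2m table
 *	pte = mfn_pte(mfn, PAGE_KERNEL);	what ends up in the pagetable
 *	pfn = mfn_to_pfn(pte_mfn(pte));		reading back reverses it
 *
 * A zero (cleared) entry stays zero in both directions, so clearing
 * entries works as usual.
 */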
41 #include <linux/sched.h>
42 #include <linux/highmem.h>
43 #include <linux/bug.h>
44 
45 #include <asm/pgtable.h>
46 #include <asm/tlbflush.h>
47 #include <asm/mmu_context.h>
48 #include <asm/paravirt.h>
49 
50 #include <asm/xen/hypercall.h>
51 #include <asm/xen/hypervisor.h>
52 
53 #include <xen/page.h>
54 #include <xen/interface/xen.h>
55 
56 #include "multicalls.h"
57 #include "mmu.h"
58 
59 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
60 {
61 	pte_t *pte = lookup_address(address);
62 	unsigned offset = address & ~PAGE_MASK;	/* offset within the page */
63 
64 	BUG_ON(pte == NULL);
65 
66 	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
67 }
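/*
 * A typical (hypothetical) use of arbitrary_virt_to_machine(): obtaining
 * the machine address of some kernel object so it can be handed to a
 * hypercall that expects machine addresses.  Because it walks the live
 * pagetable via lookup_address(), it works for any mapped kernel virtual
 * address, not just the lowmem direct mapping:
 *
 *	xmaddr_t maddr = arbitrary_virt_to_machine((unsigned long)obj);
 *	some_op.machine_addr = maddr.maddr;	hypothetical hypercall argument
 */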
68 
69 void make_lowmem_page_readonly(void *vaddr)
70 {
71 	pte_t *pte, ptev;
72 	unsigned long address = (unsigned long)vaddr;
73 
74 	pte = lookup_address(address);
75 	BUG_ON(pte == NULL);
76 
77 	ptev = pte_wrprotect(*pte);
78 
79 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
80 		BUG();
81 }
82 
83 void make_lowmem_page_readwrite(void *vaddr)
84 {
85 	pte_t *pte, ptev;
86 	unsigned long address = (unsigned long)vaddr;
87 
88 	pte = lookup_address(address);
89 	BUG_ON(pte == NULL);
90 
91 	ptev = pte_mkwrite(*pte);
92 
93 	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
94 		BUG();
95 }
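/*
 * A sketch of how the two helpers above are typically paired (the caller
 * here is hypothetical): a lowmem page must be mapped read-only before
 * Xen will accept it as a pagetable page, and is made read-write again
 * once it stops being one.
 *
 *	make_lowmem_page_readonly(page_address(pt_page));
 *	... page is now acceptable to Xen as a pte/pmd page ...
 *	make_lowmem_page_readwrite(page_address(pt_page));
 */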
96 
97 
98 void xen_set_pmd(pmd_t *ptr, pmd_t val)
99 {
100 	struct multicall_space mcs;
101 	struct mmu_update *u;
102 
103 	preempt_disable();
104 
105 	mcs = xen_mc_entry(sizeof(*u));
106 	u = mcs.args;
107 	u->ptr = virt_to_machine(ptr).maddr;
108 	u->val = pmd_val_ma(val);
109 	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
110 
111 	xen_mc_issue(PARAVIRT_LAZY_MMU);
112 
113 	preempt_enable();
114 }
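/*
 * xen_set_pmd() above shows the batching pattern used by most updates in
 * this file: xen_mc_entry() reserves room in the per-cpu multicall
 * buffer, a MULTI_* helper fills in one call, and xen_mc_issue() either
 * flushes immediately or, if we are inside a lazy-MMU section, leaves
 * the call queued so that several updates go down in one hypercall.
 * A hypothetical caller batching two pmd updates:
 *
 *	arch_enter_lazy_mmu_mode();
 *	set_pmd(pmdp_a, val_a);		only queued in the multicall buffer
 *	set_pmd(pmdp_b, val_b);		queued as well, still no hypercall
 *	arch_leave_lazy_mmu_mode();	a single multicall flushes both
 */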
115 
116 /*
117  * Associate a virtual page frame with a given physical page frame
118  * and protection flags for that frame.
119  */
120 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
121 {
122 	pgd_t *pgd;
123 	pud_t *pud;
124 	pmd_t *pmd;
125 	pte_t *pte;
126 
127 	pgd = swapper_pg_dir + pgd_index(vaddr);
128 	if (pgd_none(*pgd)) {
129 		BUG();
130 		return;
131 	}
132 	pud = pud_offset(pgd, vaddr);
133 	if (pud_none(*pud)) {
134 		BUG();
135 		return;
136 	}
137 	pmd = pmd_offset(pud, vaddr);
138 	if (pmd_none(*pmd)) {
139 		BUG();
140 		return;
141 	}
142 	pte = pte_offset_kernel(pmd, vaddr);
143 	/* <mfn,flags> stored as-is, to permit clearing entries */
144 	xen_set_pte(pte, mfn_pte(mfn, flags));
145 
146 	/*
147 	 * It's enough to flush this one mapping.
148 	 * (PGE mappings get flushed as well)
149 	 */
150 	__flush_tlb_one(vaddr);
151 }
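/*
 * set_pte_mfn() takes the machine frame directly, which makes it useful
 * for mapping frames that have no pfn of their own (frames provided by
 * the hypervisor, for instance).  A hypothetical caller mapping and then
 * clearing such a frame:
 *
 *	set_pte_mfn(vaddr, mfn, PAGE_KERNEL);
 *	... access the frame through vaddr ...
 *	set_pte_mfn(vaddr, 0, __pgprot(0));	stores a zero pte, clearing it
 */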
152 
153 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
154 		    pte_t *ptep, pte_t pteval)
155 {
156 	if (mm == current->mm || mm == &init_mm) {
157 		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
158 			struct multicall_space mcs;
159 			mcs = xen_mc_entry(0);
160 
161 			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
162 			xen_mc_issue(PARAVIRT_LAZY_MMU);
163 			return;
164 		} else
165 			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
166 				return;
167 	}
168 	xen_set_pte(ptep, pteval);
169 }
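/*
 * The fast path above is keyed on the virtual address: update_va_mapping
 * lets Xen find and validate the pte itself, so we never need the
 * machine address of the pte page.  For a foreign mm (neither current->mm
 * nor init_mm) the virtual address means nothing here, so we fall back to
 * xen_set_pte() on the pte pointer.  Inside a lazy-MMU section a series
 * of updates is coalesced, e.g. (hypothetical caller):
 *
 *	arch_enter_lazy_mmu_mode();
 *	set_pte_at(mm, addr1, ptep1, pteval1);	each call only queued
 *	set_pte_at(mm, addr2, ptep2, pteval2);
 *	arch_leave_lazy_mmu_mode();		flushed as one multicall
 */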
170 
171 #ifdef CONFIG_X86_PAE
172 void xen_set_pud(pud_t *ptr, pud_t val)
173 {
174 	struct multicall_space mcs;
175 	struct mmu_update *u;
176 
177 	preempt_disable();
178 
179 	mcs = xen_mc_entry(sizeof(*u));
180 	u = mcs.args;
181 	u->ptr = virt_to_machine(ptr).maddr;
182 	u->val = pud_val_ma(val);
183 	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
184 
185 	xen_mc_issue(PARAVIRT_LAZY_MMU);
186 
187 	preempt_enable();
188 }
189 
190 void xen_set_pte(pte_t *ptep, pte_t pte)
191 {
192 	ptep->pte_high = pte.pte_high;
193 	smp_wmb();		/* make sure high gets written first */
194 	ptep->pte_low = pte.pte_low;
195 }
196 
197 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
198 {
199 	set_64bit((u64 *)ptep, pte_val_ma(pte));
200 }
201 
202 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
203 {
204 	ptep->pte_low = 0;
205 	smp_wmb();		/* make sure low gets written first */
206 	ptep->pte_high = 0;
207 }
208 
209 void xen_pmd_clear(pmd_t *pmdp)
210 {
211 	xen_set_pmd(pmdp, __pmd(0));
212 }
213 
214 unsigned long long xen_pte_val(pte_t pte)
215 {
216 	unsigned long long ret = 0;
217 
218 	if (pte.pte_low) {
219 		ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
220 		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
221 	}
222 
223 	return ret;
224 }
225 
226 unsigned long long xen_pmd_val(pmd_t pmd)
227 {
228 	unsigned long long ret = pmd.pmd;
229 	if (ret)
230 		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
231 	return ret;
232 }
233 
234 unsigned long long xen_pgd_val(pgd_t pgd)
235 {
236 	unsigned long long ret = pgd.pgd;
237 	if (ret)
238 		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
239 	return ret;
240 }
241 
242 pte_t xen_make_pte(unsigned long long pte)
243 {
244 	if (pte & 1)
245 		pte = phys_to_machine(XPADDR(pte)).maddr;
246 
247 	return (pte_t){ pte, pte >> 32 };
248 }
249 
250 pmd_t xen_make_pmd(unsigned long long pmd)
251 {
252 	if (pmd & 1)
253 		pmd = phys_to_machine(XPADDR(pmd)).maddr;
254 
255 	return (pmd_t){ pmd };
256 }
257 
258 pgd_t xen_make_pgd(unsigned long long pgd)
259 {
260 	if (pgd & _PAGE_PRESENT)
261 		pgd = phys_to_machine(XPADDR(pgd)).maddr;
262 
263 	return (pgd_t){ pgd };
264 }
265 #else  /* !PAE */
266 void xen_set_pte(pte_t *ptep, pte_t pte)
267 {
268 	*ptep = pte;
269 }
270 
271 unsigned long xen_pte_val(pte_t pte)
272 {
273 	unsigned long ret = pte.pte_low;
274 
275 	if (ret & _PAGE_PRESENT)
276 		ret = machine_to_phys(XMADDR(ret)).paddr;
277 
278 	return ret;
279 }
280 
281 unsigned long xen_pgd_val(pgd_t pgd)
282 {
283 	unsigned long ret = pgd.pgd;
284 	if (ret)
285 		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
286 	return ret;
287 }
288 
289 pte_t xen_make_pte(unsigned long pte)
290 {
291 	if (pte & _PAGE_PRESENT)
292 		pte = phys_to_machine(XPADDR(pte)).maddr;
293 
294 	return (pte_t){ pte };
295 }
296 
297 pgd_t xen_make_pgd(unsigned long pgd)
298 {
299 	if (pgd & _PAGE_PRESENT)
300 		pgd = phys_to_machine(XPADDR(pgd)).maddr;
301 
302 	return (pgd_t){ pgd };
303 }
304 #endif	/* CONFIG_X86_PAE */
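/*
 * The conversion helpers above come in pairs: xen_make_*() turns a
 * pfn-based entry into the mfn-based form that Xen and the hardware see,
 * and xen_*_val() translates it back.  A hypothetical round trip for a
 * present pte, assuming the p2m and m2p tables agree for this pfn:
 *
 *	pteval = ((unsigned long long)pfn << PAGE_SHIFT)
 *			| pgprot_val(PAGE_KERNEL);
 *	pte = xen_make_pte(pteval);	the stored word now holds the mfn
 *	... xen_pte_val(pte) recovers pteval on the way back ...
 */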
305 
306 enum pt_level {
307 	PT_PGD,
308 	PT_PUD,
309 	PT_PMD,
310 	PT_PTE
311 };
312 
313 /*
314   (Yet another) pagetable walker.  This one is intended for pinning a
315   pagetable.  This means that it walks a pagetable and calls the
316   callback function on each page it finds making up the page table,
317   at every level.  It walks the entire pagetable, but it only bothers
318  pinning pte pages which are below the limit.  In the normal case
319   this will be TASK_SIZE, but at boot we need to pin up to
320   FIXADDR_TOP.  But the important bit is that we don't pin beyond
321   there, because then we start getting into Xen's ptes.
322 */
323 static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
324 		    unsigned long limit)
325 {
326 	pgd_t *pgd = pgd_base;
327 	int flush = 0;
328 	unsigned long addr = 0;
329 	unsigned long pgd_next;
330 
331 	BUG_ON(limit > FIXADDR_TOP);
332 
333 	if (xen_feature(XENFEAT_auto_translated_physmap))
334 		return 0;
335 
336 	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
337 		pud_t *pud;
338 		unsigned long pud_limit, pud_next;
339 
340 		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
341 
342 		if (!pgd_val(*pgd))
343 			continue;
344 
345 		pud = pud_offset(pgd, 0);
346 
347 		if (PTRS_PER_PUD > 1) /* not folded */
348 			flush |= (*func)(virt_to_page(pud), PT_PUD);
349 
350 		for (; addr != pud_limit; pud++, addr = pud_next) {
351 			pmd_t *pmd;
352 			unsigned long pmd_limit;
353 
354 			pud_next = pud_addr_end(addr, pud_limit);
355 
356 			if (pud_next < limit)
357 				pmd_limit = pud_next;
358 			else
359 				pmd_limit = limit;
360 
361 			if (pud_none(*pud))
362 				continue;
363 
364 			pmd = pmd_offset(pud, 0);
365 
366 			if (PTRS_PER_PMD > 1) /* not folded */
367 				flush |= (*func)(virt_to_page(pmd), PT_PMD);
368 
369 			for (; addr != pmd_limit; pmd++) {
370 				addr += (PAGE_SIZE * PTRS_PER_PTE);
371 				if ((pmd_limit-1) < (addr-1)) {	/* overflow-safe: addr may have wrapped to 0 */
372 					addr = pmd_limit;
373 					break;
374 				}
375 
376 				if (pmd_none(*pmd))
377 					continue;
378 
379 				flush |= (*func)(pmd_page(*pmd), PT_PTE);
380 			}
381 		}
382 	}
383 
384 	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
385 
386 	return flush;
387 }
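/*
 * A minimal (hypothetical) pgd_walk() callback, just to show the
 * contract: it is handed the struct page of every page making up the
 * pagetable, together with its level, and a non-zero return value is
 * ORed into the walker's "needs a kmap flush" result.
 *
 *	static int count_pt_page(struct page *page, enum pt_level level)
 *	{
 *		nr_pt_pages++;		hypothetical counter
 *		return 0;		nothing to flush
 *	}
 *
 *	pgd_walk(mm->pgd, count_pt_page, TASK_SIZE);
 */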
388 
389 static spinlock_t *lock_pte(struct page *page)
390 {
391 	spinlock_t *ptl = NULL;
392 
393 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
394 	ptl = __pte_lockptr(page);
395 	spin_lock(ptl);
396 #endif
397 
398 	return ptl;
399 }
400 
401 static void do_unlock(void *v)
402 {
403 	spinlock_t *ptl = v;
404 	spin_unlock(ptl);
405 }
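/*
 * lock_pte()/do_unlock() bracket the pte lock across a multicall batch:
 * the lock is taken before the update is queued and only dropped, via
 * xen_mc_callback(), once the batch has actually been issued.  This is
 * the pattern pin_page() and unpin_page() below use (sketch):
 *
 *	ptl = lock_pte(page);
 *	... queue MULTI_update_va_mapping and/or xen_do_pin ...
 *	if (ptl)
 *		xen_mc_callback(do_unlock, ptl);
 */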
406 
407 static void xen_do_pin(unsigned level, unsigned long pfn)
408 {
409 	struct mmuext_op *op;
410 	struct multicall_space mcs;
411 
412 	mcs = __xen_mc_entry(sizeof(*op));
413 	op = mcs.args;
414 	op->cmd = level;
415 	op->arg1.mfn = pfn_to_mfn(pfn);
416 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
417 }
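/*
 * xen_do_pin() only queues an mmuext op in a batch the caller has already
 * opened; every user in this file brackets it with xen_mc_batch() and
 * xen_mc_issue().  A stand-alone (hypothetical) use would look like:
 *
 *	xen_mc_batch();
 *	xen_do_pin(MMUEXT_PIN_L1_TABLE, page_to_pfn(pte_page));
 *	xen_mc_issue(0);
 */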
418 
419 static int pin_page(struct page *page, enum pt_level level)
420 {
421 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
422 	int flush;
423 
424 	if (pgfl)
425 		flush = 0;		/* already pinned */
426 	else if (PageHighMem(page))
427 		/* kmaps need flushing if we found an unpinned
428 		   highpage */
429 		flush = 1;
430 	else {
431 		void *pt = lowmem_page_address(page);
432 		unsigned long pfn = page_to_pfn(page);
433 		struct multicall_space mcs = __xen_mc_entry(0);
434 		spinlock_t *ptl;
435 
436 		flush = 0;
437 
438 		ptl = NULL;
439 		if (level == PT_PTE)
440 			ptl = lock_pte(page);
441 
442 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
443 					pfn_pte(pfn, PAGE_KERNEL_RO),
444 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
445 
446 		if (level == PT_PTE)
447 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
448 
449 		if (ptl) {
450 			/* Queue a deferred unlock for when this batch
451 			   is completed. */
452 			xen_mc_callback(do_unlock, ptl);
453 		}
454 	}
455 
456 	return flush;
457 }
458 
459 /* This is called just after a mm has been created, but it has not
460    been used yet.  We need to make sure that its pagetable is all
461    read-only, and can be pinned. */
462 void xen_pgd_pin(pgd_t *pgd)
463 {
464 	unsigned level;
465 
466 	xen_mc_batch();
467 
468 	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
469 		/* re-enable interrupts for kmap_flush_unused */
470 		xen_mc_issue(0);
471 		kmap_flush_unused();
472 		xen_mc_batch();
473 	}
474 
475 #ifdef CONFIG_X86_PAE
476 	level = MMUEXT_PIN_L3_TABLE;
477 #else
478 	level = MMUEXT_PIN_L2_TABLE;
479 #endif
480 
481 	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
482 
483 	xen_mc_issue(0);
484 }
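/*
 * The usual callers take the mm's page_table_lock around the pin, as the
 * activate/dup paths later in this file do:
 *
 *	spin_lock(&mm->page_table_lock);
 *	xen_pgd_pin(mm->pgd);
 *	spin_unlock(&mm->page_table_lock);
 */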
485 
486 /* The init_mm pagetable is really pinned as soon as it's created, but
487    that's before we have page structures to store the bits.  So do all
488    the book-keeping now. */
489 static __init int mark_pinned(struct page *page, enum pt_level level)
490 {
491 	SetPagePinned(page);
492 	return 0;
493 }
494 
495 void __init xen_mark_init_mm_pinned(void)
496 {
497 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
498 }
499 
500 static int unpin_page(struct page *page, enum pt_level level)
501 {
502 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
503 
504 	if (pgfl && !PageHighMem(page)) {
505 		void *pt = lowmem_page_address(page);
506 		unsigned long pfn = page_to_pfn(page);
507 		spinlock_t *ptl = NULL;
508 		struct multicall_space mcs;
509 
510 		if (level == PT_PTE) {
511 			ptl = lock_pte(page);
512 
513 			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
514 		}
515 
516 		mcs = __xen_mc_entry(0);
517 
518 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
519 					pfn_pte(pfn, PAGE_KERNEL),
520 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
521 
522 		if (ptl) {
523 			/* unlock when batch completed */
524 			xen_mc_callback(do_unlock, ptl);
525 		}
526 	}
527 
528 	return 0;		/* never need to flush on unpin */
529 }
530 
531 /* Release a pagetable's pages back as normal RW */
532 static void xen_pgd_unpin(pgd_t *pgd)
533 {
534 	xen_mc_batch();
535 
536 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
537 
538 	pgd_walk(pgd, unpin_page, TASK_SIZE);
539 
540 	xen_mc_issue(0);
541 }
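/*
 * Unpinning mirrors xen_pgd_pin(): the UNPIN op is queued first, and only
 * then are the individual pages remapped read-write by unpin_page().  The
 * exit path below drives it like this (sketch of xen_exit_mmap()):
 *
 *	spin_lock(&mm->page_table_lock);
 *	if (PagePinned(virt_to_page(mm->pgd)))
 *		xen_pgd_unpin(mm->pgd);
 *	spin_unlock(&mm->page_table_lock);
 */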
542 
543 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
544 {
545 	spin_lock(&next->page_table_lock);
546 	xen_pgd_pin(next->pgd);
547 	spin_unlock(&next->page_table_lock);
548 }
549 
550 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
551 {
552 	spin_lock(&mm->page_table_lock);
553 	xen_pgd_pin(mm->pgd);
554 	spin_unlock(&mm->page_table_lock);
555 }
556 
557 
558 #ifdef CONFIG_SMP
558 /* Another cpu may still have its %cr3 pointing at the pagetable, so
560    we need to repoint it somewhere else before we can unpin it. */
561 static void drop_other_mm_ref(void *info)
562 {
563 	struct mm_struct *mm = info;
564 
565 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
566 		leave_mm(smp_processor_id());
567 
568 	/* If this cpu still has a stale cr3 reference, then make sure
569 	   it has been flushed. */
570 	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
571 		load_cr3(swapper_pg_dir);
572 		arch_flush_lazy_cpu_mode();
573 	}
574 }
575 
576 static void drop_mm_ref(struct mm_struct *mm)
577 {
578 	cpumask_t mask;
579 	unsigned cpu;
580 
581 	if (current->active_mm == mm) {
582 		if (current->mm == mm)
583 			load_cr3(swapper_pg_dir);
584 		else
585 			leave_mm(smp_processor_id());
586 		arch_flush_lazy_cpu_mode();
587 	}
588 
589 	/* Get the "official" set of cpus referring to our pagetable. */
590 	mask = mm->cpu_vm_mask;
591 
592 	/* It's possible that a vcpu may have a stale reference to our
593 	   cr3, because it's in lazy mode, and it hasn't yet flushed
594 	   its set of pending hypercalls.  In this case, we can
595 	   look at its actual current cr3 value, and force it to flush
596 	   if needed. */
597 	for_each_online_cpu(cpu) {
598 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
599 			cpu_set(cpu, mask);
600 	}
601 
602 	if (!cpus_empty(mask))
603 		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
604 }
605 #else
606 static void drop_mm_ref(struct mm_struct *mm)
607 {
608 	if (current->active_mm == mm)
609 		load_cr3(swapper_pg_dir);
610 }
611 #endif
612 
613 /*
614  * While a process runs, Xen pins its pagetables, which means that the
615  * hypervisor forces them to be read-only and controls all updates
616  * to them.  This means that all pagetable updates have to go via the
617  * hypervisor, which is moderately expensive.
618  *
619  * Since we're pulling the pagetable down, we switch to init_mm,
620  * unpin the old process's pagetable and mark it all read-write, which
621  * allows further operations on it to be simple memory accesses.
622  *
623  * The only subtle point is that another CPU may still be using the
624  * pagetable because of lazy tlb flushing.  This means we need to
625  * switch all CPUs off this pagetable before we can unpin it.
626  */
627 void xen_exit_mmap(struct mm_struct *mm)
628 {
629 	get_cpu();		/* make sure we don't move around */
630 	drop_mm_ref(mm);
631 	put_cpu();
632 
633 	spin_lock(&mm->page_table_lock);
634 
635 	/* pgd may not be pinned in the error exit path of execve */
636 	if (PagePinned(virt_to_page(mm->pgd)))
637 		xen_pgd_unpin(mm->pgd);
638 
639 	spin_unlock(&mm->page_table_lock);
640 }
641