xref: /openbmc/linux/arch/x86/xen/mmu.c (revision f42b3800)
/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

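/*
 * Editor's illustrative sketch (hedged; not part of the original file
 * and never called): it only shows the pfn -> mfn direction described
 * in the header comment, using helpers this file already relies on
 * (pfn_to_mfn() and mfn_pte() from the Xen page headers above).  The
 * function name is purely hypothetical.
 */
static inline pte_t example_pte_for_pfn(unsigned long pfn)
{
	/* Translate the guest-local pfn into the real machine frame number. */
	unsigned long mfn = pfn_to_mfn(pfn);

	/* Build a pte carrying the mfn, which is what Xen expects to
	   find in a pinned pagetable. */
	return mfn_pte(mfn, PAGE_KERNEL);
}
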
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
	unsigned int level;
	pte_t *pte = lookup_address(address, &level);
	unsigned offset = address & ~PAGE_MASK;	/* offset within the page */

	BUG_ON(pte == NULL);

	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}


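/*
 * Once a pagetable is pinned its pages are mapped read-only in the
 * guest, so a pmd entry cannot simply be stored; the new value is
 * handed to Xen as an mmu_update request, batched through the
 * multicall machinery so it can be coalesced with other pending
 * updates while in lazy MMU mode.
 */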
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pmd_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <mfn,flags> stored as-is, to permit clearing entries */
	xen_set_pte(pte, mfn_pte(mfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
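
/*
 * Hedged usage sketch (editor's addition, never called; the vaddr and
 * mfn are purely illustrative): shows the intended pattern for
 * installing a specific machine frame at a kernel virtual address.
 * The address must already be covered by pte-level pagetables, since
 * set_pte_mfn() BUGs otherwise; it flushes the single TLB entry itself.
 */
static inline void example_map_machine_frame(unsigned long vaddr,
					     unsigned long mfn)
{
	/* Install <mfn, PAGE_KERNEL> at vaddr in the kernel pagetable. */
	set_pte_mfn(vaddr, mfn, PAGE_KERNEL);
}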

void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	if (mm == current->mm || mm == &init_mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs;
			mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			return;
		} else
			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
				return;
	}
	xen_set_pte(ptep, pteval);
}

#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;
	u->ptr = virt_to_machine(ptr).maddr;
	u->val = pud_val_ma(val);
	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

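/*
 * A PAE pte is two 32-bit words, so it cannot be written in one store
 * here.  The high word goes first and the low word (which carries the
 * present bit) last, with a write barrier in between, so the entry is
 * never seen as present while its high half is still stale.
 * xen_pte_clear() below does the mirror image: low word first.
 */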
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
}

void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, pte_val_ma(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
	xen_set_pmd(pmdp, __pmd(0));
}

unsigned long long xen_pte_val(pte_t pte)
{
	unsigned long long ret = 0;

	if (pte.pte_low) {
		ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	}

	return ret;
}

unsigned long long xen_pmd_val(pmd_t pmd)
{
	unsigned long long ret = pmd.pmd;
	if (ret)
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	return ret;
}

unsigned long long xen_pgd_val(pgd_t pgd)
{
	unsigned long long ret = pgd.pgd;
	if (ret)
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	return ret;
}

pte_t xen_make_pte(unsigned long long pte)
{
	if (pte & _PAGE_PRESENT) {
		pte = phys_to_machine(XPADDR(pte)).maddr;
		pte &= ~(_PAGE_PCD | _PAGE_PWT);
	}

	return (pte_t){ .pte = pte };
}

pmd_t xen_make_pmd(unsigned long long pmd)
{
	if (pmd & 1)
		pmd = phys_to_machine(XPADDR(pmd)).maddr;

	return (pmd_t){ pmd };
}

pgd_t xen_make_pgd(unsigned long long pgd)
{
	if (pgd & _PAGE_PRESENT)
		pgd = phys_to_machine(XPADDR(pgd)).maddr;

	return (pgd_t){ pgd };
}
#else  /* !PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	*ptep = pte;
}

unsigned long xen_pte_val(pte_t pte)
{
	unsigned long ret = pte.pte_low;

	if (ret & _PAGE_PRESENT)
		ret = machine_to_phys(XMADDR(ret)).paddr;

	return ret;
}

unsigned long xen_pgd_val(pgd_t pgd)
{
	unsigned long ret = pgd.pgd;
	if (ret)
		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
	return ret;
}

pte_t xen_make_pte(unsigned long pte)
{
	if (pte & _PAGE_PRESENT) {
		pte = phys_to_machine(XPADDR(pte)).maddr;
		pte &= ~(_PAGE_PCD | _PAGE_PWT);
	}

	return (pte_t){ pte };
}

pgd_t xen_make_pgd(unsigned long pgd)
{
	if (pgd & _PAGE_PRESENT)
		pgd = phys_to_machine(XPADDR(pgd)).maddr;

	return (pgd_t){ pgd };
}
#endif	/* CONFIG_X86_PAE */

/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  This means that it walks a pagetable and calls the
  callback function on each page it finds making up the page table,
  at every level.  It walks the entire pagetable, but it only bothers
  pinning pte pages which are below the limit.  In the normal case
  this will be TASK_SIZE, but at boot we need to pin up to
  FIXADDR_TOP.  But the important bit is that we don't pin beyond
  there, because then we start getting into Xen's ptes.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
		    unsigned long limit)
{
	pgd_t *pgd = pgd_base;
	int flush = 0;
	unsigned long addr = 0;
	unsigned long pgd_next;

	BUG_ON(limit > FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
		unsigned long pud_limit, pud_next;

		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(virt_to_page(pud), PT_PUD);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(virt_to_page(pmd), PT_PMD);

			for (; addr != pmd_limit; pmd++) {
				addr += (PAGE_SIZE * PTRS_PER_PTE);
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

				if (pmd_none(*pmd))
					continue;

				flush |= (*func)(pmd_page(*pmd), PT_PTE);
			}
		}
	}

	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

	return flush;
}
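
/*
 * Hedged sketch of pgd_walk()'s callback contract (editor's addition,
 * never called): the walker invokes the callback once per page backing
 * the pagetable, tagged with its level, and ORs the results together;
 * a nonzero return asks the caller to flush once the walk completes.
 */
static inline int example_walk_callback(struct page *page,
					enum pt_level level)
{
	if (PageHighMem(page))
		return 1;	/* e.g. ask for a flush for highmem pages */

	return 0;		/* nothing to flush for this page */
}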

static spinlock_t *lock_pte(struct page *page)
{
	spinlock_t *ptl = NULL;

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	ptl = __pte_lockptr(page);
	spin_lock(ptl);
#endif

	return ptl;
}

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

static int pin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		ptl = NULL;
		if (level == PT_PTE)
			ptl = lock_pte(page);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (level == PT_PTE)
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

		if (ptl) {
			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
	unsigned level;

	xen_mc_batch();

	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

#ifdef CONFIG_X86_PAE
	level = MMUEXT_PIN_L3_TABLE;
#else
	level = MMUEXT_PIN_L2_TABLE;
#endif

	xen_do_pin(level, PFN_DOWN(__pa(pgd)));

	xen_mc_issue(0);
}

/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now. */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

static int unpin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		if (level == PT_PTE) {
			ptl = lock_pte(page);

			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;

	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
		load_cr3(swapper_pg_dir);
		arch_flush_lazy_cpu_mode();
	}
}

static void drop_mm_ref(struct mm_struct *mm)
{
	cpumask_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
		arch_flush_lazy_cpu_mode();
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	mask = mm->cpu_vm_mask;

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its
	   set of pending hypercalls.  In this case, we can look at
	   its actual current cr3 value, and force it to flush if
	   needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpu_set(cpu, mask);
	}

	if (!cpus_empty(mask))
		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetable, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to init_mm,
 * unpin the old process's pagetable and mark it all read-write,
 * which allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (PagePinned(virt_to_page(mm->pgd)))
		xen_pgd_unpin(mm->pgd);

	spin_unlock(&mm->page_table_lock);
}
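
/*
 * Editor's summary sketch of the pagetable lifecycle implemented above
 * (hedged; it paraphrases the code rather than adding behaviour):
 *
 *	xen_activate_mm()/xen_dup_mmap() -> xen_pgd_pin()
 *		walk the pagetable making every backing page RO (pinning
 *		each L1 table along the way), then issue the top-level
 *		MMUEXT_PIN_L{2,3}_TABLE pin;
 *	xen_exit_mmap() -> drop_mm_ref() + xen_pgd_unpin()
 *		make sure no cpu still has the pagetable in %cr3, unpin
 *		the top level, then walk it again restoring RW mappings.
 */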
641