xref: /openbmc/linux/arch/x86/xen/p2m.c (revision df2634f43f5106947f3735a0b61a6527a4b278cd)
/*
 * Xen leaves the responsibility for maintaining p2m mappings to the
 * guests themselves, but it must also access and update the p2m array
 * during suspend/resume when all the pages are reallocated.
 *
 * The p2m table is logically a flat array, but we implement it as a
 * three-level tree to allow the address space to be sparse.
 *
 *                               Xen
 *                                |
 *     p2m_top              p2m_top_mfn
 *       /  \                   /   \
 * p2m_mid p2m_mid	p2m_mid_mfn p2m_mid_mfn
 *    / \      / \         /           /
 *  p2m p2m p2m p2m p2m p2m p2m ...
 *
 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
 *
 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
 * maximum representable pseudo-physical address space is:
 *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
 *
 * P2M_PER_PAGE depends on the architecture, as an mfn is always
 * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
 * 512 and 1024 entries respectively.
 */
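
/*
 * Worked example (illustration only, assuming 4 KiB pages): on 64-bit,
 * sizeof(unsigned long) == 8, so each level holds 512 entries and
 * MAX_P2M_PFN = 512 * 512 * 512 = 134,217,728 pfns, i.e. 512 GiB of
 * pseudo-physical address space.  On 32-bit each level holds 1024
 * entries, giving 1024^3 pfns, i.e. 4 TiB.
 */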

#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>

#include <asm/cache.h>
#include <asm/setup.h>

#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include "xen-ops.h"

static void __init m2p_override_init(void);

unsigned long xen_max_p2m_pfn __read_mostly;

#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))

#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)

/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);

static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);

RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_P2M_PFN);
	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}

static inline unsigned p2m_mid_index(unsigned long pfn)
{
	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_PER_PAGE;
}
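
/*
 * Index decomposition, for illustration only: on 64-bit (512 entries
 * per level), pfn 0x12345 (74565) splits into
 *   topidx = 74565 / (512 * 512) = 0
 *   mididx = (74565 / 512) % 512 = 145
 *   idx    = 74565 % 512         = 325
 * so its entry lives at p2m_top[0][145][325].
 */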

static void p2m_top_init(unsigned long ***top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing;
}

static void p2m_top_mfn_init(unsigned long *top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}

static void p2m_top_mfn_p_init(unsigned long **top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing_mfn;
}

static void p2m_mid_init(unsigned long **mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = p2m_missing;
}

static void p2m_mid_mfn_init(unsigned long *mid)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = virt_to_mfn(p2m_missing);
}

static void p2m_init(unsigned long *p2m)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = INVALID_P2M_ENTRY;
}
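
/*
 * Note: every level is pre-populated with pointers to these shared
 * "missing" pages, so a lookup such as the one in get_phys_to_machine()
 * can always dereference three levels without NULL checks; a hole in
 * the address space simply resolves to INVALID_P2M_ENTRY.
 */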

/*
 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
 *
 * This is called both at boot time, and after resuming from suspend:
 * - At boot time we're called very early, and must use extend_brk()
 *   to allocate memory.
 *
 * - After resume we're called from within stop_machine, but the mfn
 *   tree should already be completely allocated.
 */
void xen_build_mfn_list_list(void)
{
	unsigned long pfn;

	/* Pre-initialize p2m_top_mfn to be completely missing */
	if (p2m_top_mfn == NULL) {
		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_mid_mfn_init(p2m_mid_missing_mfn);

		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_p_init(p2m_top_mfn_p);

		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
		p2m_top_mfn_init(p2m_top_mfn);
	} else {
		/* Reinitialise; mfns all change after migration */
		p2m_mid_mfn_init(p2m_mid_missing_mfn);
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);
		unsigned long **mid;
		unsigned long *mid_mfn_p;

		mid = p2m_top[topidx];
		mid_mfn_p = p2m_top_mfn_p[topidx];

		/* Don't bother allocating any mfn mid levels if
		 * they're just missing; just update the stored mfn,
		 * since all of them could have changed over a migration.
		 */
		if (mid == p2m_mid_missing) {
			BUG_ON(mididx);
			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
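			/*
			 * Together with the loop's own pfn += P2M_PER_PAGE,
			 * this advances pfn by P2M_MID_PER_PAGE * P2M_PER_PAGE,
			 * i.e. straight to the start of the next top-level entry.
			 */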
			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
			continue;
		}

		if (mid_mfn_p == p2m_mid_missing_mfn) {
			/*
			 * XXX boot-time only!  We should never find
			 * missing parts of the mfn tree after
			 * runtime.  extend_brk() will BUG if we call
			 * it too late.
			 */
			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_mfn_init(mid_mfn_p);

			p2m_top_mfn_p[topidx] = mid_mfn_p;
		}

		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
	}
}

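/*
 * Publish the top level of the mfn tree to Xen via shared_info.
 * (Presumably this is also how the save/restore toolstack locates the
 * guest's p2m, but that side lives outside this file.)
 */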
void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn);
	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned long pfn;

	xen_max_p2m_pfn = max_pfn;

	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_init(p2m_missing);

	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_mid_init(p2m_mid_missing);

	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
	p2m_top_init(p2m_top);

	/*
	 * The domain builder gives us a pre-constructed p2m array in
	 * mfn_list for all the pages initially given to us, so we just
	 * need to graft that into our tree structure.
	 */
	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);
		unsigned mididx = p2m_mid_index(pfn);

		if (p2m_top[topidx] == p2m_mid_missing) {
			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
			p2m_mid_init(mid);

			p2m_top[topidx] = mid;
		}

		/*
		 * As long as the mfn_list has enough entries to completely
		 * fill a p2m page, pointing into the array is ok. But if
		 * not, the entries beyond the last pfn will be undefined.
		 */
		if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
			unsigned long p2midx;

			p2midx = max_pfn % P2M_PER_PAGE;
			for ( ; p2midx < P2M_PER_PAGE; p2midx++)
				mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
		}
		p2m_top[topidx][mididx] = &mfn_list[pfn];
	}

	m2p_override_init();
}

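/*
 * Look up the mfn currently associated with a pfn.  A minimal usage
 * sketch (illustrative only):
 *
 *	unsigned long mfn = get_phys_to_machine(pfn);
 *	if (mfn == INVALID_P2M_ENTRY)
 *		... pfn has no machine page behind it ...
 *	else if (mfn & FOREIGN_FRAME_BIT)
 *		... pfn is backed by a foreign (overridden) frame ...
 */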
unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);

static void *alloc_p2m_page(void)
{
	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}

static void free_p2m_page(void *p)
{
	free_page((unsigned long)p);
}

/*
 * Fully allocate the p2m structure for a given pfn.  We need to check
 * that both the top and mid levels are allocated, and make sure the
 * parallel mfn tree is kept in sync.  We may race with other cpus, so
 * the new pages are installed with cmpxchg; if we lose the race then
 * simply free the page we allocated and use the one that's there.
 */
static bool alloc_p2m(unsigned long pfn)
{
	unsigned topidx, mididx;
	unsigned long ***top_p, **mid;
	unsigned long *top_mfn_p, *mid_mfn;

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);

	top_p = &p2m_top[topidx];
	mid = *top_p;

	if (mid == p2m_mid_missing) {
		/* Mid level is missing, allocate a new one */
		mid = alloc_p2m_page();
		if (!mid)
			return false;

		p2m_mid_init(mid);

		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
			free_p2m_page(mid);
	}

	top_mfn_p = &p2m_top_mfn[topidx];
	mid_mfn = p2m_top_mfn_p[topidx];

	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);

	if (mid_mfn == p2m_mid_missing_mfn) {
		/* Separately check the mid mfn level */
		unsigned long missing_mfn;
		unsigned long mid_mfn_mfn;

		mid_mfn = alloc_p2m_page();
		if (!mid_mfn)
			return false;

		p2m_mid_mfn_init(mid_mfn);

		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
		mid_mfn_mfn = virt_to_mfn(mid_mfn);
		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
			free_p2m_page(mid_mfn);
		else
			p2m_top_mfn_p[topidx] = mid_mfn;
	}

	if (p2m_top[topidx][mididx] == p2m_missing) {
		/* p2m leaf page is missing */
		unsigned long *p2m;

		p2m = alloc_p2m_page();
		if (!p2m)
			return false;

		p2m_init(p2m);

		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
			free_p2m_page(p2m);
		else
			mid_mfn[mididx] = virt_to_mfn(p2m);
	}

	return true;
}

/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, mididx, idx;

	if (unlikely(pfn >= MAX_P2M_PFN)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return true;
	}

	topidx = p2m_top_index(pfn);
	mididx = p2m_mid_index(pfn);
	idx = p2m_index(pfn);

	if (p2m_top[topidx][mididx] == p2m_missing)
		return mfn == INVALID_P2M_ENTRY;

	p2m_top[topidx][mididx][idx] = mfn;

	return true;
}

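/*
 * Set the p2m entry for a pfn, allocating missing levels on demand.
 * The fast path is __set_phys_to_machine(); only when that fails
 * because the leaf page is still the shared p2m_missing placeholder
 * do we call alloc_p2m() and retry.
 */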
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return true;
	}

	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		if (!alloc_p2m(pfn))
			return false;

		if (!__set_phys_to_machine(pfn, mfn))
			return false;
	}

	return true;
}

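/*
 * M2P override: a small hash table, keyed by mfn, of local pages that
 * currently shadow a foreign machine frame (e.g. a page granted by
 * another domain and mapped here).  The p2m entry for such a page is
 * set to FOREIGN_FRAME(mfn), and m2p_find_override_pfn() lets reverse
 * (machine-to-physical) lookups resolve the foreign mfn back to the
 * local pfn.
 */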
#define M2P_OVERRIDE_HASH_SHIFT	10
#define M2P_OVERRIDE_HASH	(1 << M2P_OVERRIDE_HASH_SHIFT)

static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
static DEFINE_SPINLOCK(m2p_override_lock);

static void __init m2p_override_init(void)
{
	unsigned i;

	m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
				   sizeof(unsigned long));

	for (i = 0; i < M2P_OVERRIDE_HASH; i++)
		INIT_LIST_HEAD(&m2p_overrides[i]);
}

static unsigned long mfn_hash(unsigned long mfn)
{
	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}

/* Add an MFN override for a particular page */
int m2p_add_override(unsigned long mfn, struct page *page)
{
	unsigned long flags;
	unsigned long pfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
					"m2p_add_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	page->private = mfn;
	page->index = pfn_to_mfn(pfn);

	__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
	if (!PageHighMem(page))
		/* Just zap old mapping for now */
		pte_clear(&init_mm, address, ptep);

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return 0;
}

int m2p_remove_override(struct page *page)
{
	unsigned long flags;
	unsigned long mfn;
	unsigned long pfn;
	unsigned long address;
	unsigned level;
	pte_t *ptep = NULL;

	pfn = page_to_pfn(page);
	mfn = get_phys_to_machine(pfn);
	if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
		return -EINVAL;

	if (!PageHighMem(page)) {
		address = (unsigned long)__va(pfn << PAGE_SHIFT);
		ptep = lookup_address(address, &level);

		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
					"m2p_remove_override: pfn %lx not mapped", pfn))
			return -EINVAL;
	}

	spin_lock_irqsave(&m2p_override_lock, flags);
	list_del(&page->lru);
	spin_unlock_irqrestore(&m2p_override_lock, flags);
	__set_phys_to_machine(pfn, page->index);

	if (!PageHighMem(page))
		set_pte_at(&init_mm, address, ptep,
				pfn_pte(pfn, PAGE_KERNEL));
		/* No tlb flush necessary because the caller already
		 * left the pte unmapped. */

	return 0;
}

struct page *m2p_find_override(unsigned long mfn)
{
	unsigned long flags;
	struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
	struct page *p, *ret;

	ret = NULL;

	spin_lock_irqsave(&m2p_override_lock, flags);

	list_for_each_entry(p, bucket, lru) {
		if (p->private == mfn) {
			ret = p;
			break;
		}
	}

	spin_unlock_irqrestore(&m2p_override_lock, flags);

	return ret;
}

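/*
 * Used on the reverse (machine-to-physical) lookup path: when an m2p
 * translation yields a pfn whose p2m entry does not map back to that
 * mfn, the caller (typically mfn_to_pfn() in asm/xen/page.h) asks the
 * override table for a better local pfn, falling back to the original
 * pfn when no override exists.
 */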
unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
	struct page *p = m2p_find_override(mfn);
	unsigned long ret = pfn;

	if (p)
		ret = page_to_pfn(p);

	return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
523