xref: /openbmc/linux/arch/x86/xen/setup.c (revision d699090510c3223641a23834b4710e2d4309a6ad)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Machine specific setup for xen
4  *
5  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
6  */
7 
8 #include <linux/init.h>
9 #include <linux/iscsi_ibft.h>
10 #include <linux/sched.h>
11 #include <linux/kstrtox.h>
12 #include <linux/mm.h>
13 #include <linux/pm.h>
14 #include <linux/memblock.h>
15 #include <linux/cpuidle.h>
16 #include <linux/cpufreq.h>
17 #include <linux/memory_hotplug.h>
18 #include <linux/acpi.h>
19 
20 #include <asm/elf.h>
21 #include <asm/vdso.h>
22 #include <asm/e820/api.h>
23 #include <asm/setup.h>
24 #include <asm/numa.h>
25 #include <asm/idtentry.h>
26 #include <asm/xen/hypervisor.h>
27 #include <asm/xen/hypercall.h>
28 
29 #include <xen/xen.h>
30 #include <xen/page.h>
31 #include <xen/interface/callback.h>
32 #include <xen/interface/memory.h>
33 #include <xen/interface/physdev.h>
34 #include <xen/features.h>
35 #include <xen/hvc-console.h>
36 #include "xen-ops.h"
37 #include "mmu.h"
38 
/* Convert a number of GiB into bytes; the cast makes the math 64 bit wide. */
39 #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
40 
41 /*
 * Memory map would allow PCI passthrough. Set by xen_memory_setup() as soon
 * as the E820 map contains a reserved region.
 */
42 bool xen_pv_pci_possible;
43 
44 /*
 * E820 map used during setting up memory. It is filled from the hypervisor
 * supplied memory map in xen_memory_setup() and adjusted afterwards.
 */
45 static struct e820_table xen_e820_table __initdata;
46 
47 /*
 * Number of initially usable memory pages: xen_start_info->nr_pages, capped
 * by xen_get_pages_limit().
 */
48 static unsigned long ini_nr_pages __initdata;
49 
50 /*
51  * Buffer used to remap identity mapped pages. We only need the virtual space.
52  * The physical page behind this address is remapped as needed to different
53  * buffer pages.
 *
 * REMAP_SIZE leaves room for the three leading words of the buffer
 * (next_area_mfn, target_pfn, size).
 * NOTE(review): P2M_PER_PAGE is presumably the number of unsigned long
 * entries per page, making the buffer exactly one page - confirm.
54  */
55 #define REMAP_SIZE	(P2M_PER_PAGE - 3)
56 static struct {
	/* MFN of the next remap info page, INVALID_P2M_ENTRY ends the list. */
57 	unsigned long	next_area_mfn;
	/* First PFN the MFNs listed below are to be remapped to. */
58 	unsigned long	target_pfn;
	/* Number of valid entries in mfns[]. */
59 	unsigned long	size;
	/* MFNs to remap, in target PFN order. */
60 	unsigned long	mfns[REMAP_SIZE];
61 } xen_remap_buf __initdata __aligned(PAGE_SIZE);
/* Head of the list of remap info pages, INVALID_P2M_ENTRY if empty. */
62 static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
63 
/*
 * Whether non-initial domains are limited to 512 GB of memory. The default
 * comes from CONFIG_XEN_512GB and can be changed via the command line, see
 * xen_parse_512gb().
 */
64 static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
65 
xen_parse_512gb(void)66 static void __init xen_parse_512gb(void)
67 {
68 	bool val = false;
69 	char *arg;
70 
71 	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
72 	if (!arg)
73 		return;
74 
75 	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
76 	if (!arg)
77 		val = true;
78 	else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val))
79 		return;
80 
81 	xen_512gb_limit = val;
82 }
83 
xen_del_extra_mem(unsigned long start_pfn,unsigned long n_pfns)84 static void __init xen_del_extra_mem(unsigned long start_pfn,
85 				     unsigned long n_pfns)
86 {
87 	int i;
88 	unsigned long start_r, size_r;
89 
90 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
91 		start_r = xen_extra_mem[i].start_pfn;
92 		size_r = xen_extra_mem[i].n_pfns;
93 
94 		/* Start of region. */
95 		if (start_r == start_pfn) {
96 			BUG_ON(n_pfns > size_r);
97 			xen_extra_mem[i].start_pfn += n_pfns;
98 			xen_extra_mem[i].n_pfns -= n_pfns;
99 			break;
100 		}
101 		/* End of region. */
102 		if (start_r + size_r == start_pfn + n_pfns) {
103 			BUG_ON(n_pfns > size_r);
104 			xen_extra_mem[i].n_pfns -= n_pfns;
105 			break;
106 		}
107 		/* Mid of region. */
108 		if (start_pfn > start_r && start_pfn < start_r + size_r) {
109 			BUG_ON(start_pfn + n_pfns > start_r + size_r);
110 			xen_extra_mem[i].n_pfns = start_pfn - start_r;
111 			/* Calling memblock_reserve() again is okay. */
112 			xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
113 					  (start_pfn + n_pfns));
114 			break;
115 		}
116 	}
117 	memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
118 }
119 
120 /*
121  * Called during boot before the p2m list can take entries beyond the
122  * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
123  * invalid.
124  */
xen_chk_extra_mem(unsigned long pfn)125 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
126 {
127 	int i;
128 
129 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
130 		if (pfn >= xen_extra_mem[i].start_pfn &&
131 		    pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
132 			return INVALID_P2M_ENTRY;
133 	}
134 
135 	return IDENTITY_FRAME(pfn);
136 }
137 
138 /*
139  * Mark all pfns of extra mem as invalid in p2m list.
140  */
xen_inv_extra_mem(void)141 void __init xen_inv_extra_mem(void)
142 {
143 	unsigned long pfn, pfn_s, pfn_e;
144 	int i;
145 
146 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
147 		if (!xen_extra_mem[i].n_pfns)
148 			continue;
149 		pfn_s = xen_extra_mem[i].start_pfn;
150 		pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
151 		for (pfn = pfn_s; pfn < pfn_e; pfn++)
152 			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
153 	}
154 }
155 
156 /*
157  * Finds the next RAM pfn available in the E820 map after min_pfn.
158  * This function updates min_pfn with the pfn found and returns
159  * the size of that range or zero if not found.
160  */
xen_find_pfn_range(unsigned long * min_pfn)161 static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
162 {
163 	const struct e820_entry *entry = xen_e820_table.entries;
164 	unsigned int i;
165 	unsigned long done = 0;
166 
167 	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
168 		unsigned long s_pfn;
169 		unsigned long e_pfn;
170 
171 		if (entry->type != E820_TYPE_RAM)
172 			continue;
173 
174 		e_pfn = PFN_DOWN(entry->addr + entry->size);
175 
176 		/* We only care about E820 after this */
177 		if (e_pfn <= *min_pfn)
178 			continue;
179 
180 		s_pfn = PFN_UP(entry->addr);
181 
182 		/* If min_pfn falls within the E820 entry, we want to start
183 		 * at the min_pfn PFN.
184 		 */
185 		if (s_pfn <= *min_pfn) {
186 			done = e_pfn - *min_pfn;
187 		} else {
188 			done = e_pfn - s_pfn;
189 			*min_pfn = s_pfn;
190 		}
191 		break;
192 	}
193 
194 	return done;
195 }
196 
xen_free_mfn(unsigned long mfn)197 static int __init xen_free_mfn(unsigned long mfn)
198 {
199 	struct xen_memory_reservation reservation = {
200 		.address_bits = 0,
201 		.extent_order = 0,
202 		.domid        = DOMID_SELF
203 	};
204 
205 	set_xen_guest_handle(reservation.extent_start, &mfn);
206 	reservation.nr_extents = 1;
207 
208 	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
209 }
210 
211 /*
212  * This releases a chunk of memory and then does the identity map. It's used
213  * as a fallback if the remapping fails.
214  */
xen_set_identity_and_release_chunk(unsigned long start_pfn,unsigned long end_pfn)215 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
216 						      unsigned long end_pfn)
217 {
218 	unsigned long pfn, end;
219 	int ret;
220 
221 	WARN_ON(start_pfn > end_pfn);
222 
223 	/* Release pages first. */
224 	end = min(end_pfn, ini_nr_pages);
225 	for (pfn = start_pfn; pfn < end; pfn++) {
226 		unsigned long mfn = pfn_to_mfn(pfn);
227 
228 		/* Make sure pfn exists to start with */
229 		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
230 			continue;
231 
232 		ret = xen_free_mfn(mfn);
233 		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
234 
235 		if (ret == 1) {
236 			xen_released_pages++;
237 			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
238 				break;
239 		} else
240 			break;
241 	}
242 
243 	set_phys_range_identity(start_pfn, end_pfn);
244 }
245 
246 /*
247  * Helper function to update the p2m and m2p tables and kernel mapping.
248  */
xen_update_mem_tables(unsigned long pfn,unsigned long mfn)249 static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
250 {
251 	struct mmu_update update = {
252 		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
253 		.val = pfn
254 	};
255 
256 	/* Update p2m */
257 	if (!set_phys_to_machine(pfn, mfn)) {
258 		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
259 		     pfn, mfn);
260 		BUG();
261 	}
262 
263 	/* Update m2p */
264 	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
265 		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
266 		     mfn, pfn);
267 		BUG();
268 	}
269 
270 	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
271 					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
272 		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
273 		      mfn, pfn);
274 		BUG();
275 	}
276 }
277 
278 /*
279  * This function updates the p2m and m2p tables with an identity map from
280  * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
281  * original allocation at remap_pfn. The information needed for remapping is
282  * saved in the memory itself to avoid the need for allocating buffers. The
283  * complete remap information is contained in a list of MFNs each containing
284  * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
285  * This enables us to preserve the original mfn sequence while doing the
286  * remapping at a time when the memory management is capable of allocating
287  * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
288  * its callers.
289  */
xen_do_set_identity_and_remap_chunk(unsigned long start_pfn,unsigned long size,unsigned long remap_pfn)290 static void __init xen_do_set_identity_and_remap_chunk(
291         unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
292 {
	/* Virtual address of the remap buffer; its backing frame is switched below. */
293 	unsigned long buf = (unsigned long)&xen_remap_buf;
294 	unsigned long mfn_save, mfn;
295 	unsigned long ident_pfn_iter, remap_pfn_iter;
296 	unsigned long ident_end_pfn = start_pfn + size;
	/* Number of pfns of the range not yet recorded. */
297 	unsigned long left = size;
298 	unsigned int i, chunk;
299 
300 	WARN_ON(size == 0);
301 
	/* Remember the frame currently backing the buffer for restoring it. */
302 	mfn_save = virt_to_mfn((void *)buf);
303 
	/*
	 * Walk the range in pieces of at most REMAP_SIZE pfns, the number of
	 * MFNs fitting into one remap info page.
	 */
304 	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
305 	     ident_pfn_iter < ident_end_pfn;
306 	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
307 		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
308 
309 		/* Map first pfn to xen_remap_buf */
		/*
		 * From here on writes to xen_remap_buf end up in the first
		 * page of the piece, which thus carries the remap information
		 * of the piece.
		 */
310 		mfn = pfn_to_mfn(ident_pfn_iter);
311 		set_pte_mfn(buf, mfn, PAGE_KERNEL);
312 
313 		/* Save mapping information in page */
314 		xen_remap_buf.next_area_mfn = xen_remap_mfn;
315 		xen_remap_buf.target_pfn = remap_pfn_iter;
316 		xen_remap_buf.size = chunk;
		/* mfns[0] is the info page itself, xen_remap_memory() checks that. */
317 		for (i = 0; i < chunk; i++)
318 			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
319 
320 		/* Put remap buf into list. */
321 		xen_remap_mfn = mfn;
322 
323 		/* Set identity map */
		/*
		 * NOTE(review): presumably this replaces the p2m entries read
		 * via pfn_to_mfn() above, so it has to stay behind saving the
		 * MFNs - confirm.
		 */
324 		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);
325 
326 		left -= chunk;
327 	}
328 
329 	/* Restore old xen_remap_buf mapping */
330 	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
331 }
332 
333 /*
334  * This function takes a contiguous pfn range that needs to be identity mapped
335  * and:
336  *
337  *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
338  *  2) Calls the do_ function to actually do the mapping/remapping work.
339  *
340  * The goal is to not allocate additional memory but to remap the existing
341  * pages. In the case of an error the underlying memory is simply released back
342  * to Xen and not remapped.
343  */
xen_set_identity_and_remap_chunk(unsigned long start_pfn,unsigned long end_pfn,unsigned long remap_pfn)344 static unsigned long __init xen_set_identity_and_remap_chunk(
345 	unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn)
346 {
347 	unsigned long pfn;
348 	unsigned long i = 0;
349 	unsigned long n = end_pfn - start_pfn;
350 
351 	if (remap_pfn == 0)
352 		remap_pfn = ini_nr_pages;
353 
354 	while (i < n) {
355 		unsigned long cur_pfn = start_pfn + i;
356 		unsigned long left = n - i;
357 		unsigned long size = left;
358 		unsigned long remap_range_size;
359 
360 		/* Do not remap pages beyond the current allocation */
361 		if (cur_pfn >= ini_nr_pages) {
362 			/* Identity map remaining pages */
363 			set_phys_range_identity(cur_pfn, cur_pfn + size);
364 			break;
365 		}
366 		if (cur_pfn + size > ini_nr_pages)
367 			size = ini_nr_pages - cur_pfn;
368 
369 		remap_range_size = xen_find_pfn_range(&remap_pfn);
370 		if (!remap_range_size) {
371 			pr_warn("Unable to find available pfn range, not remapping identity pages\n");
372 			xen_set_identity_and_release_chunk(cur_pfn,
373 							   cur_pfn + left);
374 			break;
375 		}
376 		/* Adjust size to fit in current e820 RAM region */
377 		if (size > remap_range_size)
378 			size = remap_range_size;
379 
380 		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
381 
382 		/* Update variables to reflect new mappings. */
383 		i += size;
384 		remap_pfn += size;
385 	}
386 
387 	/*
388 	 * If the PFNs are currently mapped, their VA mappings need to be
389 	 * zapped.
390 	 */
391 	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
392 		(void)HYPERVISOR_update_va_mapping(
393 			(unsigned long)__va(pfn << PAGE_SHIFT),
394 			native_make_pte(0), 0);
395 
396 	return remap_pfn;
397 }
398 
xen_count_remap_pages(unsigned long start_pfn,unsigned long end_pfn,unsigned long remap_pages)399 static unsigned long __init xen_count_remap_pages(
400 	unsigned long start_pfn, unsigned long end_pfn,
401 	unsigned long remap_pages)
402 {
403 	if (start_pfn >= ini_nr_pages)
404 		return remap_pages;
405 
406 	return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn;
407 }
408 
xen_foreach_remap_area(unsigned long (* func)(unsigned long start_pfn,unsigned long end_pfn,unsigned long last_val))409 static unsigned long __init xen_foreach_remap_area(
410 	unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
411 			      unsigned long last_val))
412 {
413 	phys_addr_t start = 0;
414 	unsigned long ret_val = 0;
415 	const struct e820_entry *entry = xen_e820_table.entries;
416 	int i;
417 
418 	/*
419 	 * Combine non-RAM regions and gaps until a RAM region (or the
420 	 * end of the map) is reached, then call the provided function
421 	 * to perform its duty on the non-RAM region.
422 	 *
423 	 * The combined non-RAM regions are rounded to a whole number
424 	 * of pages so any partial pages are accessible via the 1:1
425 	 * mapping.  This is needed for some BIOSes that put (for
426 	 * example) the DMI tables in a reserved region that begins on
427 	 * a non-page boundary.
428 	 */
429 	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
430 		phys_addr_t end = entry->addr + entry->size;
431 		if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) {
432 			unsigned long start_pfn = PFN_DOWN(start);
433 			unsigned long end_pfn = PFN_UP(end);
434 
435 			if (entry->type == E820_TYPE_RAM)
436 				end_pfn = PFN_UP(entry->addr);
437 
438 			if (start_pfn < end_pfn)
439 				ret_val = func(start_pfn, end_pfn, ret_val);
440 			start = end;
441 		}
442 	}
443 
444 	return ret_val;
445 }
446 
447 /*
448  * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
449  * The remap information (which mfn remap to which pfn) is contained in the
450  * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
451  * This scheme allows to remap the different chunks in arbitrary order while
452  * the resulting mapping will be independent from the order.
453  */
xen_remap_memory(void)454 void __init xen_remap_memory(void)
455 {
	/* Virtual address of the remap buffer; its backing frame is switched below. */
456 	unsigned long buf = (unsigned long)&xen_remap_buf;
457 	unsigned long mfn_save, pfn;
	/* Number of pages remapped, for the final log message only. */
458 	unsigned long remapped = 0;
459 	unsigned int i;
	/*
	 * Start (pfn_s) and length (len) of the contiguous range of remapped
	 * target pfns collected so far; pfn_s == ~0UL means no range yet.
	 */
460 	unsigned long pfn_s = ~0UL;
461 	unsigned long len = 0;
462 
	/* Remember the frame currently backing the buffer for restoring it. */
463 	mfn_save = virt_to_mfn((void *)buf);
464 
	/* Process the list of remap info pages until its end is reached. */
465 	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
466 		/* Map the remap information */
467 		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
468 
		/* The info page is always the first page recorded in itself. */
469 		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
470 
		/* Assign the recorded MFNs to consecutive target pfns. */
471 		pfn = xen_remap_buf.target_pfn;
472 		for (i = 0; i < xen_remap_buf.size; i++) {
473 			xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]);
474 			remapped++;
475 			pfn++;
476 		}
		/*
		 * pfn is now the first pfn behind this piece. Merge the piece
		 * into the collected range if it is the first one or directly
		 * in front of the range, or if it directly follows the range.
		 * Otherwise remove the collected range from the extra memory
		 * and start a new range with this piece.
		 */
477 		if (pfn_s == ~0UL || pfn == pfn_s) {
478 			pfn_s = xen_remap_buf.target_pfn;
479 			len += xen_remap_buf.size;
480 		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
481 			len += xen_remap_buf.size;
482 		} else {
483 			xen_del_extra_mem(pfn_s, len);
484 			pfn_s = xen_remap_buf.target_pfn;
485 			len = xen_remap_buf.size;
486 		}
		/* Advance to the next info page; the link is read via the buffer. */
487 		xen_remap_mfn = xen_remap_buf.next_area_mfn;
488 	}
489 
	/* Remove the last collected range from the extra memory, too. */
490 	if (pfn_s != ~0UL && len)
491 		xen_del_extra_mem(pfn_s, len);
492 
	/* Restore the original frame behind the remap buffer. */
493 	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
494 
495 	pr_info("Remapped %ld page(s)\n", remapped);
496 
497 	xen_do_remap_nonram();
498 }
499 
xen_get_pages_limit(void)500 static unsigned long __init xen_get_pages_limit(void)
501 {
502 	unsigned long limit;
503 
504 	limit = MAXMEM / PAGE_SIZE;
505 	if (!xen_initial_domain() && xen_512gb_limit)
506 		limit = GB(512) / PAGE_SIZE;
507 
508 	return limit;
509 }
510 
xen_get_max_pages(void)511 static unsigned long __init xen_get_max_pages(void)
512 {
513 	unsigned long max_pages, limit;
514 	domid_t domid = DOMID_SELF;
515 	long ret;
516 
517 	limit = xen_get_pages_limit();
518 	max_pages = limit;
519 
520 	/*
521 	 * For the initial domain we use the maximum reservation as
522 	 * the maximum page.
523 	 *
524 	 * For guest domains the current maximum reservation reflects
525 	 * the current maximum rather than the static maximum. In this
526 	 * case the e820 map provided to us will cover the static
527 	 * maximum region.
528 	 */
529 	if (xen_initial_domain()) {
530 		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
531 		if (ret > 0)
532 			max_pages = ret;
533 	}
534 
535 	return min(max_pages, limit);
536 }
537 
xen_align_and_add_e820_region(phys_addr_t start,phys_addr_t size,int type)538 static void __init xen_align_and_add_e820_region(phys_addr_t start,
539 						 phys_addr_t size, int type)
540 {
541 	phys_addr_t end = start + size;
542 
543 	/* Align RAM regions to page boundaries. */
544 	if (type == E820_TYPE_RAM) {
545 		start = PAGE_ALIGN(start);
546 		end &= ~((phys_addr_t)PAGE_SIZE - 1);
547 #ifdef CONFIG_MEMORY_HOTPLUG
548 		/*
549 		 * Don't allow adding memory not in E820 map while booting the
550 		 * system. Once the balloon driver is up it will remove that
551 		 * restriction again.
552 		 */
553 		max_mem_size = end;
554 #endif
555 	}
556 
557 	e820__range_add(start, end - start, type);
558 }
559 
xen_ignore_unusable(void)560 static void __init xen_ignore_unusable(void)
561 {
562 	struct e820_entry *entry = xen_e820_table.entries;
563 	unsigned int i;
564 
565 	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
566 		if (entry->type == E820_TYPE_UNUSABLE)
567 			entry->type = E820_TYPE_RAM;
568 	}
569 }
570 
xen_is_e820_reserved(phys_addr_t start,phys_addr_t size)571 static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
572 {
573 	struct e820_entry *entry;
574 	unsigned mapcnt;
575 	phys_addr_t end;
576 
577 	if (!size)
578 		return false;
579 
580 	end = start + size;
581 	entry = xen_e820_table.entries;
582 
583 	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
584 		if (entry->type == E820_TYPE_RAM && entry->addr <= start &&
585 		    (entry->addr + entry->size) >= end)
586 			return false;
587 
588 		entry++;
589 	}
590 
591 	return true;
592 }
593 
594 /*
595  * Find a free area in physical memory not yet reserved and compliant with
596  * E820 map.
597  * Used to relocate pre-allocated areas like initrd or p2m list which are in
598  * conflict with the to be used E820 map.
599  * In case no area is found, return 0. Otherwise return the physical address
600  * of the area which is already reserved for convenience.
601  */
xen_find_free_area(phys_addr_t size)602 phys_addr_t __init xen_find_free_area(phys_addr_t size)
603 {
604 	unsigned mapcnt;
605 	phys_addr_t addr, start;
606 	struct e820_entry *entry = xen_e820_table.entries;
607 
608 	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
609 		if (entry->type != E820_TYPE_RAM || entry->size < size)
610 			continue;
611 		start = entry->addr;
612 		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
613 			if (!memblock_is_reserved(addr))
614 				continue;
615 			start = addr + PAGE_SIZE;
616 			if (start + size > entry->addr + entry->size)
617 				break;
618 		}
619 		if (addr >= start + size) {
620 			memblock_reserve(start, size);
621 			return start;
622 		}
623 	}
624 
625 	return 0;
626 }
627 
628 /*
629  * Swap a non-RAM E820 map entry with RAM above ini_nr_pages.
630  * Note that the E820 map is modified accordingly, but the P2M map isn't yet.
631  * The adaption of the P2M must be deferred until page allocation is possible.
632  */
xen_e820_swap_entry_with_ram(struct e820_entry * swap_entry)633 static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry)
634 {
635 	struct e820_entry *entry;
636 	unsigned int mapcnt;
637 	phys_addr_t mem_end = PFN_PHYS(ini_nr_pages);
638 	phys_addr_t swap_addr, swap_size, entry_end;
639 
640 	swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr);
641 	swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size);
642 	entry = xen_e820_table.entries;
643 
644 	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
645 		entry_end = entry->addr + entry->size;
646 		if (entry->type == E820_TYPE_RAM && entry->size >= swap_size &&
647 		    entry_end - swap_size >= mem_end) {
648 			/* Reduce RAM entry by needed space (whole pages). */
649 			entry->size -= swap_size;
650 
651 			/* Add new entry at the end of E820 map. */
652 			entry = xen_e820_table.entries +
653 				xen_e820_table.nr_entries;
654 			xen_e820_table.nr_entries++;
655 
656 			/* Fill new entry (keep size and page offset). */
657 			entry->type = swap_entry->type;
658 			entry->addr = entry_end - swap_size +
659 				      swap_addr - swap_entry->addr;
660 			entry->size = swap_entry->size;
661 
662 			/* Convert old entry to RAM, align to pages. */
663 			swap_entry->type = E820_TYPE_RAM;
664 			swap_entry->addr = swap_addr;
665 			swap_entry->size = swap_size;
666 
667 			/* Remember PFN<->MFN relation for P2M update. */
668 			xen_add_remap_nonram(swap_addr, entry_end - swap_size,
669 					     swap_size);
670 
671 			/* Order E820 table and merge entries. */
672 			e820__update_table(&xen_e820_table);
673 
674 			return;
675 		}
676 
677 		entry++;
678 	}
679 
680 	xen_raw_console_write("No suitable area found for required E820 entry remapping action\n");
681 	BUG();
682 }
683 
684 /*
685  * Look for non-RAM memory types in a specific guest physical area and move
686  * those away if possible (ACPI NVS only for now).
687  */
xen_e820_resolve_conflicts(phys_addr_t start,phys_addr_t size)688 static void __init xen_e820_resolve_conflicts(phys_addr_t start,
689 					      phys_addr_t size)
690 {
691 	struct e820_entry *entry;
692 	unsigned int mapcnt;
693 	phys_addr_t end;
694 
695 	if (!size)
696 		return;
697 
698 	end = start + size;
699 	entry = xen_e820_table.entries;
700 
701 	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
702 		if (entry->addr >= end)
703 			return;
704 
705 		if (entry->addr + entry->size > start &&
706 		    entry->type == E820_TYPE_NVS)
707 			xen_e820_swap_entry_with_ram(entry);
708 
709 		entry++;
710 	}
711 }
712 
713 /*
714  * Check for an area in physical memory to be usable for non-movable purposes.
715  * An area is considered to usable if the used E820 map lists it to be RAM or
716  * some other type which can be moved to higher PFNs while keeping the MFNs.
717  * In case the area is not usable, crash the system with an error message.
718  */
xen_chk_is_e820_usable(phys_addr_t start,phys_addr_t size,const char * component)719 void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size,
720 				   const char *component)
721 {
722 	xen_e820_resolve_conflicts(start, size);
723 
724 	if (!xen_is_e820_reserved(start, size))
725 		return;
726 
727 	xen_raw_console_write("Xen hypervisor allocated ");
728 	xen_raw_console_write(component);
729 	xen_raw_console_write(" memory conflicts with E820 map\n");
730 	BUG();
731 }
732 
733 /*
734  * Like memcpy, but with physical addresses for dest and src.
735  */
xen_phys_memcpy(phys_addr_t dest,phys_addr_t src,phys_addr_t n)736 static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
737 				   phys_addr_t n)
738 {
739 	phys_addr_t dest_off, src_off, dest_len, src_len, len;
740 	void *from, *to;
741 
742 	while (n) {
743 		dest_off = dest & ~PAGE_MASK;
744 		src_off = src & ~PAGE_MASK;
745 		dest_len = n;
746 		if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
747 			dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
748 		src_len = n;
749 		if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
750 			src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
751 		len = min(dest_len, src_len);
752 		to = early_memremap(dest - dest_off, dest_len + dest_off);
753 		from = early_memremap(src - src_off, src_len + src_off);
754 		memcpy(to, from, len);
755 		early_memunmap(to, dest_len + dest_off);
756 		early_memunmap(from, src_len + src_off);
757 		n -= len;
758 		dest += len;
759 		src += len;
760 	}
761 }
762 
763 /*
764  * Reserve Xen mfn_list.
765  */
xen_reserve_xen_mfnlist(void)766 static void __init xen_reserve_xen_mfnlist(void)
767 {
768 	phys_addr_t start, size;
769 
770 	if (xen_start_info->mfn_list >= __START_KERNEL_map) {
771 		start = __pa(xen_start_info->mfn_list);
772 		size = PFN_ALIGN(xen_start_info->nr_pages *
773 				 sizeof(unsigned long));
774 	} else {
775 		start = PFN_PHYS(xen_start_info->first_p2m_pfn);
776 		size = PFN_PHYS(xen_start_info->nr_p2m_frames);
777 	}
778 
779 	memblock_reserve(start, size);
780 	if (!xen_is_e820_reserved(start, size))
781 		return;
782 
783 	xen_relocate_p2m();
784 	memblock_phys_free(start, size);
785 }
786 
787 /**
788  * xen_memory_setup - Hook for machine specific memory setup.
789  **/
xen_memory_setup(void)790 char * __init xen_memory_setup(void)
791 {
792 	unsigned long pfn_s, n_pfns;
793 	phys_addr_t mem_end, addr, size, chunk_size;
794 	u32 type;
795 	int rc;
796 	struct xen_memory_map memmap;
797 	unsigned long max_pages;
798 	unsigned long extra_pages = 0;
799 	unsigned long maxmem_pages;
800 	int i;
801 	int op;
802 
803 	xen_parse_512gb();
804 	ini_nr_pages = min(xen_get_pages_limit(), xen_start_info->nr_pages);
805 	mem_end = PFN_PHYS(ini_nr_pages);
806 
807 	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
808 	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
809 
810 #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
811 	xen_saved_max_mem_size = max_mem_size;
812 #endif
813 
814 	op = xen_initial_domain() ?
815 		XENMEM_machine_memory_map :
816 		XENMEM_memory_map;
817 	rc = HYPERVISOR_memory_op(op, &memmap);
818 	if (rc == -ENOSYS) {
819 		BUG_ON(xen_initial_domain());
820 		memmap.nr_entries = 1;
821 		xen_e820_table.entries[0].addr = 0ULL;
822 		xen_e820_table.entries[0].size = mem_end;
823 		/* 8MB slack (to balance backend allocations). */
824 		xen_e820_table.entries[0].size += 8ULL << 20;
825 		xen_e820_table.entries[0].type = E820_TYPE_RAM;
826 		rc = 0;
827 	}
828 	BUG_ON(rc);
829 	BUG_ON(memmap.nr_entries == 0);
830 	xen_e820_table.nr_entries = memmap.nr_entries;
831 
832 	if (xen_initial_domain()) {
833 		/*
834 		 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
835 		 * regions, so if we're using the machine memory map leave the
836 		 * region as RAM as it is in the pseudo-physical map.
837 		 *
838 		 * UNUSABLE regions in domUs are not handled and will need
839 		 * a patch in the future.
840 		 */
841 		xen_ignore_unusable();
842 
843 #ifdef CONFIG_ISCSI_IBFT_FIND
844 		/* Reserve 0.5 MiB to 1 MiB region so iBFT can be found */
845 		xen_e820_table.entries[xen_e820_table.nr_entries].addr = IBFT_START;
846 		xen_e820_table.entries[xen_e820_table.nr_entries].size = IBFT_END - IBFT_START;
847 		xen_e820_table.entries[xen_e820_table.nr_entries].type = E820_TYPE_RESERVED;
848 		xen_e820_table.nr_entries++;
849 #endif
850 	}
851 
852 	/* Make sure the Xen-supplied memory map is well-ordered. */
853 	e820__update_table(&xen_e820_table);
854 
855 	/*
856 	 * Check whether the kernel itself conflicts with the target E820 map.
857 	 * Failing now is better than running into weird problems later due
858 	 * to relocating (and even reusing) pages with kernel text or data.
859 	 */
860 	xen_chk_is_e820_usable(__pa_symbol(_text),
861 			       __pa_symbol(_end) - __pa_symbol(_text),
862 			       "kernel");
863 
864 	/*
865 	 * Check for a conflict of the xen_start_info memory with the target
866 	 * E820 map.
867 	 */
868 	xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info),
869 			       "xen_start_info");
870 
871 	/*
872 	 * Check for a conflict of the hypervisor supplied page tables with
873 	 * the target E820 map.
874 	 */
875 	xen_pt_check_e820();
876 
877 	max_pages = xen_get_max_pages();
878 
879 	/* How many extra pages do we need due to remapping? */
880 	max_pages += xen_foreach_remap_area(xen_count_remap_pages);
881 
882 	if (max_pages > ini_nr_pages)
883 		extra_pages += max_pages - ini_nr_pages;
884 
885 	/*
886 	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
887 	 * factor the base size.
888 	 *
889 	 * Make sure we have no memory above max_pages, as this area
890 	 * isn't handled by the p2m management.
891 	 */
892 	maxmem_pages = EXTRA_MEM_RATIO * min(ini_nr_pages, PFN_DOWN(MAXMEM));
893 	extra_pages = min3(maxmem_pages, extra_pages, max_pages - ini_nr_pages);
894 	i = 0;
895 	addr = xen_e820_table.entries[0].addr;
896 	size = xen_e820_table.entries[0].size;
897 	while (i < xen_e820_table.nr_entries) {
898 		bool discard = false;
899 
900 		chunk_size = size;
901 		type = xen_e820_table.entries[i].type;
902 
903 		if (type == E820_TYPE_RESERVED)
904 			xen_pv_pci_possible = true;
905 
906 		if (type == E820_TYPE_RAM) {
907 			if (addr < mem_end) {
908 				chunk_size = min(size, mem_end - addr);
909 			} else if (extra_pages) {
910 				chunk_size = min(size, PFN_PHYS(extra_pages));
911 				pfn_s = PFN_UP(addr);
912 				n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
913 				extra_pages -= n_pfns;
914 				xen_add_extra_mem(pfn_s, n_pfns);
915 				xen_max_p2m_pfn = pfn_s + n_pfns;
916 			} else
917 				discard = true;
918 		}
919 
920 		if (!discard)
921 			xen_align_and_add_e820_region(addr, chunk_size, type);
922 
923 		addr += chunk_size;
924 		size -= chunk_size;
925 		if (size == 0) {
926 			i++;
927 			if (i < xen_e820_table.nr_entries) {
928 				addr = xen_e820_table.entries[i].addr;
929 				size = xen_e820_table.entries[i].size;
930 			}
931 		}
932 	}
933 
934 	/*
935 	 * Set the rest as identity mapped, in case PCI BARs are
936 	 * located here.
937 	 */
938 	set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
939 
940 	/*
941 	 * In domU, the ISA region is normal, usable memory, but we
942 	 * reserve ISA memory anyway because too many things poke
943 	 * about in there.
944 	 */
945 	e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);
946 
947 	e820__update_table(e820_table);
948 
949 	xen_reserve_xen_mfnlist();
950 
951 	/* Check for a conflict of the initrd with the target E820 map. */
952 	if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
953 				 boot_params.hdr.ramdisk_size)) {
954 		phys_addr_t new_area, start, size;
955 
956 		new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
957 		if (!new_area) {
958 			xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
959 			BUG();
960 		}
961 
962 		start = boot_params.hdr.ramdisk_image;
963 		size = boot_params.hdr.ramdisk_size;
964 		xen_phys_memcpy(new_area, start, size);
965 		pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
966 			start, start + size, new_area, new_area + size);
967 		memblock_phys_free(start, size);
968 		boot_params.hdr.ramdisk_image = new_area;
969 		boot_params.ext_ramdisk_image = new_area >> 32;
970 	}
971 
972 	/*
973 	 * Set identity map on non-RAM pages and prepare remapping the
974 	 * underlying RAM.
975 	 */
976 	xen_foreach_remap_area(xen_set_identity_and_remap_chunk);
977 
978 	pr_info("Released %ld page(s)\n", xen_released_pages);
979 
980 	return "Xen";
981 }
982 
register_callback(unsigned type,const void * func)983 static int register_callback(unsigned type, const void *func)
984 {
985 	struct callback_register callback = {
986 		.type = type,
987 		.address = XEN_CALLBACK(__KERNEL_CS, func),
988 		.flags = CALLBACKF_mask_events,
989 	};
990 
991 	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
992 }
993 
xen_enable_sysenter(void)994 void xen_enable_sysenter(void)
995 {
996 	if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) &&
997 	    register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat))
998 		setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
999 }
1000 
xen_enable_syscall(void)1001 void xen_enable_syscall(void)
1002 {
1003 	int ret;
1004 
1005 	ret = register_callback(CALLBACKTYPE_syscall, xen_entry_SYSCALL_64);
1006 	if (ret != 0) {
1007 		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
1008 		/* Pretty fatal; 64-bit userspace has no other
1009 		   mechanism for syscalls. */
1010 	}
1011 
1012 	if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) &&
1013 	    register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat))
1014 		setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
1015 }
1016 
xen_pvmmu_arch_setup(void)1017 static void __init xen_pvmmu_arch_setup(void)
1018 {
1019 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
1020 
1021 	if (register_callback(CALLBACKTYPE_event,
1022 			      xen_asm_exc_xen_hypervisor_callback) ||
1023 	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
1024 		BUG();
1025 
1026 	xen_enable_sysenter();
1027 	xen_enable_syscall();
1028 }
1029 
1030 /* This function is not called for HVM domains */
xen_arch_setup(void)1031 void __init xen_arch_setup(void)
1032 {
1033 	xen_panic_handler_init();
1034 	xen_pvmmu_arch_setup();
1035 
1036 #ifdef CONFIG_ACPI
1037 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
1038 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
1039 		disable_acpi();
1040 	}
1041 #endif
1042 
1043 	memcpy(boot_command_line, xen_start_info->cmd_line,
1044 	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
1045 	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
1046 
1047 	/* Set up idle, making sure it calls safe_halt() pvop */
1048 	disable_cpuidle();
1049 	disable_cpufreq();
1050 	WARN_ON(xen_set_default_idle());
1051 #ifdef CONFIG_NUMA
1052 	numa_off = 1;
1053 #endif
1054 }
1055