xref: /openbmc/linux/arch/x86/xen/setup.c (revision fa0dadde)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Machine specific setup for xen
4  *
5  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
6  */
7 
8 #include <linux/init.h>
9 #include <linux/sched.h>
10 #include <linux/kstrtox.h>
11 #include <linux/mm.h>
12 #include <linux/pm.h>
13 #include <linux/memblock.h>
14 #include <linux/cpuidle.h>
15 #include <linux/cpufreq.h>
16 #include <linux/memory_hotplug.h>
17 
18 #include <asm/elf.h>
19 #include <asm/vdso.h>
20 #include <asm/e820/api.h>
21 #include <asm/setup.h>
22 #include <asm/acpi.h>
23 #include <asm/numa.h>
24 #include <asm/idtentry.h>
25 #include <asm/xen/hypervisor.h>
26 #include <asm/xen/hypercall.h>
27 
28 #include <xen/xen.h>
29 #include <xen/page.h>
30 #include <xen/interface/callback.h>
31 #include <xen/interface/memory.h>
32 #include <xen/interface/physdev.h>
33 #include <xen/features.h>
34 #include <xen/hvc-console.h>
35 #include "xen-ops.h"
36 #include "mmu.h"
37 
/* Convert a number of GiB into bytes; evaluated in 64 bit (1 GiB == 2^30). */
#define GB(x) ((uint64_t)(x) << 30)
39 
/*
 * Extra memory regions we add to the e820 ranges. A slot whose n_pfns is 0
 * is regarded as unused (see xen_add_extra_mem()).
 */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/*
 * Number of pages released from the initial allocation; incremented for
 * every page handed back in xen_set_identity_and_release_chunk().
 */
unsigned long xen_released_pages;

/*
 * E820 map used during setting up memory; filled from the hypervisor
 * supplied memory map in xen_memory_setup().
 */
static struct e820_table xen_e820_table __initdata;
48 
/*
 * Buffer used to remap identity mapped pages. We only need the virtual space.
 * The physical page behind this address is remapped as needed to different
 * buffer pages.
 *
 * REMAP_SIZE is the number of MFN slots per buffer page: P2M_PER_PAGE minus
 * the three header words of the structure below (presumably chosen so the
 * structure fills exactly one page - P2M_PER_PAGE is defined elsewhere).
 */
#define REMAP_SIZE	(P2M_PER_PAGE - 3)
static struct {
	/* MFN of the next remap info page, INVALID_P2M_ENTRY at list end. */
	unsigned long	next_area_mfn;
	/* First PFN the MFNs stored in mfns[] are to be remapped to. */
	unsigned long	target_pfn;
	/* Number of valid entries in mfns[]. */
	unsigned long	size;
	/* MFNs to remap; mfns[0] is the page holding this information. */
	unsigned long	mfns[REMAP_SIZE];
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
/* Anchor of the list of remap info pages; INVALID_P2M_ENTRY if empty. */
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
62 
/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

/*
 * Whether memory is limited to 512GB. Defaults to the CONFIG_XEN_512GB
 * setting and can be overridden by "xen_512gb_limit" on the command line
 * (see xen_parse_512gb()). Only honoured for non-initial domains (see
 * xen_get_pages_limit()).
 */
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
76 
77 static void __init xen_parse_512gb(void)
78 {
79 	bool val = false;
80 	char *arg;
81 
82 	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
83 	if (!arg)
84 		return;
85 
86 	arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
87 	if (!arg)
88 		val = true;
89 	else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val))
90 		return;
91 
92 	xen_512gb_limit = val;
93 }
94 
95 static void __init xen_add_extra_mem(unsigned long start_pfn,
96 				     unsigned long n_pfns)
97 {
98 	int i;
99 
100 	/*
101 	 * No need to check for zero size, should happen rarely and will only
102 	 * write a new entry regarded to be unused due to zero size.
103 	 */
104 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
105 		/* Add new region. */
106 		if (xen_extra_mem[i].n_pfns == 0) {
107 			xen_extra_mem[i].start_pfn = start_pfn;
108 			xen_extra_mem[i].n_pfns = n_pfns;
109 			break;
110 		}
111 		/* Append to existing region. */
112 		if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
113 		    start_pfn) {
114 			xen_extra_mem[i].n_pfns += n_pfns;
115 			break;
116 		}
117 	}
118 	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
119 		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
120 
121 	memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
122 }
123 
124 static void __init xen_del_extra_mem(unsigned long start_pfn,
125 				     unsigned long n_pfns)
126 {
127 	int i;
128 	unsigned long start_r, size_r;
129 
130 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
131 		start_r = xen_extra_mem[i].start_pfn;
132 		size_r = xen_extra_mem[i].n_pfns;
133 
134 		/* Start of region. */
135 		if (start_r == start_pfn) {
136 			BUG_ON(n_pfns > size_r);
137 			xen_extra_mem[i].start_pfn += n_pfns;
138 			xen_extra_mem[i].n_pfns -= n_pfns;
139 			break;
140 		}
141 		/* End of region. */
142 		if (start_r + size_r == start_pfn + n_pfns) {
143 			BUG_ON(n_pfns > size_r);
144 			xen_extra_mem[i].n_pfns -= n_pfns;
145 			break;
146 		}
147 		/* Mid of region. */
148 		if (start_pfn > start_r && start_pfn < start_r + size_r) {
149 			BUG_ON(start_pfn + n_pfns > start_r + size_r);
150 			xen_extra_mem[i].n_pfns = start_pfn - start_r;
151 			/* Calling memblock_reserve() again is okay. */
152 			xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
153 					  (start_pfn + n_pfns));
154 			break;
155 		}
156 	}
157 	memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
158 }
159 
160 /*
161  * Called during boot before the p2m list can take entries beyond the
162  * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
163  * invalid.
164  */
165 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
166 {
167 	int i;
168 
169 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
170 		if (pfn >= xen_extra_mem[i].start_pfn &&
171 		    pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
172 			return INVALID_P2M_ENTRY;
173 	}
174 
175 	return IDENTITY_FRAME(pfn);
176 }
177 
178 /*
179  * Mark all pfns of extra mem as invalid in p2m list.
180  */
181 void __init xen_inv_extra_mem(void)
182 {
183 	unsigned long pfn, pfn_s, pfn_e;
184 	int i;
185 
186 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
187 		if (!xen_extra_mem[i].n_pfns)
188 			continue;
189 		pfn_s = xen_extra_mem[i].start_pfn;
190 		pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
191 		for (pfn = pfn_s; pfn < pfn_e; pfn++)
192 			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
193 	}
194 }
195 
196 /*
197  * Finds the next RAM pfn available in the E820 map after min_pfn.
198  * This function updates min_pfn with the pfn found and returns
199  * the size of that range or zero if not found.
200  */
201 static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
202 {
203 	const struct e820_entry *entry = xen_e820_table.entries;
204 	unsigned int i;
205 	unsigned long done = 0;
206 
207 	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
208 		unsigned long s_pfn;
209 		unsigned long e_pfn;
210 
211 		if (entry->type != E820_TYPE_RAM)
212 			continue;
213 
214 		e_pfn = PFN_DOWN(entry->addr + entry->size);
215 
216 		/* We only care about E820 after this */
217 		if (e_pfn <= *min_pfn)
218 			continue;
219 
220 		s_pfn = PFN_UP(entry->addr);
221 
222 		/* If min_pfn falls within the E820 entry, we want to start
223 		 * at the min_pfn PFN.
224 		 */
225 		if (s_pfn <= *min_pfn) {
226 			done = e_pfn - *min_pfn;
227 		} else {
228 			done = e_pfn - s_pfn;
229 			*min_pfn = s_pfn;
230 		}
231 		break;
232 	}
233 
234 	return done;
235 }
236 
237 static int __init xen_free_mfn(unsigned long mfn)
238 {
239 	struct xen_memory_reservation reservation = {
240 		.address_bits = 0,
241 		.extent_order = 0,
242 		.domid        = DOMID_SELF
243 	};
244 
245 	set_xen_guest_handle(reservation.extent_start, &mfn);
246 	reservation.nr_extents = 1;
247 
248 	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
249 }
250 
251 /*
252  * This releases a chunk of memory and then does the identity map. It's used
253  * as a fallback if the remapping fails.
254  */
255 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
256 			unsigned long end_pfn, unsigned long nr_pages)
257 {
258 	unsigned long pfn, end;
259 	int ret;
260 
261 	WARN_ON(start_pfn > end_pfn);
262 
263 	/* Release pages first. */
264 	end = min(end_pfn, nr_pages);
265 	for (pfn = start_pfn; pfn < end; pfn++) {
266 		unsigned long mfn = pfn_to_mfn(pfn);
267 
268 		/* Make sure pfn exists to start with */
269 		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
270 			continue;
271 
272 		ret = xen_free_mfn(mfn);
273 		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
274 
275 		if (ret == 1) {
276 			xen_released_pages++;
277 			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
278 				break;
279 		} else
280 			break;
281 	}
282 
283 	set_phys_range_identity(start_pfn, end_pfn);
284 }
285 
286 /*
287  * Helper function to update the p2m and m2p tables and kernel mapping.
288  */
289 static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
290 {
291 	struct mmu_update update = {
292 		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
293 		.val = pfn
294 	};
295 
296 	/* Update p2m */
297 	if (!set_phys_to_machine(pfn, mfn)) {
298 		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
299 		     pfn, mfn);
300 		BUG();
301 	}
302 
303 	/* Update m2p */
304 	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
305 		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
306 		     mfn, pfn);
307 		BUG();
308 	}
309 
310 	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
311 					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
312 		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
313 		      mfn, pfn);
314 		BUG();
315 	}
316 }
317 
318 /*
319  * This function updates the p2m and m2p tables with an identity map from
320  * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
321  * original allocation at remap_pfn. The information needed for remapping is
322  * saved in the memory itself to avoid the need for allocating buffers. The
323  * complete remap information is contained in a list of MFNs each containing
324  * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
325  * This enables us to preserve the original mfn sequence while doing the
326  * remapping at a time when the memory management is capable of allocating
327  * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
328  * its callers.
329  */
330 static void __init xen_do_set_identity_and_remap_chunk(
331         unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
332 {
333 	unsigned long buf = (unsigned long)&xen_remap_buf;
334 	unsigned long mfn_save, mfn;
335 	unsigned long ident_pfn_iter, remap_pfn_iter;
336 	unsigned long ident_end_pfn = start_pfn + size;
337 	unsigned long left = size;
338 	unsigned int i, chunk;
339 
340 	WARN_ON(size == 0);
341 
342 	mfn_save = virt_to_mfn(buf);
343 
344 	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
345 	     ident_pfn_iter < ident_end_pfn;
346 	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
347 		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
348 
349 		/* Map first pfn to xen_remap_buf */
350 		mfn = pfn_to_mfn(ident_pfn_iter);
351 		set_pte_mfn(buf, mfn, PAGE_KERNEL);
352 
353 		/* Save mapping information in page */
354 		xen_remap_buf.next_area_mfn = xen_remap_mfn;
355 		xen_remap_buf.target_pfn = remap_pfn_iter;
356 		xen_remap_buf.size = chunk;
357 		for (i = 0; i < chunk; i++)
358 			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
359 
360 		/* Put remap buf into list. */
361 		xen_remap_mfn = mfn;
362 
363 		/* Set identity map */
364 		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);
365 
366 		left -= chunk;
367 	}
368 
369 	/* Restore old xen_remap_buf mapping */
370 	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
371 }
372 
373 /*
374  * This function takes a contiguous pfn range that needs to be identity mapped
375  * and:
376  *
377  *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
378  *  2) Calls the do_ function to actually do the mapping/remapping work.
379  *
380  * The goal is to not allocate additional memory but to remap the existing
381  * pages. In the case of an error the underlying memory is simply released back
382  * to Xen and not remapped.
383  */
384 static unsigned long __init xen_set_identity_and_remap_chunk(
385 	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
386 	unsigned long remap_pfn)
387 {
388 	unsigned long pfn;
389 	unsigned long i = 0;
390 	unsigned long n = end_pfn - start_pfn;
391 
392 	if (remap_pfn == 0)
393 		remap_pfn = nr_pages;
394 
395 	while (i < n) {
396 		unsigned long cur_pfn = start_pfn + i;
397 		unsigned long left = n - i;
398 		unsigned long size = left;
399 		unsigned long remap_range_size;
400 
401 		/* Do not remap pages beyond the current allocation */
402 		if (cur_pfn >= nr_pages) {
403 			/* Identity map remaining pages */
404 			set_phys_range_identity(cur_pfn, cur_pfn + size);
405 			break;
406 		}
407 		if (cur_pfn + size > nr_pages)
408 			size = nr_pages - cur_pfn;
409 
410 		remap_range_size = xen_find_pfn_range(&remap_pfn);
411 		if (!remap_range_size) {
412 			pr_warn("Unable to find available pfn range, not remapping identity pages\n");
413 			xen_set_identity_and_release_chunk(cur_pfn,
414 						cur_pfn + left, nr_pages);
415 			break;
416 		}
417 		/* Adjust size to fit in current e820 RAM region */
418 		if (size > remap_range_size)
419 			size = remap_range_size;
420 
421 		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
422 
423 		/* Update variables to reflect new mappings. */
424 		i += size;
425 		remap_pfn += size;
426 	}
427 
428 	/*
429 	 * If the PFNs are currently mapped, their VA mappings need to be
430 	 * zapped.
431 	 */
432 	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
433 		(void)HYPERVISOR_update_va_mapping(
434 			(unsigned long)__va(pfn << PAGE_SHIFT),
435 			native_make_pte(0), 0);
436 
437 	return remap_pfn;
438 }
439 
440 static unsigned long __init xen_count_remap_pages(
441 	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
442 	unsigned long remap_pages)
443 {
444 	if (start_pfn >= nr_pages)
445 		return remap_pages;
446 
447 	return remap_pages + min(end_pfn, nr_pages) - start_pfn;
448 }
449 
450 static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
451 	unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
452 			      unsigned long nr_pages, unsigned long last_val))
453 {
454 	phys_addr_t start = 0;
455 	unsigned long ret_val = 0;
456 	const struct e820_entry *entry = xen_e820_table.entries;
457 	int i;
458 
459 	/*
460 	 * Combine non-RAM regions and gaps until a RAM region (or the
461 	 * end of the map) is reached, then call the provided function
462 	 * to perform its duty on the non-RAM region.
463 	 *
464 	 * The combined non-RAM regions are rounded to a whole number
465 	 * of pages so any partial pages are accessible via the 1:1
466 	 * mapping.  This is needed for some BIOSes that put (for
467 	 * example) the DMI tables in a reserved region that begins on
468 	 * a non-page boundary.
469 	 */
470 	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
471 		phys_addr_t end = entry->addr + entry->size;
472 		if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) {
473 			unsigned long start_pfn = PFN_DOWN(start);
474 			unsigned long end_pfn = PFN_UP(end);
475 
476 			if (entry->type == E820_TYPE_RAM)
477 				end_pfn = PFN_UP(entry->addr);
478 
479 			if (start_pfn < end_pfn)
480 				ret_val = func(start_pfn, end_pfn, nr_pages,
481 					       ret_val);
482 			start = end;
483 		}
484 	}
485 
486 	return ret_val;
487 }
488 
/*
 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
 * The remap information (which mfn is to be remapped to which pfn) is
 * contained in the to be remapped memory itself, in a linked list anchored
 * at xen_remap_mfn.
 * This scheme allows to remap the different chunks in arbitrary order while
 * the resulting mapping will be independent from the order.
 *
 * The target pfn ranges are removed from the extra memory regions via
 * xen_del_extra_mem(). Ranges of consecutively processed chunks which are
 * adjacent to each other are merged first, so one call covers all of them.
 */
void __init xen_remap_memory(void)
{
	unsigned long buf = (unsigned long)&xen_remap_buf;
	unsigned long mfn_save, pfn;
	unsigned long remapped = 0;
	unsigned int i;
	/* Pending target range not yet passed to xen_del_extra_mem(). */
	unsigned long pfn_s = ~0UL;	/* start pfn, ~0UL: no range yet */
	unsigned long len = 0;		/* number of pages */

	/* Remember the page mapped at xen_remap_buf to restore it later. */
	mfn_save = virt_to_mfn(buf);

	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
		/* Map the remap information */
		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);

		/* The info page is always the first page of its chunk. */
		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);

		pfn = xen_remap_buf.target_pfn;
		for (i = 0; i < xen_remap_buf.size; i++) {
			xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]);
			remapped++;
			pfn++;
		}
		/* pfn is now the first pfn behind this chunk's target range. */
		if (pfn_s == ~0UL || pfn == pfn_s) {
			/* First chunk, or chunk ends where the range starts. */
			pfn_s = xen_remap_buf.target_pfn;
			len += xen_remap_buf.size;
		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
			/* Chunk starts where the pending range ends. */
			len += xen_remap_buf.size;
		} else {
			/* Not adjacent: flush the range and start a new one. */
			xen_del_extra_mem(pfn_s, len);
			pfn_s = xen_remap_buf.target_pfn;
			len = xen_remap_buf.size;
		}
		xen_remap_mfn = xen_remap_buf.next_area_mfn;
	}

	/* Flush the last pending range, if any. */
	if (pfn_s != ~0UL && len)
		xen_del_extra_mem(pfn_s, len);

	/* Restore old xen_remap_buf mapping */
	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);

	pr_info("Remapped %ld page(s)\n", remapped);
}
539 
540 static unsigned long __init xen_get_pages_limit(void)
541 {
542 	unsigned long limit;
543 
544 	limit = MAXMEM / PAGE_SIZE;
545 	if (!xen_initial_domain() && xen_512gb_limit)
546 		limit = GB(512) / PAGE_SIZE;
547 
548 	return limit;
549 }
550 
551 static unsigned long __init xen_get_max_pages(void)
552 {
553 	unsigned long max_pages, limit;
554 	domid_t domid = DOMID_SELF;
555 	long ret;
556 
557 	limit = xen_get_pages_limit();
558 	max_pages = limit;
559 
560 	/*
561 	 * For the initial domain we use the maximum reservation as
562 	 * the maximum page.
563 	 *
564 	 * For guest domains the current maximum reservation reflects
565 	 * the current maximum rather than the static maximum. In this
566 	 * case the e820 map provided to us will cover the static
567 	 * maximum region.
568 	 */
569 	if (xen_initial_domain()) {
570 		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
571 		if (ret > 0)
572 			max_pages = ret;
573 	}
574 
575 	return min(max_pages, limit);
576 }
577 
578 static void __init xen_align_and_add_e820_region(phys_addr_t start,
579 						 phys_addr_t size, int type)
580 {
581 	phys_addr_t end = start + size;
582 
583 	/* Align RAM regions to page boundaries. */
584 	if (type == E820_TYPE_RAM) {
585 		start = PAGE_ALIGN(start);
586 		end &= ~((phys_addr_t)PAGE_SIZE - 1);
587 #ifdef CONFIG_MEMORY_HOTPLUG
588 		/*
589 		 * Don't allow adding memory not in E820 map while booting the
590 		 * system. Once the balloon driver is up it will remove that
591 		 * restriction again.
592 		 */
593 		max_mem_size = end;
594 #endif
595 	}
596 
597 	e820__range_add(start, end - start, type);
598 }
599 
600 static void __init xen_ignore_unusable(void)
601 {
602 	struct e820_entry *entry = xen_e820_table.entries;
603 	unsigned int i;
604 
605 	for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
606 		if (entry->type == E820_TYPE_UNUSABLE)
607 			entry->type = E820_TYPE_RAM;
608 	}
609 }
610 
611 bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
612 {
613 	struct e820_entry *entry;
614 	unsigned mapcnt;
615 	phys_addr_t end;
616 
617 	if (!size)
618 		return false;
619 
620 	end = start + size;
621 	entry = xen_e820_table.entries;
622 
623 	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
624 		if (entry->type == E820_TYPE_RAM && entry->addr <= start &&
625 		    (entry->addr + entry->size) >= end)
626 			return false;
627 
628 		entry++;
629 	}
630 
631 	return true;
632 }
633 
634 /*
635  * Find a free area in physical memory not yet reserved and compliant with
636  * E820 map.
637  * Used to relocate pre-allocated areas like initrd or p2m list which are in
638  * conflict with the to be used E820 map.
639  * In case no area is found, return 0. Otherwise return the physical address
640  * of the area which is already reserved for convenience.
641  */
642 phys_addr_t __init xen_find_free_area(phys_addr_t size)
643 {
644 	unsigned mapcnt;
645 	phys_addr_t addr, start;
646 	struct e820_entry *entry = xen_e820_table.entries;
647 
648 	for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
649 		if (entry->type != E820_TYPE_RAM || entry->size < size)
650 			continue;
651 		start = entry->addr;
652 		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
653 			if (!memblock_is_reserved(addr))
654 				continue;
655 			start = addr + PAGE_SIZE;
656 			if (start + size > entry->addr + entry->size)
657 				break;
658 		}
659 		if (addr >= start + size) {
660 			memblock_reserve(start, size);
661 			return start;
662 		}
663 	}
664 
665 	return 0;
666 }
667 
668 /*
669  * Like memcpy, but with physical addresses for dest and src.
670  */
671 static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
672 				   phys_addr_t n)
673 {
674 	phys_addr_t dest_off, src_off, dest_len, src_len, len;
675 	void *from, *to;
676 
677 	while (n) {
678 		dest_off = dest & ~PAGE_MASK;
679 		src_off = src & ~PAGE_MASK;
680 		dest_len = n;
681 		if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
682 			dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
683 		src_len = n;
684 		if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
685 			src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
686 		len = min(dest_len, src_len);
687 		to = early_memremap(dest - dest_off, dest_len + dest_off);
688 		from = early_memremap(src - src_off, src_len + src_off);
689 		memcpy(to, from, len);
690 		early_memunmap(to, dest_len + dest_off);
691 		early_memunmap(from, src_len + src_off);
692 		n -= len;
693 		dest += len;
694 		src += len;
695 	}
696 }
697 
698 /*
699  * Reserve Xen mfn_list.
700  */
701 static void __init xen_reserve_xen_mfnlist(void)
702 {
703 	phys_addr_t start, size;
704 
705 	if (xen_start_info->mfn_list >= __START_KERNEL_map) {
706 		start = __pa(xen_start_info->mfn_list);
707 		size = PFN_ALIGN(xen_start_info->nr_pages *
708 				 sizeof(unsigned long));
709 	} else {
710 		start = PFN_PHYS(xen_start_info->first_p2m_pfn);
711 		size = PFN_PHYS(xen_start_info->nr_p2m_frames);
712 	}
713 
714 	memblock_reserve(start, size);
715 	if (!xen_is_e820_reserved(start, size))
716 		return;
717 
718 	xen_relocate_p2m();
719 	memblock_phys_free(start, size);
720 }
721 
/**
 * xen_memory_setup - Hook for machine specific memory setup.
 *
 * Obtains the memory map from the hypervisor, builds the kernel's E820 map
 * from it (limiting RAM to the initial allocation plus a bounded amount of
 * extra memory), checks pre-allocated areas (kernel, page tables, p2m list,
 * initrd) for conflicts with that map and prepares identity mapping of
 * non-RAM areas including the remapping of the RAM found there.
 *
 * Return: the string "Xen".
 */
char * __init xen_memory_setup(void)
{
	unsigned long max_pfn, pfn_s, n_pfns;
	phys_addr_t mem_end, addr, size, chunk_size;
	u32 type;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long extra_pages = 0;
	int i;
	int op;

	/*
	 * max_pfn: size of the initial allocation, capped by the page limit.
	 * mem_end: the physical address corresponding to it.
	 */
	xen_parse_512gb();
	max_pfn = xen_get_pages_limit();
	max_pfn = min(max_pfn, xen_start_info->nr_pages);
	mem_end = PFN_PHYS(max_pfn);

	/* Let the hypervisor write its memory map into xen_e820_table. */
	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);

#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
	xen_saved_max_mem_size = max_mem_size;
#endif

	/* The initial domain uses the machine memory map. */
	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		/*
		 * No memory map available (not tolerated for the initial
		 * domain): fake a single RAM entry starting at 0.
		 */
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		xen_e820_table.entries[0].addr = 0ULL;
		xen_e820_table.entries[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		xen_e820_table.entries[0].size += 8ULL << 20;
		xen_e820_table.entries[0].type = E820_TYPE_RAM;
		rc = 0;
	}
	BUG_ON(rc);
	BUG_ON(memmap.nr_entries == 0);
	xen_e820_table.nr_entries = memmap.nr_entries;

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable();

	/* Make sure the Xen-supplied memory map is well-ordered. */
	e820__update_table(&xen_e820_table);

	max_pages = xen_get_max_pages();

	/* How many extra pages do we need due to remapping? */
	max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);

	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
	 * factor the base size.
	 *
	 * Make sure we have no memory above max_pages, as this area
	 * isn't handled by the p2m management.
	 */
	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			   extra_pages, max_pages - max_pfn);
	/*
	 * Transfer the Xen map to the kernel's E820 map. addr/size track the
	 * not yet processed part of entry i, as a RAM entry may be handled
	 * in several chunks:
	 *  - RAM below mem_end is added as is,
	 *  - RAM above mem_end is added as extra memory as long as the
	 *    extra_pages budget lasts,
	 *  - any remaining RAM is discarded.
	 * Non-RAM entries are added unmodified.
	 */
	i = 0;
	addr = xen_e820_table.entries[0].addr;
	size = xen_e820_table.entries[0].size;
	while (i < xen_e820_table.nr_entries) {
		bool discard = false;

		chunk_size = size;
		type = xen_e820_table.entries[i].type;

		if (type == E820_TYPE_RAM) {
			if (addr < mem_end) {
				chunk_size = min(size, mem_end - addr);
			} else if (extra_pages) {
				chunk_size = min(size, PFN_PHYS(extra_pages));
				pfn_s = PFN_UP(addr);
				n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
				extra_pages -= n_pfns;
				xen_add_extra_mem(pfn_s, n_pfns);
				xen_max_p2m_pfn = pfn_s + n_pfns;
			} else
				discard = true;
		}

		if (!discard)
			xen_align_and_add_e820_region(addr, chunk_size, type);

		/* Advance within the entry, or step to the next one. */
		addr += chunk_size;
		size -= chunk_size;
		if (size == 0) {
			i++;
			if (i < xen_e820_table.nr_entries) {
				addr = xen_e820_table.entries[i].addr;
				size = xen_e820_table.entries[i].size;
			}
		}
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 */
	set_phys_range_identity(addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);

	e820__update_table(e820_table);

	/*
	 * Check whether the kernel itself conflicts with the target E820 map.
	 * Failing now is better than running into weird problems later due
	 * to relocating (and even reusing) pages with kernel text or data.
	 */
	if (xen_is_e820_reserved(__pa_symbol(_text),
			__pa_symbol(__bss_stop) - __pa_symbol(_text))) {
		xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
		BUG();
	}

	/*
	 * Check for a conflict of the hypervisor supplied page tables with
	 * the target E820 map.
	 */
	xen_pt_check_e820();

	xen_reserve_xen_mfnlist();

	/* Check for a conflict of the initrd with the target E820 map. */
	if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
				 boot_params.hdr.ramdisk_size)) {
		phys_addr_t new_area, start, size;

		/* Move the initrd to a free area not conflicting with E820. */
		new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
		if (!new_area) {
			xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
			BUG();
		}

		start = boot_params.hdr.ramdisk_image;
		size = boot_params.hdr.ramdisk_size;
		xen_phys_memcpy(new_area, start, size);
		pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
			start, start + size, new_area, new_area + size);
		memblock_phys_free(start, size);
		/* Low 32 bits of the new address go here ... */
		boot_params.hdr.ramdisk_image = new_area;
		/* ... and the upper 32 bits here. */
		boot_params.ext_ramdisk_image = new_area >> 32;
	}

	/*
	 * Set identity map on non-RAM pages and prepare remapping the
	 * underlying RAM.
	 */
	xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);

	pr_info("Released %ld page(s)\n", xen_released_pages);

	return "Xen";
}
900 
901 static int register_callback(unsigned type, const void *func)
902 {
903 	struct callback_register callback = {
904 		.type = type,
905 		.address = XEN_CALLBACK(__KERNEL_CS, func),
906 		.flags = CALLBACKF_mask_events,
907 	};
908 
909 	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
910 }
911 
912 void xen_enable_sysenter(void)
913 {
914 	if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) &&
915 	    register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat))
916 		setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
917 }
918 
919 void xen_enable_syscall(void)
920 {
921 	int ret;
922 
923 	ret = register_callback(CALLBACKTYPE_syscall, xen_entry_SYSCALL_64);
924 	if (ret != 0) {
925 		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
926 		/* Pretty fatal; 64-bit userspace has no other
927 		   mechanism for syscalls. */
928 	}
929 
930 	if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) &&
931 	    register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat))
932 		setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
933 }
934 
935 static void __init xen_pvmmu_arch_setup(void)
936 {
937 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
938 
939 	if (register_callback(CALLBACKTYPE_event,
940 			      xen_asm_exc_xen_hypervisor_callback) ||
941 	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
942 		BUG();
943 
944 	xen_enable_sysenter();
945 	xen_enable_syscall();
946 }
947 
948 /* This function is not called for HVM domains */
949 void __init xen_arch_setup(void)
950 {
951 	xen_panic_handler_init();
952 	xen_pvmmu_arch_setup();
953 
954 #ifdef CONFIG_ACPI
955 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
956 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
957 		disable_acpi();
958 	}
959 #endif
960 
961 	memcpy(boot_command_line, xen_start_info->cmd_line,
962 	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
963 	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
964 
965 	/* Set up idle, making sure it calls safe_halt() pvop */
966 	disable_cpuidle();
967 	disable_cpufreq();
968 	WARN_ON(xen_set_default_idle());
969 #ifdef CONFIG_NUMA
970 	numa_off = 1;
971 #endif
972 }
973