xref: /openbmc/linux/arch/x86/xen/setup.c (revision 65ee8aeb)
1 /*
2  * Machine specific setup for xen
3  *
4  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5  */
6 
7 #include <linux/module.h>
8 #include <linux/sched.h>
9 #include <linux/mm.h>
10 #include <linux/pm.h>
11 #include <linux/memblock.h>
12 #include <linux/cpuidle.h>
13 #include <linux/cpufreq.h>
14 
15 #include <asm/elf.h>
16 #include <asm/vdso.h>
17 #include <asm/e820.h>
18 #include <asm/setup.h>
19 #include <asm/acpi.h>
20 #include <asm/numa.h>
21 #include <asm/xen/hypervisor.h>
22 #include <asm/xen/hypercall.h>
23 
24 #include <xen/xen.h>
25 #include <xen/page.h>
26 #include <xen/interface/callback.h>
27 #include <xen/interface/memory.h>
28 #include <xen/interface/physdev.h>
29 #include <xen/features.h>
30 #include "xen-ops.h"
31 #include "vdso.h"
32 #include "p2m.h"
33 #include "mmu.h"
34 
35 /* Amount of extra memory space we add to the e820 ranges */
36 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
37 
38 /* Number of pages released from the initial allocation. */
39 unsigned long xen_released_pages;
40 
41 /*
42  * Buffer used to remap identity mapped pages. We only need the virtual space.
43  * The physical page behind this address is remapped as needed to different
44  * buffer pages.
45  */
46 #define REMAP_SIZE	(P2M_PER_PAGE - 3)
47 static struct {
48 	unsigned long	next_area_mfn;
49 	unsigned long	target_pfn;
50 	unsigned long	size;
51 	unsigned long	mfns[REMAP_SIZE];
52 } xen_remap_buf __initdata __aligned(PAGE_SIZE);
53 static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
54 
/*
 * Upper bound for the amount of extra memory, expressed as a multiple
 * of the base size.  What limits the ratio is the size of struct page:
 * with an extreme base:extra ratio the page structures describing the
 * extra memory could consume all of the base memory and leave nothing
 * for anything else.
 *
 * A factor of 10 looks like a sensible compromise between scaling
 * flexibility and keeping the system practically usable.
 */
#define EXTRA_MEM_RATIO		(10)
66 
67 static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
68 {
69 	int i;
70 
71 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
72 		/* Add new region. */
73 		if (xen_extra_mem[i].size == 0) {
74 			xen_extra_mem[i].start = start;
75 			xen_extra_mem[i].size  = size;
76 			break;
77 		}
78 		/* Append to existing region. */
79 		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
80 			xen_extra_mem[i].size += size;
81 			break;
82 		}
83 	}
84 	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
85 		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
86 
87 	memblock_reserve(start, size);
88 }
89 
90 static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
91 {
92 	int i;
93 	phys_addr_t start_r, size_r;
94 
95 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
96 		start_r = xen_extra_mem[i].start;
97 		size_r = xen_extra_mem[i].size;
98 
99 		/* Start of region. */
100 		if (start_r == start) {
101 			BUG_ON(size > size_r);
102 			xen_extra_mem[i].start += size;
103 			xen_extra_mem[i].size -= size;
104 			break;
105 		}
106 		/* End of region. */
107 		if (start_r + size_r == start + size) {
108 			BUG_ON(size > size_r);
109 			xen_extra_mem[i].size -= size;
110 			break;
111 		}
112 		/* Mid of region. */
113 		if (start > start_r && start < start_r + size_r) {
114 			BUG_ON(start + size > start_r + size_r);
115 			xen_extra_mem[i].size = start - start_r;
116 			/* Calling memblock_reserve() again is okay. */
117 			xen_add_extra_mem(start + size, start_r + size_r -
118 					  (start + size));
119 			break;
120 		}
121 	}
122 	memblock_free(start, size);
123 }
124 
125 /*
126  * Called during boot before the p2m list can take entries beyond the
127  * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
128  * invalid.
129  */
130 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
131 {
132 	int i;
133 	phys_addr_t addr = PFN_PHYS(pfn);
134 
135 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
136 		if (addr >= xen_extra_mem[i].start &&
137 		    addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
138 			return INVALID_P2M_ENTRY;
139 	}
140 
141 	return IDENTITY_FRAME(pfn);
142 }
143 
144 /*
145  * Mark all pfns of extra mem as invalid in p2m list.
146  */
147 void __init xen_inv_extra_mem(void)
148 {
149 	unsigned long pfn, pfn_s, pfn_e;
150 	int i;
151 
152 	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
153 		if (!xen_extra_mem[i].size)
154 			continue;
155 		pfn_s = PFN_DOWN(xen_extra_mem[i].start);
156 		pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
157 		for (pfn = pfn_s; pfn < pfn_e; pfn++)
158 			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
159 	}
160 }
161 
162 /*
163  * Finds the next RAM pfn available in the E820 map after min_pfn.
164  * This function updates min_pfn with the pfn found and returns
165  * the size of that range or zero if not found.
166  */
167 static unsigned long __init xen_find_pfn_range(
168 	const struct e820entry *list, size_t map_size,
169 	unsigned long *min_pfn)
170 {
171 	const struct e820entry *entry;
172 	unsigned int i;
173 	unsigned long done = 0;
174 
175 	for (i = 0, entry = list; i < map_size; i++, entry++) {
176 		unsigned long s_pfn;
177 		unsigned long e_pfn;
178 
179 		if (entry->type != E820_RAM)
180 			continue;
181 
182 		e_pfn = PFN_DOWN(entry->addr + entry->size);
183 
184 		/* We only care about E820 after this */
185 		if (e_pfn < *min_pfn)
186 			continue;
187 
188 		s_pfn = PFN_UP(entry->addr);
189 
190 		/* If min_pfn falls within the E820 entry, we want to start
191 		 * at the min_pfn PFN.
192 		 */
193 		if (s_pfn <= *min_pfn) {
194 			done = e_pfn - *min_pfn;
195 		} else {
196 			done = e_pfn - s_pfn;
197 			*min_pfn = s_pfn;
198 		}
199 		break;
200 	}
201 
202 	return done;
203 }
204 
205 static int __init xen_free_mfn(unsigned long mfn)
206 {
207 	struct xen_memory_reservation reservation = {
208 		.address_bits = 0,
209 		.extent_order = 0,
210 		.domid        = DOMID_SELF
211 	};
212 
213 	set_xen_guest_handle(reservation.extent_start, &mfn);
214 	reservation.nr_extents = 1;
215 
216 	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
217 }
218 
219 /*
220  * This releases a chunk of memory and then does the identity map. It's used
221  * as a fallback if the remapping fails.
222  */
223 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
224 	unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
225 {
226 	unsigned long pfn, end;
227 	int ret;
228 
229 	WARN_ON(start_pfn > end_pfn);
230 
231 	/* Release pages first. */
232 	end = min(end_pfn, nr_pages);
233 	for (pfn = start_pfn; pfn < end; pfn++) {
234 		unsigned long mfn = pfn_to_mfn(pfn);
235 
236 		/* Make sure pfn exists to start with */
237 		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
238 			continue;
239 
240 		ret = xen_free_mfn(mfn);
241 		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
242 
243 		if (ret == 1) {
244 			(*released)++;
245 			if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
246 				break;
247 		} else
248 			break;
249 	}
250 
251 	set_phys_range_identity(start_pfn, end_pfn);
252 }
253 
254 /*
255  * Helper function to update the p2m and m2p tables and kernel mapping.
256  */
257 static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
258 {
259 	struct mmu_update update = {
260 		.ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
261 		.val = pfn
262 	};
263 
264 	/* Update p2m */
265 	if (!set_phys_to_machine(pfn, mfn)) {
266 		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
267 		     pfn, mfn);
268 		BUG();
269 	}
270 
271 	/* Update m2p */
272 	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
273 		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
274 		     mfn, pfn);
275 		BUG();
276 	}
277 
278 	/* Update kernel mapping, but not for highmem. */
279 	if (pfn >= PFN_UP(__pa(high_memory - 1)))
280 		return;
281 
282 	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
283 					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
284 		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
285 		      mfn, pfn);
286 		BUG();
287 	}
288 }
289 
290 /*
291  * This function updates the p2m and m2p tables with an identity map from
292  * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
293  * original allocation at remap_pfn. The information needed for remapping is
294  * saved in the memory itself to avoid the need for allocating buffers. The
295  * complete remap information is contained in a list of MFNs each containing
296  * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
297  * This enables us to preserve the original mfn sequence while doing the
298  * remapping at a time when the memory management is capable of allocating
299  * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
300  * its callers.
301  */
302 static void __init xen_do_set_identity_and_remap_chunk(
303         unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
304 {
305 	unsigned long buf = (unsigned long)&xen_remap_buf;
306 	unsigned long mfn_save, mfn;
307 	unsigned long ident_pfn_iter, remap_pfn_iter;
308 	unsigned long ident_end_pfn = start_pfn + size;
309 	unsigned long left = size;
310 	unsigned int i, chunk;
311 
312 	WARN_ON(size == 0);
313 
314 	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
315 
316 	mfn_save = virt_to_mfn(buf);
317 
318 	for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
319 	     ident_pfn_iter < ident_end_pfn;
320 	     ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
321 		chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
322 
323 		/* Map first pfn to xen_remap_buf */
324 		mfn = pfn_to_mfn(ident_pfn_iter);
325 		set_pte_mfn(buf, mfn, PAGE_KERNEL);
326 
327 		/* Save mapping information in page */
328 		xen_remap_buf.next_area_mfn = xen_remap_mfn;
329 		xen_remap_buf.target_pfn = remap_pfn_iter;
330 		xen_remap_buf.size = chunk;
331 		for (i = 0; i < chunk; i++)
332 			xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
333 
334 		/* Put remap buf into list. */
335 		xen_remap_mfn = mfn;
336 
337 		/* Set identity map */
338 		set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);
339 
340 		left -= chunk;
341 	}
342 
343 	/* Restore old xen_remap_buf mapping */
344 	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
345 }
346 
347 /*
348  * This function takes a contiguous pfn range that needs to be identity mapped
349  * and:
350  *
351  *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
352  *  2) Calls the do_ function to actually do the mapping/remapping work.
353  *
354  * The goal is to not allocate additional memory but to remap the existing
355  * pages. In the case of an error the underlying memory is simply released back
356  * to Xen and not remapped.
357  */
358 static unsigned long __init xen_set_identity_and_remap_chunk(
359         const struct e820entry *list, size_t map_size, unsigned long start_pfn,
360 	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
361 	unsigned long *released, unsigned long *remapped)
362 {
363 	unsigned long pfn;
364 	unsigned long i = 0;
365 	unsigned long n = end_pfn - start_pfn;
366 
367 	while (i < n) {
368 		unsigned long cur_pfn = start_pfn + i;
369 		unsigned long left = n - i;
370 		unsigned long size = left;
371 		unsigned long remap_range_size;
372 
373 		/* Do not remap pages beyond the current allocation */
374 		if (cur_pfn >= nr_pages) {
375 			/* Identity map remaining pages */
376 			set_phys_range_identity(cur_pfn, cur_pfn + size);
377 			break;
378 		}
379 		if (cur_pfn + size > nr_pages)
380 			size = nr_pages - cur_pfn;
381 
382 		remap_range_size = xen_find_pfn_range(list, map_size,
383 						      &remap_pfn);
384 		if (!remap_range_size) {
385 			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
386 			xen_set_identity_and_release_chunk(cur_pfn,
387 				cur_pfn + left, nr_pages, released);
388 			break;
389 		}
390 		/* Adjust size to fit in current e820 RAM region */
391 		if (size > remap_range_size)
392 			size = remap_range_size;
393 
394 		xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
395 
396 		/* Update variables to reflect new mappings. */
397 		i += size;
398 		remap_pfn += size;
399 		*remapped += size;
400 	}
401 
402 	/*
403 	 * If the PFNs are currently mapped, the VA mapping also needs
404 	 * to be updated to be 1:1.
405 	 */
406 	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
407 		(void)HYPERVISOR_update_va_mapping(
408 			(unsigned long)__va(pfn << PAGE_SHIFT),
409 			mfn_pte(pfn, PAGE_KERNEL_IO), 0);
410 
411 	return remap_pfn;
412 }
413 
414 static void __init xen_set_identity_and_remap(
415 	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
416 	unsigned long *released, unsigned long *remapped)
417 {
418 	phys_addr_t start = 0;
419 	unsigned long last_pfn = nr_pages;
420 	const struct e820entry *entry;
421 	unsigned long num_released = 0;
422 	unsigned long num_remapped = 0;
423 	int i;
424 
425 	/*
426 	 * Combine non-RAM regions and gaps until a RAM region (or the
427 	 * end of the map) is reached, then set the 1:1 map and
428 	 * remap the memory in those non-RAM regions.
429 	 *
430 	 * The combined non-RAM regions are rounded to a whole number
431 	 * of pages so any partial pages are accessible via the 1:1
432 	 * mapping.  This is needed for some BIOSes that put (for
433 	 * example) the DMI tables in a reserved region that begins on
434 	 * a non-page boundary.
435 	 */
436 	for (i = 0, entry = list; i < map_size; i++, entry++) {
437 		phys_addr_t end = entry->addr + entry->size;
438 		if (entry->type == E820_RAM || i == map_size - 1) {
439 			unsigned long start_pfn = PFN_DOWN(start);
440 			unsigned long end_pfn = PFN_UP(end);
441 
442 			if (entry->type == E820_RAM)
443 				end_pfn = PFN_UP(entry->addr);
444 
445 			if (start_pfn < end_pfn)
446 				last_pfn = xen_set_identity_and_remap_chunk(
447 						list, map_size, start_pfn,
448 						end_pfn, nr_pages, last_pfn,
449 						&num_released, &num_remapped);
450 			start = end;
451 		}
452 	}
453 
454 	*released = num_released;
455 	*remapped = num_remapped;
456 
457 	pr_info("Released %ld page(s)\n", num_released);
458 }
459 
460 /*
461  * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
462  * The remap information (which mfn remap to which pfn) is contained in the
463  * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
464  * This scheme allows to remap the different chunks in arbitrary order while
465  * the resulting mapping will be independant from the order.
466  */
467 void __init xen_remap_memory(void)
468 {
469 	unsigned long buf = (unsigned long)&xen_remap_buf;
470 	unsigned long mfn_save, mfn, pfn;
471 	unsigned long remapped = 0;
472 	unsigned int i;
473 	unsigned long pfn_s = ~0UL;
474 	unsigned long len = 0;
475 
476 	mfn_save = virt_to_mfn(buf);
477 
478 	while (xen_remap_mfn != INVALID_P2M_ENTRY) {
479 		/* Map the remap information */
480 		set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
481 
482 		BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
483 
484 		pfn = xen_remap_buf.target_pfn;
485 		for (i = 0; i < xen_remap_buf.size; i++) {
486 			mfn = xen_remap_buf.mfns[i];
487 			xen_update_mem_tables(pfn, mfn);
488 			remapped++;
489 			pfn++;
490 		}
491 		if (pfn_s == ~0UL || pfn == pfn_s) {
492 			pfn_s = xen_remap_buf.target_pfn;
493 			len += xen_remap_buf.size;
494 		} else if (pfn_s + len == xen_remap_buf.target_pfn) {
495 			len += xen_remap_buf.size;
496 		} else {
497 			xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
498 			pfn_s = xen_remap_buf.target_pfn;
499 			len = xen_remap_buf.size;
500 		}
501 
502 		mfn = xen_remap_mfn;
503 		xen_remap_mfn = xen_remap_buf.next_area_mfn;
504 	}
505 
506 	if (pfn_s != ~0UL && len)
507 		xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
508 
509 	set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
510 
511 	pr_info("Remapped %ld page(s)\n", remapped);
512 }
513 
514 static unsigned long __init xen_get_max_pages(void)
515 {
516 	unsigned long max_pages = MAX_DOMAIN_PAGES;
517 	domid_t domid = DOMID_SELF;
518 	int ret;
519 
520 	/*
521 	 * For the initial domain we use the maximum reservation as
522 	 * the maximum page.
523 	 *
524 	 * For guest domains the current maximum reservation reflects
525 	 * the current maximum rather than the static maximum. In this
526 	 * case the e820 map provided to us will cover the static
527 	 * maximum region.
528 	 */
529 	if (xen_initial_domain()) {
530 		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
531 		if (ret > 0)
532 			max_pages = ret;
533 	}
534 
535 	return min(max_pages, MAX_DOMAIN_PAGES);
536 }
537 
538 static void __init xen_align_and_add_e820_region(phys_addr_t start,
539 						 phys_addr_t size, int type)
540 {
541 	phys_addr_t end = start + size;
542 
543 	/* Align RAM regions to page boundaries. */
544 	if (type == E820_RAM) {
545 		start = PAGE_ALIGN(start);
546 		end &= ~((phys_addr_t)PAGE_SIZE - 1);
547 	}
548 
549 	e820_add_region(start, end - start, type);
550 }
551 
552 static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
553 {
554 	struct e820entry *entry;
555 	unsigned int i;
556 
557 	for (i = 0, entry = list; i < map_size; i++, entry++) {
558 		if (entry->type == E820_UNUSABLE)
559 			entry->type = E820_RAM;
560 	}
561 }
562 
563 /**
564  * machine_specific_memory_setup - Hook for machine specific memory setup.
565  **/
566 char * __init xen_memory_setup(void)
567 {
568 	static struct e820entry map[E820MAX] __initdata;
569 
570 	unsigned long max_pfn = xen_start_info->nr_pages;
571 	phys_addr_t mem_end;
572 	int rc;
573 	struct xen_memory_map memmap;
574 	unsigned long max_pages;
575 	unsigned long extra_pages = 0;
576 	unsigned long remapped_pages;
577 	int i;
578 	int op;
579 
580 	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
581 	mem_end = PFN_PHYS(max_pfn);
582 
583 	memmap.nr_entries = E820MAX;
584 	set_xen_guest_handle(memmap.buffer, map);
585 
586 	op = xen_initial_domain() ?
587 		XENMEM_machine_memory_map :
588 		XENMEM_memory_map;
589 	rc = HYPERVISOR_memory_op(op, &memmap);
590 	if (rc == -ENOSYS) {
591 		BUG_ON(xen_initial_domain());
592 		memmap.nr_entries = 1;
593 		map[0].addr = 0ULL;
594 		map[0].size = mem_end;
595 		/* 8MB slack (to balance backend allocations). */
596 		map[0].size += 8ULL << 20;
597 		map[0].type = E820_RAM;
598 		rc = 0;
599 	}
600 	BUG_ON(rc);
601 	BUG_ON(memmap.nr_entries == 0);
602 
603 	/*
604 	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
605 	 * regions, so if we're using the machine memory map leave the
606 	 * region as RAM as it is in the pseudo-physical map.
607 	 *
608 	 * UNUSABLE regions in domUs are not handled and will need
609 	 * a patch in the future.
610 	 */
611 	if (xen_initial_domain())
612 		xen_ignore_unusable(map, memmap.nr_entries);
613 
614 	/* Make sure the Xen-supplied memory map is well-ordered. */
615 	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
616 
617 	max_pages = xen_get_max_pages();
618 	if (max_pages > max_pfn)
619 		extra_pages += max_pages - max_pfn;
620 
621 	/*
622 	 * Set identity map on non-RAM pages and prepare remapping the
623 	 * underlying RAM.
624 	 */
625 	xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
626 				   &xen_released_pages, &remapped_pages);
627 
628 	extra_pages += xen_released_pages;
629 	extra_pages += remapped_pages;
630 
631 	/*
632 	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
633 	 * factor the base size.  On non-highmem systems, the base
634 	 * size is the full initial memory allocation; on highmem it
635 	 * is limited to the max size of lowmem, so that it doesn't
636 	 * get completely filled.
637 	 *
638 	 * In principle there could be a problem in lowmem systems if
639 	 * the initial memory is also very large with respect to
640 	 * lowmem, but we won't try to deal with that here.
641 	 */
642 	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
643 			  extra_pages);
644 	i = 0;
645 	while (i < memmap.nr_entries) {
646 		phys_addr_t addr = map[i].addr;
647 		phys_addr_t size = map[i].size;
648 		u32 type = map[i].type;
649 
650 		if (type == E820_RAM) {
651 			if (addr < mem_end) {
652 				size = min(size, mem_end - addr);
653 			} else if (extra_pages) {
654 				size = min(size, PFN_PHYS(extra_pages));
655 				extra_pages -= PFN_DOWN(size);
656 				xen_add_extra_mem(addr, size);
657 				xen_max_p2m_pfn = PFN_DOWN(addr + size);
658 			} else
659 				type = E820_UNUSABLE;
660 		}
661 
662 		xen_align_and_add_e820_region(addr, size, type);
663 
664 		map[i].addr += size;
665 		map[i].size -= size;
666 		if (map[i].size == 0)
667 			i++;
668 	}
669 
670 	/*
671 	 * Set the rest as identity mapped, in case PCI BARs are
672 	 * located here.
673 	 *
674 	 * PFNs above MAX_P2M_PFN are considered identity mapped as
675 	 * well.
676 	 */
677 	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
678 
679 	/*
680 	 * In domU, the ISA region is normal, usable memory, but we
681 	 * reserve ISA memory anyway because too many things poke
682 	 * about in there.
683 	 */
684 	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
685 			E820_RESERVED);
686 
687 	/*
688 	 * Reserve Xen bits:
689 	 *  - mfn_list
690 	 *  - xen_start_info
691 	 * See comment above "struct start_info" in <xen/interface/xen.h>
692 	 * We tried to make the the memblock_reserve more selective so
693 	 * that it would be clear what region is reserved. Sadly we ran
694 	 * in the problem wherein on a 64-bit hypervisor with a 32-bit
695 	 * initial domain, the pt_base has the cr3 value which is not
696 	 * neccessarily where the pagetable starts! As Jan put it: "
697 	 * Actually, the adjustment turns out to be correct: The page
698 	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
699 	 * "first L2", "first L3", so the offset to the page table base is
700 	 * indeed 2. When reading xen/include/public/xen.h's comment
701 	 * very strictly, this is not a violation (since there nothing is said
702 	 * that the first thing in the page table space is pointed to by
703 	 * pt_base; I admit that this seems to be implied though, namely
704 	 * do I think that it is implied that the page table space is the
705 	 * range [pt_base, pt_base + nt_pt_frames), whereas that
706 	 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
707 	 * which - without a priori knowledge - the kernel would have
708 	 * difficulty to figure out)." - so lets just fall back to the
709 	 * easy way and reserve the whole region.
710 	 */
711 	memblock_reserve(__pa(xen_start_info->mfn_list),
712 			 xen_start_info->pt_base - xen_start_info->mfn_list);
713 
714 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
715 
716 	return "Xen";
717 }
718 
719 /*
720  * Machine specific memory setup for auto-translated guests.
721  */
722 char * __init xen_auto_xlated_memory_setup(void)
723 {
724 	static struct e820entry map[E820MAX] __initdata;
725 
726 	struct xen_memory_map memmap;
727 	int i;
728 	int rc;
729 
730 	memmap.nr_entries = E820MAX;
731 	set_xen_guest_handle(memmap.buffer, map);
732 
733 	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
734 	if (rc < 0)
735 		panic("No memory map (%d)\n", rc);
736 
737 	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
738 
739 	for (i = 0; i < memmap.nr_entries; i++)
740 		e820_add_region(map[i].addr, map[i].size, map[i].type);
741 
742 	memblock_reserve(__pa(xen_start_info->mfn_list),
743 			 xen_start_info->pt_base - xen_start_info->mfn_list);
744 
745 	return "Xen";
746 }
747 
748 /*
749  * Set the bit indicating "nosegneg" library variants should be used.
750  * We only need to bother in pure 32-bit mode; compat 32-bit processes
751  * can have un-truncated segments, so wrapping around is allowed.
752  */
753 static void __init fiddle_vdso(void)
754 {
755 #ifdef CONFIG_X86_32
756 	/*
757 	 * This could be called before selected_vdso32 is initialized, so
758 	 * just fiddle with both possible images.  vdso_image_32_syscall
759 	 * can't be selected, since it only exists on 64-bit systems.
760 	 */
761 	u32 *mask;
762 	mask = vdso_image_32_int80.data +
763 		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
764 	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
765 	mask = vdso_image_32_sysenter.data +
766 		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
767 	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
768 #endif
769 }
770 
771 static int register_callback(unsigned type, const void *func)
772 {
773 	struct callback_register callback = {
774 		.type = type,
775 		.address = XEN_CALLBACK(__KERNEL_CS, func),
776 		.flags = CALLBACKF_mask_events,
777 	};
778 
779 	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
780 }
781 
782 void xen_enable_sysenter(void)
783 {
784 	int ret;
785 	unsigned sysenter_feature;
786 
787 #ifdef CONFIG_X86_32
788 	sysenter_feature = X86_FEATURE_SEP;
789 #else
790 	sysenter_feature = X86_FEATURE_SYSENTER32;
791 #endif
792 
793 	if (!boot_cpu_has(sysenter_feature))
794 		return;
795 
796 	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
797 	if(ret != 0)
798 		setup_clear_cpu_cap(sysenter_feature);
799 }
800 
/*
 * On 64-bit, register the syscall callback and, if the boot CPU has
 * X86_FEATURE_SYSCALL32, the syscall32 callback as well. A failure of the
 * former is only logged; a failure of the latter clears the feature bit.
 * On other configurations the function does nothing.
 */
void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int err;

	err = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (err != 0) {
		/*
		 * Pretty fatal; 64-bit userspace has no other
		 * mechanism for syscalls.
		 */
		printk(KERN_ERR "Failed to set syscall callback: %d\n", err);
	}

	if (!boot_cpu_has(X86_FEATURE_SYSCALL32))
		return;

	err = register_callback(CALLBACKTYPE_syscall32, xen_syscall32_target);
	if (err != 0)
		setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
#endif /* CONFIG_X86_64 */
}
821 
822 void __init xen_pvmmu_arch_setup(void)
823 {
824 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
825 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
826 
827 	HYPERVISOR_vm_assist(VMASST_CMD_enable,
828 			     VMASST_TYPE_pae_extended_cr3);
829 
830 	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
831 	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
832 		BUG();
833 
834 	xen_enable_sysenter();
835 	xen_enable_syscall();
836 }
837 
838 /* This function is not called for HVM domains */
839 void __init xen_arch_setup(void)
840 {
841 	xen_panic_handler_init();
842 	if (!xen_feature(XENFEAT_auto_translated_physmap))
843 		xen_pvmmu_arch_setup();
844 
845 #ifdef CONFIG_ACPI
846 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
847 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
848 		disable_acpi();
849 	}
850 #endif
851 
852 	memcpy(boot_command_line, xen_start_info->cmd_line,
853 	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
854 	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
855 
856 	/* Set up idle, making sure it calls safe_halt() pvop */
857 	disable_cpuidle();
858 	disable_cpufreq();
859 	WARN_ON(xen_set_default_idle());
860 	fiddle_vdso();
861 #ifdef CONFIG_NUMA
862 	numa_off = 1;
863 #endif
864 }
865