/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
#ifdef CONFIG_X86_64
extern const char nmi[];
#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO (10)
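
/*
 * Record [start, start + size) as an extra-memory region: append it to
 * xen_extra_mem[] (merging with an adjacent region where possible),
 * reserve it with memblock so it is not handed out by the early
 * allocator, and mark its PFNs as not-yet-populated in the p2m.
 */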
static void __init xen_add_extra_mem(u64 start, u64 size)
{
        unsigned long pfn;
        int i;

        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
                /* Add new region. */
                if (xen_extra_mem[i].size == 0) {
                        xen_extra_mem[i].start = start;
                        xen_extra_mem[i].size = size;
                        break;
                }
                /* Append to existing region. */
                if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
                        xen_extra_mem[i].size += size;
                        break;
                }
        }
        if (i == XEN_EXTRA_MEM_MAX_REGIONS)
                printk(KERN_WARNING "Warning: not enough extra memory regions\n");

        memblock_reserve(start, size);

        xen_max_p2m_pfn = PFN_DOWN(start + size);
        for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
                unsigned long mfn = pfn_to_mfn(pfn);

                if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
                        continue;
                WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
                     pfn, mfn);

                __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
        }
}

static unsigned long __init xen_do_chunk(unsigned long start,
                                         unsigned long end, bool release)
{
        struct xen_memory_reservation reservation = {
                .address_bits = 0,
                .extent_order = 0,
                .domid        = DOMID_SELF
        };
        unsigned long len = 0;
        unsigned long pfn;
        int ret;

        for (pfn = start; pfn < end; pfn++) {
                unsigned long frame;
                unsigned long mfn = pfn_to_mfn(pfn);

                if (release) {
                        /* Make sure pfn exists to start with */
                        if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
                                continue;
                        frame = mfn;
                } else {
                        if (mfn != INVALID_P2M_ENTRY)
                                continue;
                        frame = pfn;
                }
                set_xen_guest_handle(reservation.extent_start, &frame);
                reservation.nr_extents = 1;

                ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
                                           &reservation);
                WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
                     release ? "release" : "populate", pfn, ret);

                if (ret == 1) {
                        if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
                                if (release)
                                        break;
                                set_xen_guest_handle(reservation.extent_start, &frame);
                                reservation.nr_extents = 1;
                                ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
                                                           &reservation);
                                break;
                        }
                        len++;
                } else
                        break;
        }
        if (len)
                printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
                       release ? "Freeing" : "Populating",
                       start, end, len,
                       release ? "freed" : "added");

        return len;
}

static unsigned long __init xen_release_chunk(unsigned long start,
                                              unsigned long end)
{
        return xen_do_chunk(start, end, true);
}
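
/*
 * Walk the E820 map and populate RAM regions that lie beyond max_pfn
 * (the end of the initial allocation), consuming at most credits_left
 * pages.  The PFN just past the last populated page is reported
 * through *last_pfn; the return value is the number of pages actually
 * populated.
 */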
static unsigned long __init xen_populate_chunk(
        const struct e820entry *list, size_t map_size,
        unsigned long max_pfn, unsigned long *last_pfn,
        unsigned long credits_left)
{
        const struct e820entry *entry;
        unsigned int i;
        unsigned long done = 0;
        unsigned long dest_pfn;

        for (i = 0, entry = list; i < map_size; i++, entry++) {
                unsigned long s_pfn;
                unsigned long e_pfn;
                unsigned long pfns;
                long capacity;

                if (credits_left <= 0)
                        break;

                if (entry->type != E820_RAM)
                        continue;

                e_pfn = PFN_DOWN(entry->addr + entry->size);

                /* We only care about E820 entries beyond xen_start_info->nr_pages */
                if (e_pfn <= max_pfn)
                        continue;

                s_pfn = PFN_UP(entry->addr);
                /* If the E820 entry falls within nr_pages, we want to start
                 * at the nr_pages PFN.
                 * If that would mean going past the E820 entry, skip it.
                 */
                if (s_pfn <= max_pfn) {
                        capacity = e_pfn - max_pfn;
                        dest_pfn = max_pfn;
                } else {
                        capacity = e_pfn - s_pfn;
                        dest_pfn = s_pfn;
                }

                if (credits_left < capacity)
                        capacity = credits_left;

                pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
                done += pfns;
                *last_pfn = (dest_pfn + pfns);
                if (pfns < capacity)
                        break;
                credits_left -= pfns;
        }
        return done;
}
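
/*
 * Handle one span of non-RAM PFNs [start_pfn, end_pfn): clear any
 * existing kernel mappings (except the ISA region, which stays 1:1)
 * so the hypervisor's references on the backing frames are dropped,
 * release the frames below nr_pages back to Xen, and mark the whole
 * span as identity (1:1) in the p2m.  The released/identity counters
 * are updated for the caller.
 */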
static void __init xen_set_identity_and_release_chunk(
        unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
        unsigned long *released, unsigned long *identity)
{
        unsigned long pfn;

        /*
         * If the PFNs are currently mapped, clear the mappings
         * (except for the ISA region which must be 1:1 mapped) to
         * release the refcounts (in Xen) on the original frames.
         */
        for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
                pte_t pte = __pte_ma(0);

                if (pfn < PFN_UP(ISA_END_ADDRESS))
                        pte = mfn_pte(pfn, PAGE_KERNEL_IO);

                (void)HYPERVISOR_update_va_mapping(
                        (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
        }

        if (start_pfn < nr_pages)
                *released += xen_release_chunk(
                        start_pfn, min(end_pfn, nr_pages));

        *identity += set_phys_range_identity(start_pfn, end_pfn);
}

static unsigned long __init xen_set_identity_and_release(
        const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
        phys_addr_t start = 0;
        unsigned long released = 0;
        unsigned long identity = 0;
        const struct e820entry *entry;
        int i;

        /*
         * Combine non-RAM regions and gaps until a RAM region (or the
         * end of the map) is reached, then set the 1:1 map and
         * release the pages (if available) in those non-RAM regions.
         *
         * The combined non-RAM regions are rounded to a whole number
         * of pages so any partial pages are accessible via the 1:1
         * mapping.  This is needed for some BIOSes that put (for
         * example) the DMI tables in a reserved region that begins on
         * a non-page boundary.
         */
        for (i = 0, entry = list; i < map_size; i++, entry++) {
                phys_addr_t end = entry->addr + entry->size;
                if (entry->type == E820_RAM || i == map_size - 1) {
                        unsigned long start_pfn = PFN_DOWN(start);
                        unsigned long end_pfn = PFN_UP(end);

                        if (entry->type == E820_RAM)
                                end_pfn = PFN_UP(entry->addr);

                        if (start_pfn < end_pfn)
                                xen_set_identity_and_release_chunk(
                                        start_pfn, end_pfn, nr_pages,
                                        &released, &identity);

                        start = end;
                }
        }

        if (released)
                printk(KERN_INFO "Released %lu pages of unused memory\n", released);
        if (identity)
                printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);

        return released;
}

static unsigned long __init xen_get_max_pages(void)
{
        unsigned long max_pages = MAX_DOMAIN_PAGES;
        domid_t domid = DOMID_SELF;
        int ret;

        /*
         * For the initial domain we use the maximum reservation as
         * the maximum number of pages.
         *
         * For guest domains the current maximum reservation reflects
         * the current maximum rather than the static maximum.  In this
         * case the e820 map provided to us will cover the static
         * maximum region.
         */
        if (xen_initial_domain()) {
                ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
                if (ret > 0)
                        max_pages = ret;
        }

        return min(max_pages, MAX_DOMAIN_PAGES);
}
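
/*
 * Add one region to the kernel's e820 map.  RAM regions are trimmed
 * inward to whole pages first so that no partially usable page is
 * reported as RAM; other region types are added unchanged.
 */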
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
        u64 end = start + size;

        /* Align RAM regions to page boundaries. */
        if (type == E820_RAM) {
                start = PAGE_ALIGN(start);
                end &= ~((u64)PAGE_SIZE - 1);
        }

        e820_add_region(start, end - start, type);
}

void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
        struct e820entry *entry;
        unsigned int i;

        for (i = 0, entry = list; i < map_size; i++, entry++) {
                if (entry->type == E820_UNUSABLE)
                        entry->type = E820_RAM;
        }
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
        static struct e820entry map[E820MAX] __initdata;

        unsigned long max_pfn = xen_start_info->nr_pages;
        unsigned long long mem_end;
        int rc;
        struct xen_memory_map memmap;
        unsigned long max_pages;
        unsigned long last_pfn = 0;
        unsigned long extra_pages = 0;
        unsigned long populated;
        int i;
        int op;

        max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
        mem_end = PFN_PHYS(max_pfn);

        memmap.nr_entries = E820MAX;
        set_xen_guest_handle(memmap.buffer, map);

        op = xen_initial_domain() ?
                XENMEM_machine_memory_map :
                XENMEM_memory_map;
        rc = HYPERVISOR_memory_op(op, &memmap);
        if (rc == -ENOSYS) {
                BUG_ON(xen_initial_domain());
                memmap.nr_entries = 1;
                map[0].addr = 0ULL;
                map[0].size = mem_end;
                /* 8MB slack (to balance backend allocations). */
                map[0].size += 8ULL << 20;
                map[0].type = E820_RAM;
                rc = 0;
        }
        BUG_ON(rc);

        /*
         * Xen won't allow a 1:1 mapping to be created to UNUSABLE
         * regions, so if we're using the machine memory map leave the
         * region as RAM as it is in the pseudo-physical map.
         *
         * UNUSABLE regions in domUs are not handled and will need
         * a patch in the future.
         */
        if (xen_initial_domain())
                xen_ignore_unusable(map, memmap.nr_entries);

        /* Make sure the Xen-supplied memory map is well-ordered. */
        sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

        max_pages = xen_get_max_pages();
        if (max_pages > max_pfn)
                extra_pages += max_pages - max_pfn;

        /*
         * Set P2M for all non-RAM pages and E820 gaps to be identity
         * type PFNs.  Any RAM pages that would be made inaccessible by
         * this are first released.
         */
        xen_released_pages = xen_set_identity_and_release(
                map, memmap.nr_entries, max_pfn);

        /*
         * Populate back the non-RAM pages and E820 gaps that had been
         * released.
         */
        populated = xen_populate_chunk(map, memmap.nr_entries,
                        max_pfn, &last_pfn, xen_released_pages);

        xen_released_pages -= populated;
        extra_pages += xen_released_pages;

        if (last_pfn > max_pfn) {
                max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
                mem_end = PFN_PHYS(max_pfn);
        }
        /*
         * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
         * factor of the base size.  On non-highmem systems, the base
         * size is the full initial memory allocation; on highmem it
         * is limited to the max size of lowmem, so that it doesn't
         * get completely filled.
         *
         * In principle there could be a problem in lowmem systems if
         * the initial memory is also very large with respect to
         * lowmem, but we won't try to deal with that here.
         */
        extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
                          extra_pages);
        i = 0;
        while (i < memmap.nr_entries) {
                u64 addr = map[i].addr;
                u64 size = map[i].size;
                u32 type = map[i].type;

                if (type == E820_RAM) {
                        if (addr < mem_end) {
                                size = min(size, mem_end - addr);
                        } else if (extra_pages) {
                                size = min(size, (u64)extra_pages * PAGE_SIZE);
                                extra_pages -= size / PAGE_SIZE;
                                xen_add_extra_mem(addr, size);
                        } else
                                type = E820_UNUSABLE;
                }

                xen_align_and_add_e820_region(addr, size, type);

                map[i].addr += size;
                map[i].size -= size;
                if (map[i].size == 0)
                        i++;
        }

        /*
         * In domU, the ISA region is normal, usable memory, but we
         * reserve ISA memory anyway because too many things poke
         * about in there.
         */
        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
                        E820_RESERVED);

        /*
         * Reserve Xen bits:
         *  - mfn_list
         *  - xen_start_info
         * See comment above "struct start_info" in <xen/interface/xen.h>
         * We tried to make the memblock_reserve more selective so
         * that it would be clear what region is reserved.  Sadly we ran
         * into the problem wherein on a 64-bit hypervisor with a 32-bit
         * initial domain, the pt_base has the cr3 value which is not
         * necessarily where the pagetable starts!  As Jan put it: "
         * Actually, the adjustment turns out to be correct: The page
         * tables for a 32-on-64 dom0 get allocated in the order "first L1",
         * "first L2", "first L3", so the offset to the page table base is
         * indeed 2.  When reading xen/include/public/xen.h's comment
         * very strictly, this is not a violation (since there nothing is said
         * that the first thing in the page table space is pointed to by
         * pt_base; I admit that this seems to be implied though, namely
         * do I think that it is implied that the page table space is the
         * range [pt_base, pt_base + nr_pt_frames), whereas that
         * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames),
         * which - without a priori knowledge - the kernel would have
         * difficulty to figure out)." - so let's just fall back to the
         * easy way and reserve the whole region.
         */
        memblock_reserve(__pa(xen_start_info->mfn_list),
                         xen_start_info->pt_base - xen_start_info->mfn_list);

        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

        return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
        u32 *mask;
        mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
        mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
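
/*
 * Register a callback entry point with the hypervisor.  The callback
 * is entered on __KERNEL_CS and, since CALLBACKF_mask_events is set,
 * with event delivery masked.
 */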
static int register_callback(unsigned type, const void *func)
{
        struct callback_register callback = {
                .type = type,
                .address = XEN_CALLBACK(__KERNEL_CS, func),
                .flags = CALLBACKF_mask_events,
        };

        return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

void xen_enable_sysenter(void)
{
        int ret;
        unsigned sysenter_feature;

#ifdef CONFIG_X86_32
        sysenter_feature = X86_FEATURE_SEP;
#else
        sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

        if (!boot_cpu_has(sysenter_feature))
                return;

        ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
        if (ret != 0)
                setup_clear_cpu_cap(sysenter_feature);
}

void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
        int ret;

        ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
        if (ret != 0) {
                printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
                /* Pretty fatal; 64-bit userspace has no other
                   mechanism for syscalls. */
        }

        if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
                ret = register_callback(CALLBACKTYPE_syscall32,
                                        xen_syscall32_target);
                if (ret != 0)
                        setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
        }
#endif /* CONFIG_X86_64 */
}

void __cpuinit xen_enable_nmi(void)
{
#ifdef CONFIG_X86_64
        if (register_callback(CALLBACKTYPE_nmi, nmi))
                BUG();
#endif
}

void __init xen_arch_setup(void)
{
        xen_panic_handler_init();

        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

        if (!xen_feature(XENFEAT_auto_translated_physmap))
                HYPERVISOR_vm_assist(VMASST_CMD_enable,
                                     VMASST_TYPE_pae_extended_cr3);

        if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
            register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
                BUG();

        xen_enable_sysenter();
        xen_enable_syscall();
        xen_enable_nmi();
#ifdef CONFIG_ACPI
        if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
                printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
                disable_acpi();
        }
#endif

        memcpy(boot_command_line, xen_start_info->cmd_line,
               MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
               COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

        /* Set up idle, making sure it calls safe_halt() pvop */
        disable_cpuidle();
        disable_cpufreq();
        WARN_ON(xen_set_default_idle());
        fiddle_vdso();
#ifdef CONFIG_NUMA
        numa_off = 1;
#endif
}