/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>

#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
phys_addr_t xen_extra_mem_start, xen_extra_mem_size;

/*
 * The maximum amount of extra memory compared to the base size. The
 * main scaling factor is the size of struct page. At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

static void __init xen_add_extra_mem(unsigned long pages)
{
	unsigned long pfn;

	u64 size = (u64)pages * PAGE_SIZE;
	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;

	if (!pages)
		return;

	e820_add_region(extra_start, size, E820_RAM);
	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");

	xen_extra_mem_size += size;

	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);

	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}

static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
					      phys_addr_t end_addr)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};
	unsigned long start, end;
	unsigned long len = 0;
	unsigned long pfn;
	int ret;

	start = PFN_UP(start_addr);
	end = PFN_DOWN(end_addr);

	if (end <= start)
		return 0;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		set_xen_guest_handle(reservation.extent_start, &mfn);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					   &reservation);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
		if (ret == 1) {
			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
			len++;
		}
	}
	printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n",
	       start, end, len);

	return len;
}

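/*
 * Walk the e820 map and hand back to the hypervisor any memory that
 * lies in the gaps between e820 regions (above the low 1MB), as well
 * as anything between the end of the last region and max_pfn.
 * Returns the number of pages released.
 */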
static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
						     const struct e820map *e820)
{
	phys_addr_t max_addr = PFN_PHYS(max_pfn);
	phys_addr_t last_end = ISA_END_ADDRESS;
	unsigned long released = 0;
	int i;

	/* Free any unused memory above the low 1Mbyte. */
	for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
		phys_addr_t end = e820->map[i].addr;
		end = min(max_addr, end);

		if (last_end < end)
			released += xen_release_chunk(last_end, end);
		last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
	}

	if (last_end < max_addr)
		released += xen_release_chunk(last_end, max_addr);

	printk(KERN_INFO "released %lu pages of unused memory\n", released);
	return released;
}

static unsigned long __init xen_set_identity(const struct e820entry *list,
					     ssize_t map_size)
{
	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
	phys_addr_t start_pci = last;
	const struct e820entry *entry;
	unsigned long identity = 0;
	int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t start = entry->addr;
		phys_addr_t end = start + entry->size;

		if (start < last)
			start = last;

		if (end <= start)
			continue;

		/* Skip over the 1MB region. */
		if (last > end)
			continue;

		if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
			if (start > start_pci)
				identity += set_phys_range_identity(
						PFN_UP(start_pci), PFN_DOWN(start));

			/* Without saving 'last' we would also gobble RAM
			 * at the end of the loop. */
			last = end;
			start_pci = end;
			continue;
		}
		start_pci = min(start, start_pci);
		last = end;
	}
	if (last > start_pci)
		identity += set_phys_range_identity(
				PFN_UP(start_pci), PFN_DOWN(last));
	return identity;
}

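/*
 * Ask the hypervisor for this domain's maximum reservation, in pages
 * (XENMEM_maximum_reservation). If the hypercall fails, fall back to
 * MAX_DOMAIN_PAGES; either way the result is clamped to
 * MAX_DOMAIN_PAGES.
 */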
static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
	if (ret > 0)
		max_pages = ret;
	return min(max_pages, MAX_DOMAIN_PAGES);
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;
	static struct e820entry map_raw[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long extra_pages = 0;
	unsigned long extra_limit;
	unsigned long identity_pages = 0;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	memcpy(map_raw, map, sizeof(map));
	e820.nr_map = 0;
	xen_extra_mem_start = mem_end;
	for (i = 0; i < memmap.nr_entries; i++) {
		unsigned long long end;

		/* Guard against non-page aligned E820 entries. */
		if (map[i].type == E820_RAM)
			map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;

		end = map[i].addr + map[i].size;
		if (map[i].type == E820_RAM && end > mem_end) {
			/* RAM off the end - may be partially included */
			u64 delta = min(map[i].size, end - mem_end);

			map[i].size -= delta;
			end -= delta;

			extra_pages += PFN_DOWN(delta);
			/*
			 * Mark RAM below 4GB that is not for our use as
			 * unusable. This prevents "System RAM" address space
			 * from being used as potential resource for I/O
			 * address (happens when 'allocate_resource' is
			 * called).
			 */
			if (delta &&
			    (xen_initial_domain() && end < 0x100000000ULL))
				e820_add_region(end, delta, E820_UNUSABLE);
		}

		if (map[i].size > 0 && end > xen_extra_mem_start)
			xen_extra_mem_start = end;

		/* Add region if any remains */
		if (map[i].size > 0)
			e820_add_region(map[i].addr, map[i].size, map[i].type);
	}
	/* Align the balloon area so that max_low_pfn does not get set
	 * to be at the _end_ of the PCI gap at the far end (fee01000).
	 * Note that xen_extra_mem_start gets set in the loop above to be
	 * past the last E820 region. */
	if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
		xen_extra_mem_start = (1ULL<<32);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 *
	 * In Dom0, the host E820 information can leave gaps in the
	 * ISA range, which would cause us to release those pages. To
	 * avoid this, we unconditionally reserve them here.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 */
	memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
				   __pa(xen_start_info->pt_base),
				   "XEN START INFO");

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	extra_limit = xen_get_max_pages();
	if (max_pfn + extra_pages > extra_limit) {
		if (extra_limit > max_pfn)
			extra_pages = extra_limit - max_pfn;
		else
			extra_pages = 0;
	}

	extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size. On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 * (A worked example follows this function.)
	 */
	extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  max_pfn + extra_pages);

	if (extra_limit >= max_pfn)
		extra_pages = extra_limit - max_pfn;
	else
		extra_pages = 0;

	xen_add_extra_mem(extra_pages);

	/*
	 * Set P2M for all non-RAM pages and E820 gaps to be identity
	 * type PFNs. We supply it with the non-sanitized version
	 * of the E820.
	 */
	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
	return "Xen";
}

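/*
 * Worked example for the EXTRA_MEM_RATIO clamp above (illustrative
 * figures, not taken from this file): a 64-bit domU booted with a
 * 2 GiB allocation has max_pfn = 524288 4 KiB pages, and
 * PFN_DOWN(MAXMEM) is far larger, so the first term of the min() is
 * 10 * 524288 = 5242880 pages (20 GiB). Even if the memory map plus
 * released pages would allow more, extra_pages is clamped to
 * 5242880 - 524288 = 4718592 pages (~18 GiB), i.e. the domain can be
 * ballooned up to at most 10x its base allocation.
 */
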
/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *mask;
	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

static int __cpuinit register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

void __cpuinit xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

void __init xen_arch_setup(void)
{
	xen_panic_handler_init();

	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		HYPERVISOR_vm_assist(VMASST_CMD_enable,
				     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
#ifdef CONFIG_X86_32
	boot_cpu_data.hlt_works_ok = 1;
#endif
	disable_cpuidle();
	boot_option_idle_override = IDLE_HALT;

	fiddle_vdso();
}