/*
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 */

#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/memory_hotplug.h>
#include <linux/initrd.h>
#include <linux/cpumask.h>

#include <asm/asm.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/bugs.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
#include <asm/smp.h>

unsigned int __VMALLOC_RESERVE = 128 << 20;

unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;

static noinline int do_test_wp_bit(void);


static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

static int __initdata after_init_bootmem;

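/*
 * Before the bootmem allocator is available, early page tables are taken
 * from a physically contiguous window reserved by find_early_table_space():
 * table_start..table_top bound that window (in page frames) and table_end
 * is the next unused frame in it.  alloc_low_page() simply hands out the
 * next zeroed page from the window and panics if it runs dry.
 */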
static __init void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = __va(pfn * PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}

/*
 * Creates a middle page table and puts a pointer to it in the
 * given global directory entry. This only returns the gd entry
 * in non-PAE compilation mode, since the middle layer is folded.
 */
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
	pud_t *pud;
	pmd_t *pmd_table;

#ifdef CONFIG_X86_PAE
	unsigned long phys;
	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
		if (after_init_bootmem)
			pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
		else
			pmd_table = (pmd_t *)alloc_low_page(&phys);
		paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
		pud = pud_offset(pgd, 0);
		BUG_ON(pmd_table != pmd_offset(pud, 0));
	}
#endif
	pud = pud_offset(pgd, 0);
	pmd_table = pmd_offset(pud, 0);

	return pmd_table;
}

/*
 * Create a page table and place a pointer to it in a middle page
 * directory entry:
 */
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
		pte_t *page_table = NULL;

		if (after_init_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
			if (!page_table)
				page_table =
				(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
		} else {
			unsigned long phys;
			page_table = (pte_t *)alloc_low_page(&phys);
		}

		paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
	}

	return pte_offset_kernel(pmd, 0);
}

/*
 * This function initializes a certain range of kernel virtual memory
 * with new bootmem page tables, everywhere page tables are missing in
 * the given range.
 *
 * NOTE: The pagetables are allocated contiguous on the physical space
 * so we can cache the place of the first one and move around without
 * checking the pgd every time.
 */
static void __init
page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
{
	int pgd_idx, pmd_idx;
	unsigned long vaddr;
	pgd_t *pgd;
	pmd_t *pmd;

	vaddr = start;
	pgd_idx = pgd_index(vaddr);
	pmd_idx = pmd_index(vaddr);
	pgd = pgd_base + pgd_idx;

	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);
		pmd = pmd + pmd_index(vaddr);
		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
		     pmd++, pmd_idx++) {
			one_page_table_init(pmd);

			vaddr += PMD_SIZE;
		}
		pmd_idx = 0;
	}
}

static inline int is_kernel_text(unsigned long addr)
{
	if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
		return 1;
	return 0;
}

/*
 * This maps the physical memory to kernel virtual address space, a total
 * of max_low_pfn pages, by creating page tables starting from address
 * PAGE_OFFSET:
 */
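/*
 * When use_pse is set (and the CPU supports PSE), 2 MB pages (4 MB without
 * PAE) are installed directly in the pmd entries; otherwise, or for ranges
 * the caller wants mapped at page granularity, individual 4 KB ptes are
 * used.  Pages covering kernel text keep execute permission.  The pages_2m
 * and pages_4k counters record how the direct mapping was built.
 */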
static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
						unsigned long start_pfn,
						unsigned long end_pfn,
						int use_pse)
{
	int pgd_idx, pmd_idx, pte_ofs;
	unsigned long pfn;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	unsigned pages_2m = 0, pages_4k = 0;

	if (!cpu_has_pse)
		use_pse = 0;

	pfn = start_pfn;
	pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
	pgd = pgd_base + pgd_idx;
	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);

		if (pfn >= end_pfn)
			continue;
#ifdef CONFIG_X86_PAE
		pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
		pmd += pmd_idx;
#else
		pmd_idx = 0;
#endif
		for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
		     pmd++, pmd_idx++) {
			unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;

			/*
			 * Map with big pages if possible, otherwise
			 * create normal page tables:
			 */
			if (use_pse) {
				unsigned int addr2;
				pgprot_t prot = PAGE_KERNEL_LARGE;

				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
					PAGE_OFFSET + PAGE_SIZE-1;

				if (is_kernel_text(addr) ||
				    is_kernel_text(addr2))
					prot = PAGE_KERNEL_LARGE_EXEC;

				pages_2m++;
				set_pmd(pmd, pfn_pmd(pfn, prot));

				pfn += PTRS_PER_PTE;
				continue;
			}
			pte = one_page_table_init(pmd);

			pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
			pte += pte_ofs;
			for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
			     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
				pgprot_t prot = PAGE_KERNEL;

				if (is_kernel_text(addr))
					prot = PAGE_KERNEL_EXEC;

				pages_4k++;
				set_pte(pte, pfn_pte(pfn, prot));
			}
		}
	}
	update_page_count(PG_LEVEL_2M, pages_2m);
	update_page_count(PG_LEVEL_4K, pages_4k);
}

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}

#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;

static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
{
	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
			vaddr), vaddr), vaddr);
}

static void __init kmap_init(void)
{
	unsigned long kmap_vstart;

	/*
	 * Cache the first kmap pte:
	 */
	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);

	kmap_prot = PAGE_KERNEL;
}

static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
	unsigned long vaddr;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	vaddr = PKMAP_BASE;
	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);

	pgd = swapper_pg_dir + pgd_index(vaddr);
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	pte = pte_offset_kernel(pmd, vaddr);
	pkmap_page_table = pte;
}

static void __init add_one_highpage_init(struct page *page, int pfn)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalhigh_pages++;
}

struct add_highpages_data {
	unsigned long start_pfn;
	unsigned long end_pfn;
};

static int __init add_highpages_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	int node_pfn;
	struct page *page;
	unsigned long final_start_pfn, final_end_pfn;
	struct add_highpages_data *data;

	data = (struct add_highpages_data *)datax;

	final_start_pfn = max(start_pfn, data->start_pfn);
	final_end_pfn = min(end_pfn, data->end_pfn);
	if (final_start_pfn >= final_end_pfn)
		return 0;

	for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
	     node_pfn++) {
		if (!pfn_valid(node_pfn))
			continue;
		page = pfn_to_page(node_pfn);
		add_one_highpage_init(page, node_pfn);
	}

	return 0;

}

void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
					      unsigned long end_pfn)
{
	struct add_highpages_data data;

	data.start_pfn = start_pfn;
	data.end_pfn = end_pfn;

	work_with_active_regions(nid, add_highpages_work_fn, &data);
}

#ifndef CONFIG_NUMA
static void __init set_highmem_pages_init(void)
{
	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);

	totalram_pages += totalhigh_pages;
}
#endif /* !CONFIG_NUMA */

#else
# define kmap_init()				do { } while (0)
# define permanent_kmaps_init(pgd_base)		do { } while (0)
# define set_highmem_pages_init()		do { } while (0)
#endif /* CONFIG_HIGHMEM */

void __init native_pagetable_setup_start(pgd_t *base)
{
	unsigned long pfn, va;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/*
	 * Remove any mappings which extend past the end of physical
	 * memory from the boot time page table:
	 */
	for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
		va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
		pgd = base + pgd_index(va);
		if (!pgd_present(*pgd))
			break;

		pud = pud_offset(pgd, va);
		pmd = pmd_offset(pud, va);
		if (!pmd_present(*pmd))
			break;

		pte = pte_offset_kernel(pmd, va);
		if (!pte_present(*pte))
			break;

		pte_clear(NULL, va, pte);
	}
	paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
}

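/*
 * Nothing is left to do here on bare hardware: the trimmed boot pagetable
 * from native_pagetable_setup_start() is used as-is.  Paravirt backends can
 * override this hook to finish handing over the boot pagetable.
 */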
void __init native_pagetable_setup_done(pgd_t *base)
{
}

/*
 * Build a proper pagetable for the kernel mappings.  Up until this
 * point, we've been running on some set of pagetables constructed by
 * the boot process.
 *
 * If we're booting on native hardware, this will be a pagetable
 * constructed in arch/x86/kernel/head_32.S.  The root of the
 * pagetable will be swapper_pg_dir.
 *
 * If we're booting paravirtualized under a hypervisor, then there are
 * more options: we may already be running PAE, and the pagetable may
 * or may not be based in swapper_pg_dir.  In any case,
 * paravirt_pagetable_setup_start() will set up swapper_pg_dir
 * appropriately for the rest of the initialization to work.
 *
 * In general, pagetable_init() assumes that the pagetable may already
 * be partially populated, and so it avoids stomping on any existing
 * mappings.
 */
static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
{
	unsigned long vaddr, end;

	/*
	 * Fixed mappings, only the page table structure has to be
	 * created - mappings will be set by set_fixmap():
	 */
	early_ioremap_clear();
	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
	page_table_range_init(vaddr, end, pgd_base);
	early_ioremap_reset();
}

static void __init pagetable_init(void)
{
	pgd_t *pgd_base = swapper_pg_dir;

	permanent_kmaps_init(pgd_base);
}

#ifdef CONFIG_ACPI_SLEEP
/*
 * ACPI suspend needs this for resume, because things like the intel-agp
 * driver might have split up a kernel 4MB mapping.
 */
char swsusp_pg_dir[PAGE_SIZE]
	__attribute__ ((aligned(PAGE_SIZE)));

static inline void save_pg_dir(void)
{
	memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
}
#else /* !CONFIG_ACPI_SLEEP */
static inline void save_pg_dir(void)
{
}
#endif /* !CONFIG_ACPI_SLEEP */

void zap_low_mappings(void)
{
	int i;

	/*
	 * Zap initial low-memory mappings.
	 *
	 * Note that "pgd_clear()" doesn't do it for
	 * us, because pgd_clear() is a no-op on i386.
	 */
	for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
#ifdef CONFIG_X86_PAE
		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
		set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
	}
	flush_tlb_all();
}

int nx_enabled;

pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
EXPORT_SYMBOL_GPL(__supported_pte_mask);

#ifdef CONFIG_X86_PAE

static int disable_nx __initdata;

/*
 * noexec = on|off
 *
 * Control non executable mappings.
 *
 * on      Enable
 * off     Disable
 */
static int __init noexec_setup(char *str)
{
	if (!str || !strcmp(str, "on")) {
		if (cpu_has_nx) {
			__supported_pte_mask |= _PAGE_NX;
			disable_nx = 0;
		}
	} else {
		if (!strcmp(str, "off")) {
			disable_nx = 1;
			__supported_pte_mask &= ~_PAGE_NX;
		} else {
			return -EINVAL;
		}
	}

	return 0;
}
early_param("noexec", noexec_setup);

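/*
 * NX support is reported by CPUID leaf 0x80000001 (EDX bit 20) and is
 * switched on through the NX-enable bit in the EFER MSR; set_nx() does
 * exactly that unless "noexec=off" was given on the command line.
 */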
static void __init set_nx(void)
{
	unsigned int v[4], l, h;

	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);

		if ((v[3] & (1 << 20)) && !disable_nx) {
			rdmsr(MSR_EFER, l, h);
			l |= EFER_NX;
			wrmsr(MSR_EFER, l, h);
			nx_enabled = 1;
			__supported_pte_mask |= _PAGE_NX;
		}
	}
}
#endif

/* user-defined highmem size */
static unsigned int highmem_pages = -1;

/*
 * highmem=size forces highmem to be exactly 'size' bytes.
 * This works even on boxes that have no highmem otherwise.
 * This also works to reduce highmem size on bigger boxes.
 */
static int __init parse_highmem(char *arg)
{
	if (!arg)
		return -EINVAL;

	highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
	return 0;
}
early_param("highmem", parse_highmem);

/*
 * Determine low and high memory ranges:
 */
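/*
 * Everything up to MAXMEM_PFN can be kept in the kernel's direct mapping
 * (lowmem); pages above that boundary become highmem.  With the default
 * 3G/1G split and the 128 MB vmalloc reserve, MAXMEM works out to roughly
 * 896 MB.  The highmem= option can shrink lowmem further or force a
 * highmem zone on machines that would not otherwise have one.
 */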
void __init find_low_pfn_range(void)
{
	/* it could update max_pfn */

	/* max_low_pfn is 0, we already have early_res support */

	max_low_pfn = max_pfn;
	if (max_low_pfn > MAXMEM_PFN) {
		if (highmem_pages == -1)
			highmem_pages = max_pfn - MAXMEM_PFN;
		if (highmem_pages + MAXMEM_PFN < max_pfn)
			max_pfn = MAXMEM_PFN + highmem_pages;
		if (highmem_pages + MAXMEM_PFN > max_pfn) {
			printk(KERN_WARNING "only %luMB highmem pages "
				"available, ignoring highmem size of %uMB.\n",
				pages_to_mb(max_pfn - MAXMEM_PFN),
				pages_to_mb(highmem_pages));
			highmem_pages = 0;
		}
		max_low_pfn = MAXMEM_PFN;
#ifndef CONFIG_HIGHMEM
		/* Maximum memory usable is what is directly addressable */
		printk(KERN_WARNING "Warning only %ldMB will be used.\n",
					MAXMEM>>20);
		if (max_pfn > MAX_NONPAE_PFN)
			printk(KERN_WARNING
				"Use a HIGHMEM64G enabled kernel.\n");
		else
			printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
		max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
		if (max_pfn > MAX_NONPAE_PFN) {
			max_pfn = MAX_NONPAE_PFN;
			printk(KERN_WARNING "Warning only 4GB will be used."
				"Use a HIGHMEM64G enabled kernel.\n");
		}
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
	} else {
		if (highmem_pages == -1)
			highmem_pages = 0;
#ifdef CONFIG_HIGHMEM
		if (highmem_pages >= max_pfn) {
			printk(KERN_ERR "highmem size specified (%uMB) is "
				"bigger than pages available (%luMB)!.\n",
				pages_to_mb(highmem_pages),
				pages_to_mb(max_pfn));
			highmem_pages = 0;
		}
		if (highmem_pages) {
			if (max_low_pfn - highmem_pages <
			    64*1024*1024/PAGE_SIZE){
				printk(KERN_ERR "highmem size %uMB results in "
				"smaller than 64MB lowmem, ignoring it.\n"
					, pages_to_mb(highmem_pages));
				highmem_pages = 0;
			}
			max_low_pfn -= highmem_pages;
		}
#else
		if (highmem_pages)
			printk(KERN_ERR "ignoring highmem size on non-highmem"
					" kernel!\n");
#endif
	}
}

#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init initmem_init(unsigned long start_pfn,
			 unsigned long end_pfn)
{
#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > max_low_pfn)
		highstart_pfn = max_low_pfn;
	memory_present(0, 0, highend_pfn);
	e820_register_active_regions(0, 0, highend_pfn);
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
		pages_to_mb(highend_pfn - highstart_pfn));
	num_physpages = highend_pfn;
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
	memory_present(0, 0, max_low_pfn);
	e820_register_active_regions(0, 0, max_low_pfn);
	num_physpages = max_low_pfn;
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
#ifdef CONFIG_FLATMEM
	max_mapnr = num_physpages;
#endif
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(max_low_pfn));

	setup_bootmem_allocator();
}
#endif /* !CONFIG_NEED_MULTIPLE_NODES */

static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] =
		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif

	free_area_init_nodes(max_zone_pfns);
}

void __init setup_bootmem_allocator(void)
{
	int i;
	unsigned long bootmap_size, bootmap;
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");

	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 min_low_pfn, max_low_pfn);
	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
		max_pfn_mapped<<PAGE_SHIFT);
	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
		min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
		bootmap, bootmap + bootmap_size);
	for_each_online_node(i)
		free_bootmem_with_active_regions(i, max_low_pfn);
	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);

	after_init_bootmem = 1;
}

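/*
 * Estimate the worst-case amount of memory needed for the page tables that
 * will direct-map physical memory up to 'end' (pud, pmd and pte pages, plus
 * two pages for the fixmap), and grab a physically contiguous block of that
 * size from the e820 map.  alloc_low_page() later hands out single pages
 * from this block.  As a rough feel for the numbers: with PAE and PSE, each
 * gigabyte of direct mapping costs about 4 KB of pmd entries plus a few
 * pte pages for the non-2MB-aligned head and tail.
 */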
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, ptes, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = PAGE_ALIGN(puds * sizeof(pud_t));

	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += PAGE_ALIGN(pmds * sizeof(pmd_t));

	if (cpu_has_pse) {
		unsigned long extra;

		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
		extra += PMD_SIZE;
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;

	tables += PAGE_ALIGN(ptes * sizeof(pte_t));

	/* for fixmap */
	tables += PAGE_SIZE * 2;

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x7000;
	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
					tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables>>PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

unsigned long __init_refok init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	pgd_t *pgd_base = swapper_pg_dir;
	unsigned long start_pfn, end_pfn;
	unsigned long big_page_start;

	/*
	 * Find space for the kernel direct mapping tables.
	 */
	if (!after_init_bootmem)
		find_early_table_space(end);

#ifdef CONFIG_X86_PAE
	set_nx();
	if (nx_enabled)
		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif

	/* Enable PSE if available */
	if (cpu_has_pse)
		set_in_cr4(X86_CR4_PSE);

	/* Enable PGE if available */
	if (cpu_has_pge) {
		set_in_cr4(X86_CR4_PGE);
		__supported_pte_mask |= _PAGE_GLOBAL;
	}

	/*
	 * Don't use a large page for the first 2/4MB of memory
	 * because there are often fixed size MTRRs in there
	 * and overlapping MTRRs into large pages can cause
	 * slowdowns.
	 */
	big_page_start = PMD_SIZE;

	if (start < big_page_start) {
		start_pfn = start >> PAGE_SHIFT;
		end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
	} else {
		/* head is not big page alignment ? */
		start_pfn = start >> PAGE_SHIFT;
		end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
				 << (PMD_SHIFT - PAGE_SHIFT);
	}
	if (start_pfn < end_pfn)
		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);

	/* big page range */
	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
			 << (PMD_SHIFT - PAGE_SHIFT);
	if (start_pfn < (big_page_start >> PAGE_SHIFT))
		start_pfn = big_page_start >> PAGE_SHIFT;
	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn)
		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
					     cpu_has_pse);

	/* tail is not big page alignment ? */
	start_pfn = end_pfn;
	if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
		end_pfn = end >> PAGE_SHIFT;
		if (start_pfn < end_pfn)
			kernel_physical_mapping_init(pgd_base, start_pfn,
						     end_pfn, 0);
	}

	early_ioremap_page_table_range_init(pgd_base);

	load_cr3(swapper_pg_dir);

	__flush_tlb_all();

	if (!after_init_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_init_bootmem)
		early_memtest(start, end);

	return end >> PAGE_SHIFT;
}


/*
 * paging_init() sets up the page tables - note that the first 8MB are
 * already mapped by head.S.
 *
 * This routine also unmaps the page at virtual kernel address 0, so
 * that we can trap those pesky NULL-reference errors in the kernel.
 */
void __init paging_init(void)
{
	pagetable_init();

	__flush_tlb_all();

	kmap_init();

	/*
	 * NOTE: at this point the bootmem allocator is fully available.
	 */
	sparse_init();
	zone_sizes_init();
}

/*
 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
 * and also on some strange 486's. All 586+'s are OK. This used to involve
 * black magic jumps to work around some nasty CPU bugs, but fortunately the
 * switch to using exceptions got rid of all that.
 */
static void __init test_wp_bit(void)
{
	printk(KERN_INFO
  "Checking if this processor honours the WP bit even in supervisor mode...");

	/* Any page-aligned address will do, the test is non-destructive */
	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
	boot_cpu_data.wp_works_ok = do_test_wp_bit();
	clear_fixmap(FIX_WP_TEST);

	if (!boot_cpu_data.wp_works_ok) {
		printk(KERN_CONT "No.\n");
#ifdef CONFIG_X86_WP_WORKS_OK
		panic(
  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
#endif
	} else {
		printk(KERN_CONT "Ok.\n");
	}
}

static struct kcore_list kcore_mem, kcore_vmalloc;

void __init mem_init(void)
{
	int codesize, reservedpages, datasize, initsize;
	int tmp;

#ifdef CONFIG_FLATMEM
	BUG_ON(!mem_map);
#endif
	/* this will put all low memory onto the freelists */
	totalram_pages += free_all_bootmem();

	reservedpages = 0;
	for (tmp = 0; tmp < max_low_pfn; tmp++)
		/*
		 * Only count reserved RAM pages:
		 */
		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
			reservedpages++;

	set_highmem_pages_init();

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
			"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		num_physpages << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10,
		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
	       );

	printk(KERN_INFO "virtual kernel memory layout:\n"
		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
#endif
		"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
		"    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
		"      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
		"      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
		"      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
		FIXADDR_START, FIXADDR_TOP,
		(FIXADDR_TOP - FIXADDR_START) >> 10,

#ifdef CONFIG_HIGHMEM
		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
		(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif

		VMALLOC_START, VMALLOC_END,
		(VMALLOC_END - VMALLOC_START) >> 20,

		(unsigned long)__va(0), (unsigned long)high_memory,
		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

		(unsigned long)&__init_begin, (unsigned long)&__init_end,
		((unsigned long)&__init_end -
		 (unsigned long)&__init_begin) >> 10,

		(unsigned long)&_etext, (unsigned long)&_edata,
		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

		(unsigned long)&_text, (unsigned long)&_etext,
		((unsigned long)&_etext - (unsigned long)&_text) >> 10);

#ifdef CONFIG_HIGHMEM
	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
	BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
	BUG_ON(VMALLOC_START > VMALLOC_END);
	BUG_ON((unsigned long)high_memory > VMALLOC_START);

	if (boot_cpu_data.wp_works_ok < 0)
		test_wp_bit();

	cpa_init();
	save_pg_dir();
	zap_low_mappings();
}

#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdata = NODE_DATA(nid);
	struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	return __add_pages(zone, start_pfn, nr_pages);
}
#endif

/*
 * This function cannot be __init, since exceptions don't work in that
 * section.  Put this after the callers, so that it cannot be inlined.
 */
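/*
 * The test writes through the read-only FIX_WP_TEST mapping set up by the
 * caller.  If the CPU honours WP in supervisor mode, the write faults, the
 * exception table entry skips the "xorl", and flag stays 1; otherwise the
 * write goes through and flag is cleared to 0.
 */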
static noinline int do_test_wp_bit(void)
{
	char tmp_reg;
	int flag;

	__asm__ __volatile__(
		"	movb %0, %1	\n"
		"1:	movb %1, %0	\n"
		"	xorl %2, %2	\n"
		"2:			\n"
		_ASM_EXTABLE(1b,2b)
		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
		 "=q" (tmp_reg),
		 "=r" (flag)
		:"2" (1)
		:"memory");

	return flag;
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long size = PFN_ALIGN(_etext) - start;

#ifndef CONFIG_DYNAMIC_FTRACE
	/* Dynamic tracing modifies the kernel text section */
	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
	printk(KERN_INFO "Write protecting the kernel text: %luk\n",
		size >> 10);

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
		start, start+size);
	set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: write protecting again\n");
	set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
#endif /* CONFIG_DYNAMIC_FTRACE */

	start += size;
	size = (unsigned long)__end_rodata - start;
	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
		size >> 10);
	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: write protecting again\n");
	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
}
#endif

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
#ifdef CONFIG_DEBUG_PAGEALLOC
	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	unsigned long addr;

	/*
	 * We just marked the kernel text read only above, now that
	 * we are going to free part of that, we need to make that
	 * writeable first.
	 */
	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);

	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
	return reserve_bootmem(phys, len, flags);
}