1 /* 2 * linux/arch/x86_64/mm/init.c 3 * 4 * Copyright (C) 1995 Linus Torvalds 5 * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> 6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> 7 */ 8 9 #include <linux/signal.h> 10 #include <linux/sched.h> 11 #include <linux/kernel.h> 12 #include <linux/errno.h> 13 #include <linux/string.h> 14 #include <linux/types.h> 15 #include <linux/ptrace.h> 16 #include <linux/mman.h> 17 #include <linux/mm.h> 18 #include <linux/swap.h> 19 #include <linux/smp.h> 20 #include <linux/init.h> 21 #include <linux/initrd.h> 22 #include <linux/pagemap.h> 23 #include <linux/bootmem.h> 24 #include <linux/memblock.h> 25 #include <linux/proc_fs.h> 26 #include <linux/pci.h> 27 #include <linux/pfn.h> 28 #include <linux/poison.h> 29 #include <linux/dma-mapping.h> 30 #include <linux/module.h> 31 #include <linux/memory.h> 32 #include <linux/memory_hotplug.h> 33 #include <linux/memremap.h> 34 #include <linux/nmi.h> 35 #include <linux/gfp.h> 36 #include <linux/kcore.h> 37 38 #include <asm/processor.h> 39 #include <asm/bios_ebda.h> 40 #include <asm/uaccess.h> 41 #include <asm/pgtable.h> 42 #include <asm/pgalloc.h> 43 #include <asm/dma.h> 44 #include <asm/fixmap.h> 45 #include <asm/e820.h> 46 #include <asm/apic.h> 47 #include <asm/tlb.h> 48 #include <asm/mmu_context.h> 49 #include <asm/proto.h> 50 #include <asm/smp.h> 51 #include <asm/sections.h> 52 #include <asm/kdebug.h> 53 #include <asm/numa.h> 54 #include <asm/cacheflush.h> 55 #include <asm/init.h> 56 #include <asm/uv/uv.h> 57 #include <asm/setup.h> 58 59 #include "mm_internal.h" 60 61 static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, 62 unsigned long addr, unsigned long end) 63 { 64 addr &= PMD_MASK; 65 for (; addr < end; addr += PMD_SIZE) { 66 pmd_t *pmd = pmd_page + pmd_index(addr); 67 68 if (!pmd_present(*pmd)) 69 set_pmd(pmd, __pmd(addr | pmd_flag)); 70 } 71 } 72 static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, 73 unsigned long addr, unsigned long end) 74 { 75 unsigned long next; 76 77 for (; addr < end; addr = next) { 78 pud_t *pud = pud_page + pud_index(addr); 79 pmd_t *pmd; 80 81 next = (addr & PUD_MASK) + PUD_SIZE; 82 if (next > end) 83 next = end; 84 85 if (pud_present(*pud)) { 86 pmd = pmd_offset(pud, 0); 87 ident_pmd_init(info->pmd_flag, pmd, addr, next); 88 continue; 89 } 90 pmd = (pmd_t *)info->alloc_pgt_page(info->context); 91 if (!pmd) 92 return -ENOMEM; 93 ident_pmd_init(info->pmd_flag, pmd, addr, next); 94 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 95 } 96 97 return 0; 98 } 99 100 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, 101 unsigned long addr, unsigned long end) 102 { 103 unsigned long next; 104 int result; 105 int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; 106 107 for (; addr < end; addr = next) { 108 pgd_t *pgd = pgd_page + pgd_index(addr) + off; 109 pud_t *pud; 110 111 next = (addr & PGDIR_MASK) + PGDIR_SIZE; 112 if (next > end) 113 next = end; 114 115 if (pgd_present(*pgd)) { 116 pud = pud_offset(pgd, 0); 117 result = ident_pud_init(info, pud, addr, next); 118 if (result) 119 return result; 120 continue; 121 } 122 123 pud = (pud_t *)info->alloc_pgt_page(info->context); 124 if (!pud) 125 return -ENOMEM; 126 result = ident_pud_init(info, pud, addr, next); 127 if (result) 128 return result; 129 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 130 } 131 132 return 0; 133 } 134 135 /* 136 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the 137 * physical space so we can cache the place of the first one and move 138 * around without checking the pgd every time. 139 */ 140 141 pteval_t __supported_pte_mask __read_mostly = ~0; 142 EXPORT_SYMBOL_GPL(__supported_pte_mask); 143 144 int force_personality32; 145 146 /* 147 * noexec32=on|off 148 * Control non executable heap for 32bit processes. 149 * To control the stack too use noexec=off 150 * 151 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) 152 * off PROT_READ implies PROT_EXEC 153 */ 154 static int __init nonx32_setup(char *str) 155 { 156 if (!strcmp(str, "on")) 157 force_personality32 &= ~READ_IMPLIES_EXEC; 158 else if (!strcmp(str, "off")) 159 force_personality32 |= READ_IMPLIES_EXEC; 160 return 1; 161 } 162 __setup("noexec32=", nonx32_setup); 163 164 /* 165 * When memory was added/removed make sure all the processes MM have 166 * suitable PGD entries in the local PGD level page. 167 */ 168 void sync_global_pgds(unsigned long start, unsigned long end, int removed) 169 { 170 unsigned long address; 171 172 for (address = start; address <= end; address += PGDIR_SIZE) { 173 const pgd_t *pgd_ref = pgd_offset_k(address); 174 struct page *page; 175 176 /* 177 * When it is called after memory hot remove, pgd_none() 178 * returns true. In this case (removed == 1), we must clear 179 * the PGD entries in the local PGD level page. 180 */ 181 if (pgd_none(*pgd_ref) && !removed) 182 continue; 183 184 spin_lock(&pgd_lock); 185 list_for_each_entry(page, &pgd_list, lru) { 186 pgd_t *pgd; 187 spinlock_t *pgt_lock; 188 189 pgd = (pgd_t *)page_address(page) + pgd_index(address); 190 /* the pgt_lock only for Xen */ 191 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 192 spin_lock(pgt_lock); 193 194 if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) 195 BUG_ON(pgd_page_vaddr(*pgd) 196 != pgd_page_vaddr(*pgd_ref)); 197 198 if (removed) { 199 if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) 200 pgd_clear(pgd); 201 } else { 202 if (pgd_none(*pgd)) 203 set_pgd(pgd, *pgd_ref); 204 } 205 206 spin_unlock(pgt_lock); 207 } 208 spin_unlock(&pgd_lock); 209 } 210 } 211 212 /* 213 * NOTE: This function is marked __ref because it calls __init function 214 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 215 */ 216 static __ref void *spp_getpage(void) 217 { 218 void *ptr; 219 220 if (after_bootmem) 221 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); 222 else 223 ptr = alloc_bootmem_pages(PAGE_SIZE); 224 225 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { 226 panic("set_pte_phys: cannot allocate page data %s\n", 227 after_bootmem ? "after bootmem" : ""); 228 } 229 230 pr_debug("spp_getpage %p\n", ptr); 231 232 return ptr; 233 } 234 235 static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) 236 { 237 if (pgd_none(*pgd)) { 238 pud_t *pud = (pud_t *)spp_getpage(); 239 pgd_populate(&init_mm, pgd, pud); 240 if (pud != pud_offset(pgd, 0)) 241 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", 242 pud, pud_offset(pgd, 0)); 243 } 244 return pud_offset(pgd, vaddr); 245 } 246 247 static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) 248 { 249 if (pud_none(*pud)) { 250 pmd_t *pmd = (pmd_t *) spp_getpage(); 251 pud_populate(&init_mm, pud, pmd); 252 if (pmd != pmd_offset(pud, 0)) 253 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 254 pmd, pmd_offset(pud, 0)); 255 } 256 return pmd_offset(pud, vaddr); 257 } 258 259 static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) 260 { 261 if (pmd_none(*pmd)) { 262 pte_t *pte = (pte_t *) spp_getpage(); 263 pmd_populate_kernel(&init_mm, pmd, pte); 264 if (pte != pte_offset_kernel(pmd, 0)) 265 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 266 } 267 return pte_offset_kernel(pmd, vaddr); 268 } 269 270 void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) 271 { 272 pud_t *pud; 273 pmd_t *pmd; 274 pte_t *pte; 275 276 pud = pud_page + pud_index(vaddr); 277 pmd = fill_pmd(pud, vaddr); 278 pte = fill_pte(pmd, vaddr); 279 280 set_pte(pte, new_pte); 281 282 /* 283 * It's enough to flush this one mapping. 284 * (PGE mappings get flushed as well) 285 */ 286 __flush_tlb_one(vaddr); 287 } 288 289 void set_pte_vaddr(unsigned long vaddr, pte_t pteval) 290 { 291 pgd_t *pgd; 292 pud_t *pud_page; 293 294 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); 295 296 pgd = pgd_offset_k(vaddr); 297 if (pgd_none(*pgd)) { 298 printk(KERN_ERR 299 "PGD FIXMAP MISSING, it should be setup in head.S!\n"); 300 return; 301 } 302 pud_page = (pud_t*)pgd_page_vaddr(*pgd); 303 set_pte_vaddr_pud(pud_page, vaddr, pteval); 304 } 305 306 pmd_t * __init populate_extra_pmd(unsigned long vaddr) 307 { 308 pgd_t *pgd; 309 pud_t *pud; 310 311 pgd = pgd_offset_k(vaddr); 312 pud = fill_pud(pgd, vaddr); 313 return fill_pmd(pud, vaddr); 314 } 315 316 pte_t * __init populate_extra_pte(unsigned long vaddr) 317 { 318 pmd_t *pmd; 319 320 pmd = populate_extra_pmd(vaddr); 321 return fill_pte(pmd, vaddr); 322 } 323 324 /* 325 * Create large page table mappings for a range of physical addresses. 326 */ 327 static void __init __init_extra_mapping(unsigned long phys, unsigned long size, 328 enum page_cache_mode cache) 329 { 330 pgd_t *pgd; 331 pud_t *pud; 332 pmd_t *pmd; 333 pgprot_t prot; 334 335 pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | 336 pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); 337 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); 338 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { 339 pgd = pgd_offset_k((unsigned long)__va(phys)); 340 if (pgd_none(*pgd)) { 341 pud = (pud_t *) spp_getpage(); 342 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | 343 _PAGE_USER)); 344 } 345 pud = pud_offset(pgd, (unsigned long)__va(phys)); 346 if (pud_none(*pud)) { 347 pmd = (pmd_t *) spp_getpage(); 348 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | 349 _PAGE_USER)); 350 } 351 pmd = pmd_offset(pud, phys); 352 BUG_ON(!pmd_none(*pmd)); 353 set_pmd(pmd, __pmd(phys | pgprot_val(prot))); 354 } 355 } 356 357 void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) 358 { 359 __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB); 360 } 361 362 void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) 363 { 364 __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC); 365 } 366 367 /* 368 * The head.S code sets up the kernel high mapping: 369 * 370 * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) 371 * 372 * phys_base holds the negative offset to the kernel, which is added 373 * to the compile time generated pmds. This results in invalid pmds up 374 * to the point where we hit the physaddr 0 mapping. 375 * 376 * We limit the mappings to the region from _text to _brk_end. _brk_end 377 * is rounded up to the 2MB boundary. This catches the invalid pmds as 378 * well, as they are located before _text: 379 */ 380 void __init cleanup_highmap(void) 381 { 382 unsigned long vaddr = __START_KERNEL_map; 383 unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; 384 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 385 pmd_t *pmd = level2_kernel_pgt; 386 387 /* 388 * Native path, max_pfn_mapped is not set yet. 389 * Xen has valid max_pfn_mapped set in 390 * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). 391 */ 392 if (max_pfn_mapped) 393 vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); 394 395 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { 396 if (pmd_none(*pmd)) 397 continue; 398 if (vaddr < (unsigned long) _text || vaddr > end) 399 set_pmd(pmd, __pmd(0)); 400 } 401 } 402 403 static unsigned long __meminit 404 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, 405 pgprot_t prot) 406 { 407 unsigned long pages = 0, next; 408 unsigned long last_map_addr = end; 409 int i; 410 411 pte_t *pte = pte_page + pte_index(addr); 412 413 for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { 414 next = (addr & PAGE_MASK) + PAGE_SIZE; 415 if (addr >= end) { 416 if (!after_bootmem && 417 !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && 418 !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) 419 set_pte(pte, __pte(0)); 420 continue; 421 } 422 423 /* 424 * We will re-use the existing mapping. 425 * Xen for example has some special requirements, like mapping 426 * pagetable pages as RO. So assume someone who pre-setup 427 * these mappings are more intelligent. 428 */ 429 if (pte_val(*pte)) { 430 if (!after_bootmem) 431 pages++; 432 continue; 433 } 434 435 if (0) 436 printk(" pte=%p addr=%lx pte=%016lx\n", 437 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 438 pages++; 439 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); 440 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; 441 } 442 443 update_page_count(PG_LEVEL_4K, pages); 444 445 return last_map_addr; 446 } 447 448 static unsigned long __meminit 449 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 450 unsigned long page_size_mask, pgprot_t prot) 451 { 452 unsigned long pages = 0, next; 453 unsigned long last_map_addr = end; 454 455 int i = pmd_index(address); 456 457 for (; i < PTRS_PER_PMD; i++, address = next) { 458 pmd_t *pmd = pmd_page + pmd_index(address); 459 pte_t *pte; 460 pgprot_t new_prot = prot; 461 462 next = (address & PMD_MASK) + PMD_SIZE; 463 if (address >= end) { 464 if (!after_bootmem && 465 !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && 466 !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) 467 set_pmd(pmd, __pmd(0)); 468 continue; 469 } 470 471 if (pmd_val(*pmd)) { 472 if (!pmd_large(*pmd)) { 473 spin_lock(&init_mm.page_table_lock); 474 pte = (pte_t *)pmd_page_vaddr(*pmd); 475 last_map_addr = phys_pte_init(pte, address, 476 end, prot); 477 spin_unlock(&init_mm.page_table_lock); 478 continue; 479 } 480 /* 481 * If we are ok with PG_LEVEL_2M mapping, then we will 482 * use the existing mapping, 483 * 484 * Otherwise, we will split the large page mapping but 485 * use the same existing protection bits except for 486 * large page, so that we don't violate Intel's TLB 487 * Application note (317080) which says, while changing 488 * the page sizes, new and old translations should 489 * not differ with respect to page frame and 490 * attributes. 491 */ 492 if (page_size_mask & (1 << PG_LEVEL_2M)) { 493 if (!after_bootmem) 494 pages++; 495 last_map_addr = next; 496 continue; 497 } 498 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); 499 } 500 501 if (page_size_mask & (1<<PG_LEVEL_2M)) { 502 pages++; 503 spin_lock(&init_mm.page_table_lock); 504 set_pte((pte_t *)pmd, 505 pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, 506 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 507 spin_unlock(&init_mm.page_table_lock); 508 last_map_addr = next; 509 continue; 510 } 511 512 pte = alloc_low_page(); 513 last_map_addr = phys_pte_init(pte, address, end, new_prot); 514 515 spin_lock(&init_mm.page_table_lock); 516 pmd_populate_kernel(&init_mm, pmd, pte); 517 spin_unlock(&init_mm.page_table_lock); 518 } 519 update_page_count(PG_LEVEL_2M, pages); 520 return last_map_addr; 521 } 522 523 static unsigned long __meminit 524 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 525 unsigned long page_size_mask) 526 { 527 unsigned long pages = 0, next; 528 unsigned long last_map_addr = end; 529 int i = pud_index(addr); 530 531 for (; i < PTRS_PER_PUD; i++, addr = next) { 532 pud_t *pud = pud_page + pud_index(addr); 533 pmd_t *pmd; 534 pgprot_t prot = PAGE_KERNEL; 535 536 next = (addr & PUD_MASK) + PUD_SIZE; 537 if (addr >= end) { 538 if (!after_bootmem && 539 !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && 540 !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) 541 set_pud(pud, __pud(0)); 542 continue; 543 } 544 545 if (pud_val(*pud)) { 546 if (!pud_large(*pud)) { 547 pmd = pmd_offset(pud, 0); 548 last_map_addr = phys_pmd_init(pmd, addr, end, 549 page_size_mask, prot); 550 __flush_tlb_all(); 551 continue; 552 } 553 /* 554 * If we are ok with PG_LEVEL_1G mapping, then we will 555 * use the existing mapping. 556 * 557 * Otherwise, we will split the gbpage mapping but use 558 * the same existing protection bits except for large 559 * page, so that we don't violate Intel's TLB 560 * Application note (317080) which says, while changing 561 * the page sizes, new and old translations should 562 * not differ with respect to page frame and 563 * attributes. 564 */ 565 if (page_size_mask & (1 << PG_LEVEL_1G)) { 566 if (!after_bootmem) 567 pages++; 568 last_map_addr = next; 569 continue; 570 } 571 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); 572 } 573 574 if (page_size_mask & (1<<PG_LEVEL_1G)) { 575 pages++; 576 spin_lock(&init_mm.page_table_lock); 577 set_pte((pte_t *)pud, 578 pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, 579 PAGE_KERNEL_LARGE)); 580 spin_unlock(&init_mm.page_table_lock); 581 last_map_addr = next; 582 continue; 583 } 584 585 pmd = alloc_low_page(); 586 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, 587 prot); 588 589 spin_lock(&init_mm.page_table_lock); 590 pud_populate(&init_mm, pud, pmd); 591 spin_unlock(&init_mm.page_table_lock); 592 } 593 __flush_tlb_all(); 594 595 update_page_count(PG_LEVEL_1G, pages); 596 597 return last_map_addr; 598 } 599 600 unsigned long __meminit 601 kernel_physical_mapping_init(unsigned long start, 602 unsigned long end, 603 unsigned long page_size_mask) 604 { 605 bool pgd_changed = false; 606 unsigned long next, last_map_addr = end; 607 unsigned long addr; 608 609 start = (unsigned long)__va(start); 610 end = (unsigned long)__va(end); 611 addr = start; 612 613 for (; start < end; start = next) { 614 pgd_t *pgd = pgd_offset_k(start); 615 pud_t *pud; 616 617 next = (start & PGDIR_MASK) + PGDIR_SIZE; 618 619 if (pgd_val(*pgd)) { 620 pud = (pud_t *)pgd_page_vaddr(*pgd); 621 last_map_addr = phys_pud_init(pud, __pa(start), 622 __pa(end), page_size_mask); 623 continue; 624 } 625 626 pud = alloc_low_page(); 627 last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), 628 page_size_mask); 629 630 spin_lock(&init_mm.page_table_lock); 631 pgd_populate(&init_mm, pgd, pud); 632 spin_unlock(&init_mm.page_table_lock); 633 pgd_changed = true; 634 } 635 636 if (pgd_changed) 637 sync_global_pgds(addr, end - 1, 0); 638 639 __flush_tlb_all(); 640 641 return last_map_addr; 642 } 643 644 #ifndef CONFIG_NUMA 645 void __init initmem_init(void) 646 { 647 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); 648 } 649 #endif 650 651 void __init paging_init(void) 652 { 653 sparse_memory_present_with_active_regions(MAX_NUMNODES); 654 sparse_init(); 655 656 /* 657 * clear the default setting with node 0 658 * note: don't use nodes_clear here, that is really clearing when 659 * numa support is not compiled in, and later node_set_state 660 * will not set it back. 661 */ 662 node_clear_state(0, N_MEMORY); 663 if (N_MEMORY != N_NORMAL_MEMORY) 664 node_clear_state(0, N_NORMAL_MEMORY); 665 666 zone_sizes_init(); 667 } 668 669 /* 670 * Memory hotplug specific functions 671 */ 672 #ifdef CONFIG_MEMORY_HOTPLUG 673 /* 674 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need 675 * updating. 676 */ 677 static void update_end_of_memory_vars(u64 start, u64 size) 678 { 679 unsigned long end_pfn = PFN_UP(start + size); 680 681 if (end_pfn > max_pfn) { 682 max_pfn = end_pfn; 683 max_low_pfn = end_pfn; 684 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 685 } 686 } 687 688 /* 689 * Memory is added always to NORMAL zone. This means you will never get 690 * additional DMA/DMA32 memory. 691 */ 692 int arch_add_memory(int nid, u64 start, u64 size, bool for_device) 693 { 694 struct pglist_data *pgdat = NODE_DATA(nid); 695 struct zone *zone = pgdat->node_zones + 696 zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); 697 unsigned long start_pfn = start >> PAGE_SHIFT; 698 unsigned long nr_pages = size >> PAGE_SHIFT; 699 int ret; 700 701 init_memory_mapping(start, start + size); 702 703 ret = __add_pages(nid, zone, start_pfn, nr_pages); 704 WARN_ON_ONCE(ret); 705 706 /* update max_pfn, max_low_pfn and high_memory */ 707 update_end_of_memory_vars(start, size); 708 709 return ret; 710 } 711 EXPORT_SYMBOL_GPL(arch_add_memory); 712 713 #define PAGE_INUSE 0xFD 714 715 static void __meminit free_pagetable(struct page *page, int order) 716 { 717 unsigned long magic; 718 unsigned int nr_pages = 1 << order; 719 struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page); 720 721 if (altmap) { 722 vmem_altmap_free(altmap, nr_pages); 723 return; 724 } 725 726 /* bootmem page has reserved flag */ 727 if (PageReserved(page)) { 728 __ClearPageReserved(page); 729 730 magic = (unsigned long)page->lru.next; 731 if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { 732 while (nr_pages--) 733 put_page_bootmem(page++); 734 } else 735 while (nr_pages--) 736 free_reserved_page(page++); 737 } else 738 free_pages((unsigned long)page_address(page), order); 739 } 740 741 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) 742 { 743 pte_t *pte; 744 int i; 745 746 for (i = 0; i < PTRS_PER_PTE; i++) { 747 pte = pte_start + i; 748 if (pte_val(*pte)) 749 return; 750 } 751 752 /* free a pte talbe */ 753 free_pagetable(pmd_page(*pmd), 0); 754 spin_lock(&init_mm.page_table_lock); 755 pmd_clear(pmd); 756 spin_unlock(&init_mm.page_table_lock); 757 } 758 759 static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) 760 { 761 pmd_t *pmd; 762 int i; 763 764 for (i = 0; i < PTRS_PER_PMD; i++) { 765 pmd = pmd_start + i; 766 if (pmd_val(*pmd)) 767 return; 768 } 769 770 /* free a pmd talbe */ 771 free_pagetable(pud_page(*pud), 0); 772 spin_lock(&init_mm.page_table_lock); 773 pud_clear(pud); 774 spin_unlock(&init_mm.page_table_lock); 775 } 776 777 /* Return true if pgd is changed, otherwise return false. */ 778 static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) 779 { 780 pud_t *pud; 781 int i; 782 783 for (i = 0; i < PTRS_PER_PUD; i++) { 784 pud = pud_start + i; 785 if (pud_val(*pud)) 786 return false; 787 } 788 789 /* free a pud table */ 790 free_pagetable(pgd_page(*pgd), 0); 791 spin_lock(&init_mm.page_table_lock); 792 pgd_clear(pgd); 793 spin_unlock(&init_mm.page_table_lock); 794 795 return true; 796 } 797 798 static void __meminit 799 remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, 800 bool direct) 801 { 802 unsigned long next, pages = 0; 803 pte_t *pte; 804 void *page_addr; 805 phys_addr_t phys_addr; 806 807 pte = pte_start + pte_index(addr); 808 for (; addr < end; addr = next, pte++) { 809 next = (addr + PAGE_SIZE) & PAGE_MASK; 810 if (next > end) 811 next = end; 812 813 if (!pte_present(*pte)) 814 continue; 815 816 /* 817 * We mapped [0,1G) memory as identity mapping when 818 * initializing, in arch/x86/kernel/head_64.S. These 819 * pagetables cannot be removed. 820 */ 821 phys_addr = pte_val(*pte) + (addr & PAGE_MASK); 822 if (phys_addr < (phys_addr_t)0x40000000) 823 return; 824 825 if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { 826 /* 827 * Do not free direct mapping pages since they were 828 * freed when offlining, or simplely not in use. 829 */ 830 if (!direct) 831 free_pagetable(pte_page(*pte), 0); 832 833 spin_lock(&init_mm.page_table_lock); 834 pte_clear(&init_mm, addr, pte); 835 spin_unlock(&init_mm.page_table_lock); 836 837 /* For non-direct mapping, pages means nothing. */ 838 pages++; 839 } else { 840 /* 841 * If we are here, we are freeing vmemmap pages since 842 * direct mapped memory ranges to be freed are aligned. 843 * 844 * If we are not removing the whole page, it means 845 * other page structs in this page are being used and 846 * we canot remove them. So fill the unused page_structs 847 * with 0xFD, and remove the page when it is wholly 848 * filled with 0xFD. 849 */ 850 memset((void *)addr, PAGE_INUSE, next - addr); 851 852 page_addr = page_address(pte_page(*pte)); 853 if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { 854 free_pagetable(pte_page(*pte), 0); 855 856 spin_lock(&init_mm.page_table_lock); 857 pte_clear(&init_mm, addr, pte); 858 spin_unlock(&init_mm.page_table_lock); 859 } 860 } 861 } 862 863 /* Call free_pte_table() in remove_pmd_table(). */ 864 flush_tlb_all(); 865 if (direct) 866 update_page_count(PG_LEVEL_4K, -pages); 867 } 868 869 static void __meminit 870 remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, 871 bool direct) 872 { 873 unsigned long next, pages = 0; 874 pte_t *pte_base; 875 pmd_t *pmd; 876 void *page_addr; 877 878 pmd = pmd_start + pmd_index(addr); 879 for (; addr < end; addr = next, pmd++) { 880 next = pmd_addr_end(addr, end); 881 882 if (!pmd_present(*pmd)) 883 continue; 884 885 if (pmd_large(*pmd)) { 886 if (IS_ALIGNED(addr, PMD_SIZE) && 887 IS_ALIGNED(next, PMD_SIZE)) { 888 if (!direct) 889 free_pagetable(pmd_page(*pmd), 890 get_order(PMD_SIZE)); 891 892 spin_lock(&init_mm.page_table_lock); 893 pmd_clear(pmd); 894 spin_unlock(&init_mm.page_table_lock); 895 pages++; 896 } else { 897 /* If here, we are freeing vmemmap pages. */ 898 memset((void *)addr, PAGE_INUSE, next - addr); 899 900 page_addr = page_address(pmd_page(*pmd)); 901 if (!memchr_inv(page_addr, PAGE_INUSE, 902 PMD_SIZE)) { 903 free_pagetable(pmd_page(*pmd), 904 get_order(PMD_SIZE)); 905 906 spin_lock(&init_mm.page_table_lock); 907 pmd_clear(pmd); 908 spin_unlock(&init_mm.page_table_lock); 909 } 910 } 911 912 continue; 913 } 914 915 pte_base = (pte_t *)pmd_page_vaddr(*pmd); 916 remove_pte_table(pte_base, addr, next, direct); 917 free_pte_table(pte_base, pmd); 918 } 919 920 /* Call free_pmd_table() in remove_pud_table(). */ 921 if (direct) 922 update_page_count(PG_LEVEL_2M, -pages); 923 } 924 925 static void __meminit 926 remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, 927 bool direct) 928 { 929 unsigned long next, pages = 0; 930 pmd_t *pmd_base; 931 pud_t *pud; 932 void *page_addr; 933 934 pud = pud_start + pud_index(addr); 935 for (; addr < end; addr = next, pud++) { 936 next = pud_addr_end(addr, end); 937 938 if (!pud_present(*pud)) 939 continue; 940 941 if (pud_large(*pud)) { 942 if (IS_ALIGNED(addr, PUD_SIZE) && 943 IS_ALIGNED(next, PUD_SIZE)) { 944 if (!direct) 945 free_pagetable(pud_page(*pud), 946 get_order(PUD_SIZE)); 947 948 spin_lock(&init_mm.page_table_lock); 949 pud_clear(pud); 950 spin_unlock(&init_mm.page_table_lock); 951 pages++; 952 } else { 953 /* If here, we are freeing vmemmap pages. */ 954 memset((void *)addr, PAGE_INUSE, next - addr); 955 956 page_addr = page_address(pud_page(*pud)); 957 if (!memchr_inv(page_addr, PAGE_INUSE, 958 PUD_SIZE)) { 959 free_pagetable(pud_page(*pud), 960 get_order(PUD_SIZE)); 961 962 spin_lock(&init_mm.page_table_lock); 963 pud_clear(pud); 964 spin_unlock(&init_mm.page_table_lock); 965 } 966 } 967 968 continue; 969 } 970 971 pmd_base = (pmd_t *)pud_page_vaddr(*pud); 972 remove_pmd_table(pmd_base, addr, next, direct); 973 free_pmd_table(pmd_base, pud); 974 } 975 976 if (direct) 977 update_page_count(PG_LEVEL_1G, -pages); 978 } 979 980 /* start and end are both virtual address. */ 981 static void __meminit 982 remove_pagetable(unsigned long start, unsigned long end, bool direct) 983 { 984 unsigned long next; 985 unsigned long addr; 986 pgd_t *pgd; 987 pud_t *pud; 988 bool pgd_changed = false; 989 990 for (addr = start; addr < end; addr = next) { 991 next = pgd_addr_end(addr, end); 992 993 pgd = pgd_offset_k(addr); 994 if (!pgd_present(*pgd)) 995 continue; 996 997 pud = (pud_t *)pgd_page_vaddr(*pgd); 998 remove_pud_table(pud, addr, next, direct); 999 if (free_pud_table(pud, pgd)) 1000 pgd_changed = true; 1001 } 1002 1003 if (pgd_changed) 1004 sync_global_pgds(start, end - 1, 1); 1005 1006 flush_tlb_all(); 1007 } 1008 1009 void __ref vmemmap_free(unsigned long start, unsigned long end) 1010 { 1011 remove_pagetable(start, end, false); 1012 } 1013 1014 #ifdef CONFIG_MEMORY_HOTREMOVE 1015 static void __meminit 1016 kernel_physical_mapping_remove(unsigned long start, unsigned long end) 1017 { 1018 start = (unsigned long)__va(start); 1019 end = (unsigned long)__va(end); 1020 1021 remove_pagetable(start, end, true); 1022 } 1023 1024 int __ref arch_remove_memory(u64 start, u64 size) 1025 { 1026 unsigned long start_pfn = start >> PAGE_SHIFT; 1027 unsigned long nr_pages = size >> PAGE_SHIFT; 1028 struct page *page = pfn_to_page(start_pfn); 1029 struct vmem_altmap *altmap; 1030 struct zone *zone; 1031 int ret; 1032 1033 /* With altmap the first mapped page is offset from @start */ 1034 altmap = to_vmem_altmap((unsigned long) page); 1035 if (altmap) 1036 page += vmem_altmap_offset(altmap); 1037 zone = page_zone(page); 1038 ret = __remove_pages(zone, start_pfn, nr_pages); 1039 WARN_ON_ONCE(ret); 1040 kernel_physical_mapping_remove(start, start + size); 1041 1042 return ret; 1043 } 1044 #endif 1045 #endif /* CONFIG_MEMORY_HOTPLUG */ 1046 1047 static struct kcore_list kcore_vsyscall; 1048 1049 static void __init register_page_bootmem_info(void) 1050 { 1051 #ifdef CONFIG_NUMA 1052 int i; 1053 1054 for_each_online_node(i) 1055 register_page_bootmem_info_node(NODE_DATA(i)); 1056 #endif 1057 } 1058 1059 void __init mem_init(void) 1060 { 1061 pci_iommu_alloc(); 1062 1063 /* clear_bss() already clear the empty_zero_page */ 1064 1065 register_page_bootmem_info(); 1066 1067 /* this will put all memory onto the freelists */ 1068 free_all_bootmem(); 1069 after_bootmem = 1; 1070 1071 /* Register memory areas for /proc/kcore */ 1072 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, 1073 PAGE_SIZE, KCORE_OTHER); 1074 1075 mem_init_print_info(NULL); 1076 } 1077 1078 const int rodata_test_data = 0xC3; 1079 EXPORT_SYMBOL_GPL(rodata_test_data); 1080 1081 int kernel_set_to_readonly; 1082 1083 void set_kernel_text_rw(void) 1084 { 1085 unsigned long start = PFN_ALIGN(_text); 1086 unsigned long end = PFN_ALIGN(__stop___ex_table); 1087 1088 if (!kernel_set_to_readonly) 1089 return; 1090 1091 pr_debug("Set kernel text: %lx - %lx for read write\n", 1092 start, end); 1093 1094 /* 1095 * Make the kernel identity mapping for text RW. Kernel text 1096 * mapping will always be RO. Refer to the comment in 1097 * static_protections() in pageattr.c 1098 */ 1099 set_memory_rw(start, (end - start) >> PAGE_SHIFT); 1100 } 1101 1102 void set_kernel_text_ro(void) 1103 { 1104 unsigned long start = PFN_ALIGN(_text); 1105 unsigned long end = PFN_ALIGN(__stop___ex_table); 1106 1107 if (!kernel_set_to_readonly) 1108 return; 1109 1110 pr_debug("Set kernel text: %lx - %lx for read only\n", 1111 start, end); 1112 1113 /* 1114 * Set the kernel identity mapping for text RO. 1115 */ 1116 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 1117 } 1118 1119 void mark_rodata_ro(void) 1120 { 1121 unsigned long start = PFN_ALIGN(_text); 1122 unsigned long rodata_start = PFN_ALIGN(__start_rodata); 1123 unsigned long end = (unsigned long) &__end_rodata_hpage_align; 1124 unsigned long text_end = PFN_ALIGN(&__stop___ex_table); 1125 unsigned long rodata_end = PFN_ALIGN(&__end_rodata); 1126 unsigned long all_end; 1127 1128 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 1129 (end - start) >> 10); 1130 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 1131 1132 kernel_set_to_readonly = 1; 1133 1134 /* 1135 * The rodata/data/bss/brk section (but not the kernel text!) 1136 * should also be not-executable. 1137 * 1138 * We align all_end to PMD_SIZE because the existing mapping 1139 * is a full PMD. If we would align _brk_end to PAGE_SIZE we 1140 * split the PMD and the reminder between _brk_end and the end 1141 * of the PMD will remain mapped executable. 1142 * 1143 * Any PMD which was setup after the one which covers _brk_end 1144 * has been zapped already via cleanup_highmem(). 1145 */ 1146 all_end = roundup((unsigned long)_brk_end, PMD_SIZE); 1147 set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); 1148 1149 rodata_test(); 1150 1151 #ifdef CONFIG_CPA_DEBUG 1152 printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); 1153 set_memory_rw(start, (end-start) >> PAGE_SHIFT); 1154 1155 printk(KERN_INFO "Testing CPA: again\n"); 1156 set_memory_ro(start, (end-start) >> PAGE_SHIFT); 1157 #endif 1158 1159 free_init_pages("unused kernel", 1160 (unsigned long) __va(__pa_symbol(text_end)), 1161 (unsigned long) __va(__pa_symbol(rodata_start))); 1162 free_init_pages("unused kernel", 1163 (unsigned long) __va(__pa_symbol(rodata_end)), 1164 (unsigned long) __va(__pa_symbol(_sdata))); 1165 1166 debug_checkwx(); 1167 } 1168 1169 int kern_addr_valid(unsigned long addr) 1170 { 1171 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 1172 pgd_t *pgd; 1173 pud_t *pud; 1174 pmd_t *pmd; 1175 pte_t *pte; 1176 1177 if (above != 0 && above != -1UL) 1178 return 0; 1179 1180 pgd = pgd_offset_k(addr); 1181 if (pgd_none(*pgd)) 1182 return 0; 1183 1184 pud = pud_offset(pgd, addr); 1185 if (pud_none(*pud)) 1186 return 0; 1187 1188 if (pud_large(*pud)) 1189 return pfn_valid(pud_pfn(*pud)); 1190 1191 pmd = pmd_offset(pud, addr); 1192 if (pmd_none(*pmd)) 1193 return 0; 1194 1195 if (pmd_large(*pmd)) 1196 return pfn_valid(pmd_pfn(*pmd)); 1197 1198 pte = pte_offset_kernel(pmd, addr); 1199 if (pte_none(*pte)) 1200 return 0; 1201 1202 return pfn_valid(pte_pfn(*pte)); 1203 } 1204 1205 static unsigned long probe_memory_block_size(void) 1206 { 1207 unsigned long bz = MIN_MEMORY_BLOCK_SIZE; 1208 1209 /* if system is UV or has 64GB of RAM or more, use large blocks */ 1210 if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) 1211 bz = 2UL << 30; /* 2GB */ 1212 1213 pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); 1214 1215 return bz; 1216 } 1217 1218 static unsigned long memory_block_size_probed; 1219 unsigned long memory_block_size_bytes(void) 1220 { 1221 if (!memory_block_size_probed) 1222 memory_block_size_probed = probe_memory_block_size(); 1223 1224 return memory_block_size_probed; 1225 } 1226 1227 #ifdef CONFIG_SPARSEMEM_VMEMMAP 1228 /* 1229 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 1230 */ 1231 static long __meminitdata addr_start, addr_end; 1232 static void __meminitdata *p_start, *p_end; 1233 static int __meminitdata node_start; 1234 1235 static int __meminit vmemmap_populate_hugepages(unsigned long start, 1236 unsigned long end, int node, struct vmem_altmap *altmap) 1237 { 1238 unsigned long addr; 1239 unsigned long next; 1240 pgd_t *pgd; 1241 pud_t *pud; 1242 pmd_t *pmd; 1243 1244 for (addr = start; addr < end; addr = next) { 1245 next = pmd_addr_end(addr, end); 1246 1247 pgd = vmemmap_pgd_populate(addr, node); 1248 if (!pgd) 1249 return -ENOMEM; 1250 1251 pud = vmemmap_pud_populate(pgd, addr, node); 1252 if (!pud) 1253 return -ENOMEM; 1254 1255 pmd = pmd_offset(pud, addr); 1256 if (pmd_none(*pmd)) { 1257 void *p; 1258 1259 p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); 1260 if (p) { 1261 pte_t entry; 1262 1263 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1264 PAGE_KERNEL_LARGE); 1265 set_pmd(pmd, __pmd(pte_val(entry))); 1266 1267 /* check to see if we have contiguous blocks */ 1268 if (p_end != p || node_start != node) { 1269 if (p_start) 1270 pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", 1271 addr_start, addr_end-1, p_start, p_end-1, node_start); 1272 addr_start = addr; 1273 node_start = node; 1274 p_start = p; 1275 } 1276 1277 addr_end = addr + PMD_SIZE; 1278 p_end = p + PMD_SIZE; 1279 continue; 1280 } else if (altmap) 1281 return -ENOMEM; /* no fallback */ 1282 } else if (pmd_large(*pmd)) { 1283 vmemmap_verify((pte_t *)pmd, node, addr, next); 1284 continue; 1285 } 1286 pr_warn_once("vmemmap: falling back to regular page backing\n"); 1287 if (vmemmap_populate_basepages(addr, next, node)) 1288 return -ENOMEM; 1289 } 1290 return 0; 1291 } 1292 1293 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) 1294 { 1295 struct vmem_altmap *altmap = to_vmem_altmap(start); 1296 int err; 1297 1298 if (cpu_has_pse) 1299 err = vmemmap_populate_hugepages(start, end, node, altmap); 1300 else if (altmap) { 1301 pr_err_once("%s: no cpu support for altmap allocations\n", 1302 __func__); 1303 err = -ENOMEM; 1304 } else 1305 err = vmemmap_populate_basepages(start, end, node); 1306 if (!err) 1307 sync_global_pgds(start, end - 1, 0); 1308 return err; 1309 } 1310 1311 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) 1312 void register_page_bootmem_memmap(unsigned long section_nr, 1313 struct page *start_page, unsigned long size) 1314 { 1315 unsigned long addr = (unsigned long)start_page; 1316 unsigned long end = (unsigned long)(start_page + size); 1317 unsigned long next; 1318 pgd_t *pgd; 1319 pud_t *pud; 1320 pmd_t *pmd; 1321 unsigned int nr_pages; 1322 struct page *page; 1323 1324 for (; addr < end; addr = next) { 1325 pte_t *pte = NULL; 1326 1327 pgd = pgd_offset_k(addr); 1328 if (pgd_none(*pgd)) { 1329 next = (addr + PAGE_SIZE) & PAGE_MASK; 1330 continue; 1331 } 1332 get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); 1333 1334 pud = pud_offset(pgd, addr); 1335 if (pud_none(*pud)) { 1336 next = (addr + PAGE_SIZE) & PAGE_MASK; 1337 continue; 1338 } 1339 get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); 1340 1341 if (!cpu_has_pse) { 1342 next = (addr + PAGE_SIZE) & PAGE_MASK; 1343 pmd = pmd_offset(pud, addr); 1344 if (pmd_none(*pmd)) 1345 continue; 1346 get_page_bootmem(section_nr, pmd_page(*pmd), 1347 MIX_SECTION_INFO); 1348 1349 pte = pte_offset_kernel(pmd, addr); 1350 if (pte_none(*pte)) 1351 continue; 1352 get_page_bootmem(section_nr, pte_page(*pte), 1353 SECTION_INFO); 1354 } else { 1355 next = pmd_addr_end(addr, end); 1356 1357 pmd = pmd_offset(pud, addr); 1358 if (pmd_none(*pmd)) 1359 continue; 1360 1361 nr_pages = 1 << (get_order(PMD_SIZE)); 1362 page = pmd_page(*pmd); 1363 while (nr_pages--) 1364 get_page_bootmem(section_nr, page++, 1365 SECTION_INFO); 1366 } 1367 } 1368 } 1369 #endif 1370 1371 void __meminit vmemmap_populate_print_last(void) 1372 { 1373 if (p_start) { 1374 pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", 1375 addr_start, addr_end-1, p_start, p_end-1, node_start); 1376 p_start = NULL; 1377 p_end = NULL; 1378 node_start = 0; 1379 } 1380 } 1381 #endif 1382