1 /* 2 * arch/sparc64/mm/init.c 3 * 4 * Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu) 5 * Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 6 */ 7 8 #include <linux/module.h> 9 #include <linux/kernel.h> 10 #include <linux/sched.h> 11 #include <linux/string.h> 12 #include <linux/init.h> 13 #include <linux/bootmem.h> 14 #include <linux/mm.h> 15 #include <linux/hugetlb.h> 16 #include <linux/initrd.h> 17 #include <linux/swap.h> 18 #include <linux/pagemap.h> 19 #include <linux/poison.h> 20 #include <linux/fs.h> 21 #include <linux/seq_file.h> 22 #include <linux/kprobes.h> 23 #include <linux/cache.h> 24 #include <linux/sort.h> 25 #include <linux/ioport.h> 26 #include <linux/percpu.h> 27 #include <linux/memblock.h> 28 #include <linux/mmzone.h> 29 #include <linux/gfp.h> 30 31 #include <asm/head.h> 32 #include <asm/page.h> 33 #include <asm/pgalloc.h> 34 #include <asm/pgtable.h> 35 #include <asm/oplib.h> 36 #include <asm/iommu.h> 37 #include <asm/io.h> 38 #include <asm/uaccess.h> 39 #include <asm/mmu_context.h> 40 #include <asm/tlbflush.h> 41 #include <asm/dma.h> 42 #include <asm/starfire.h> 43 #include <asm/tlb.h> 44 #include <asm/spitfire.h> 45 #include <asm/sections.h> 46 #include <asm/tsb.h> 47 #include <asm/hypervisor.h> 48 #include <asm/prom.h> 49 #include <asm/mdesc.h> 50 #include <asm/cpudata.h> 51 #include <asm/setup.h> 52 #include <asm/irq.h> 53 54 #include "init_64.h" 55 56 unsigned long kern_linear_pte_xor[4] __read_mostly; 57 static unsigned long page_cache4v_flag; 58 59 /* A bitmap, two bits for every 256MB of physical memory. These two 60 * bits determine what page size we use for kernel linear 61 * translations. They form an index into kern_linear_pte_xor[]. The 62 * value in the indexed slot is XOR'd with the TLB miss virtual 63 * address to form the resulting TTE. The mapping is: 64 * 65 * 0 ==> 4MB 66 * 1 ==> 256MB 67 * 2 ==> 2GB 68 * 3 ==> 16GB 69 * 70 * All sun4v chips support 256MB pages. Only SPARC-T4 and later 71 * support 2GB pages, and hopefully future cpus will support the 16GB 72 * pages as well. For slots 2 and 3, we encode a 256MB TTE xor there 73 * if these larger page sizes are not supported by the cpu. 74 * 75 * It would be nice to determine this from the machine description 76 * 'cpu' properties, but we need to have this table setup before the 77 * MDESC is initialized. 78 */ 79 80 #ifndef CONFIG_DEBUG_PAGEALLOC 81 /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings. 
82 * Space is allocated for this right after the trap table in 83 * arch/sparc64/kernel/head.S 84 */ 85 extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES]; 86 #endif 87 extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES]; 88 89 static unsigned long cpu_pgsz_mask; 90 91 #define MAX_BANKS 1024 92 93 static struct linux_prom64_registers pavail[MAX_BANKS]; 94 static int pavail_ents; 95 96 u64 numa_latency[MAX_NUMNODES][MAX_NUMNODES]; 97 98 static int cmp_p64(const void *a, const void *b) 99 { 100 const struct linux_prom64_registers *x = a, *y = b; 101 102 if (x->phys_addr > y->phys_addr) 103 return 1; 104 if (x->phys_addr < y->phys_addr) 105 return -1; 106 return 0; 107 } 108 109 static void __init read_obp_memory(const char *property, 110 struct linux_prom64_registers *regs, 111 int *num_ents) 112 { 113 phandle node = prom_finddevice("/memory"); 114 int prop_size = prom_getproplen(node, property); 115 int ents, ret, i; 116 117 ents = prop_size / sizeof(struct linux_prom64_registers); 118 if (ents > MAX_BANKS) { 119 prom_printf("The machine has more %s property entries than " 120 "this kernel can support (%d).\n", 121 property, MAX_BANKS); 122 prom_halt(); 123 } 124 125 ret = prom_getproperty(node, property, (char *) regs, prop_size); 126 if (ret == -1) { 127 prom_printf("Couldn't get %s property from /memory.\n", 128 property); 129 prom_halt(); 130 } 131 132 /* Sanitize what we got from the firmware, by page aligning 133 * everything. 134 */ 135 for (i = 0; i < ents; i++) { 136 unsigned long base, size; 137 138 base = regs[i].phys_addr; 139 size = regs[i].reg_size; 140 141 size &= PAGE_MASK; 142 if (base & ~PAGE_MASK) { 143 unsigned long new_base = PAGE_ALIGN(base); 144 145 size -= new_base - base; 146 if ((long) size < 0L) 147 size = 0UL; 148 base = new_base; 149 } 150 if (size == 0UL) { 151 /* If it is empty, simply get rid of it. 152 * This simplifies the logic of the other 153 * functions that process these arrays. 154 */ 155 memmove(®s[i], ®s[i + 1], 156 (ents - i - 1) * sizeof(regs[0])); 157 i--; 158 ents--; 159 continue; 160 } 161 regs[i].phys_addr = base; 162 regs[i].reg_size = size; 163 } 164 165 *num_ents = ents; 166 167 sort(regs, ents, sizeof(struct linux_prom64_registers), 168 cmp_p64, NULL); 169 } 170 171 /* Kernel physical address base and size in bytes. 
*/ 172 unsigned long kern_base __read_mostly; 173 unsigned long kern_size __read_mostly; 174 175 /* Initial ramdisk setup */ 176 extern unsigned long sparc_ramdisk_image64; 177 extern unsigned int sparc_ramdisk_image; 178 extern unsigned int sparc_ramdisk_size; 179 180 struct page *mem_map_zero __read_mostly; 181 EXPORT_SYMBOL(mem_map_zero); 182 183 unsigned int sparc64_highest_unlocked_tlb_ent __read_mostly; 184 185 unsigned long sparc64_kern_pri_context __read_mostly; 186 unsigned long sparc64_kern_pri_nuc_bits __read_mostly; 187 unsigned long sparc64_kern_sec_context __read_mostly; 188 189 int num_kernel_image_mappings; 190 191 #ifdef CONFIG_DEBUG_DCFLUSH 192 atomic_t dcpage_flushes = ATOMIC_INIT(0); 193 #ifdef CONFIG_SMP 194 atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0); 195 #endif 196 #endif 197 198 inline void flush_dcache_page_impl(struct page *page) 199 { 200 BUG_ON(tlb_type == hypervisor); 201 #ifdef CONFIG_DEBUG_DCFLUSH 202 atomic_inc(&dcpage_flushes); 203 #endif 204 205 #ifdef DCACHE_ALIASING_POSSIBLE 206 __flush_dcache_page(page_address(page), 207 ((tlb_type == spitfire) && 208 page_mapping(page) != NULL)); 209 #else 210 if (page_mapping(page) != NULL && 211 tlb_type == spitfire) 212 __flush_icache_page(__pa(page_address(page))); 213 #endif 214 } 215 216 #define PG_dcache_dirty PG_arch_1 217 #define PG_dcache_cpu_shift 32UL 218 #define PG_dcache_cpu_mask \ 219 ((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL) 220 221 #define dcache_dirty_cpu(page) \ 222 (((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) 223 224 static inline void set_dcache_dirty(struct page *page, int this_cpu) 225 { 226 unsigned long mask = this_cpu; 227 unsigned long non_cpu_bits; 228 229 non_cpu_bits = ~(PG_dcache_cpu_mask << PG_dcache_cpu_shift); 230 mask = (mask << PG_dcache_cpu_shift) | (1UL << PG_dcache_dirty); 231 232 __asm__ __volatile__("1:\n\t" 233 "ldx [%2], %%g7\n\t" 234 "and %%g7, %1, %%g1\n\t" 235 "or %%g1, %0, %%g1\n\t" 236 "casx [%2], %%g7, %%g1\n\t" 237 "cmp %%g7, %%g1\n\t" 238 "bne,pn %%xcc, 1b\n\t" 239 " nop" 240 : /* no outputs */ 241 : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) 242 : "g1", "g7"); 243 } 244 245 static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) 246 { 247 unsigned long mask = (1UL << PG_dcache_dirty); 248 249 __asm__ __volatile__("! 
test_and_clear_dcache_dirty\n" 250 "1:\n\t" 251 "ldx [%2], %%g7\n\t" 252 "srlx %%g7, %4, %%g1\n\t" 253 "and %%g1, %3, %%g1\n\t" 254 "cmp %%g1, %0\n\t" 255 "bne,pn %%icc, 2f\n\t" 256 " andn %%g7, %1, %%g1\n\t" 257 "casx [%2], %%g7, %%g1\n\t" 258 "cmp %%g7, %%g1\n\t" 259 "bne,pn %%xcc, 1b\n\t" 260 " nop\n" 261 "2:" 262 : /* no outputs */ 263 : "r" (cpu), "r" (mask), "r" (&page->flags), 264 "i" (PG_dcache_cpu_mask), 265 "i" (PG_dcache_cpu_shift) 266 : "g1", "g7"); 267 } 268 269 static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long pte) 270 { 271 unsigned long tsb_addr = (unsigned long) ent; 272 273 if (tlb_type == cheetah_plus || tlb_type == hypervisor) 274 tsb_addr = __pa(tsb_addr); 275 276 __tsb_insert(tsb_addr, tag, pte); 277 } 278 279 unsigned long _PAGE_ALL_SZ_BITS __read_mostly; 280 281 static void flush_dcache(unsigned long pfn) 282 { 283 struct page *page; 284 285 page = pfn_to_page(pfn); 286 if (page) { 287 unsigned long pg_flags; 288 289 pg_flags = page->flags; 290 if (pg_flags & (1UL << PG_dcache_dirty)) { 291 int cpu = ((pg_flags >> PG_dcache_cpu_shift) & 292 PG_dcache_cpu_mask); 293 int this_cpu = get_cpu(); 294 295 /* This is just to optimize away some function calls 296 * in the SMP case. 297 */ 298 if (cpu == this_cpu) 299 flush_dcache_page_impl(page); 300 else 301 smp_flush_dcache_page_impl(page, cpu); 302 303 clear_dcache_dirty_cpu(page, cpu); 304 305 put_cpu(); 306 } 307 } 308 } 309 310 /* mm->context.lock must be held */ 311 static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index, 312 unsigned long tsb_hash_shift, unsigned long address, 313 unsigned long tte) 314 { 315 struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb; 316 unsigned long tag; 317 318 if (unlikely(!tsb)) 319 return; 320 321 tsb += ((address >> tsb_hash_shift) & 322 (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); 323 tag = (address >> 22UL); 324 tsb_insert(tsb, tag, tte); 325 } 326 327 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) 328 { 329 struct mm_struct *mm; 330 unsigned long flags; 331 pte_t pte = *ptep; 332 333 if (tlb_type != hypervisor) { 334 unsigned long pfn = pte_pfn(pte); 335 336 if (pfn_valid(pfn)) 337 flush_dcache(pfn); 338 } 339 340 mm = vma->vm_mm; 341 342 /* Don't insert a non-valid PTE into the TSB, we'll deadlock. */ 343 if (!pte_accessible(mm, pte)) 344 return; 345 346 spin_lock_irqsave(&mm->context.lock, flags); 347 348 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 349 if (mm->context.huge_pte_count && is_hugetlb_pte(pte)) 350 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT, 351 address, pte_val(pte)); 352 else 353 #endif 354 __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, 355 address, pte_val(pte)); 356 357 spin_unlock_irqrestore(&mm->context.lock, flags); 358 } 359 360 void flush_dcache_page(struct page *page) 361 { 362 struct address_space *mapping; 363 int this_cpu; 364 365 if (tlb_type == hypervisor) 366 return; 367 368 /* Do not bother with the expensive D-cache flush if it 369 * is merely the zero page. The 'bigcore' testcase in GDB 370 * causes this case to run millions of times. 
371 */ 372 if (page == ZERO_PAGE(0)) 373 return; 374 375 this_cpu = get_cpu(); 376 377 mapping = page_mapping(page); 378 if (mapping && !mapping_mapped(mapping)) { 379 int dirty = test_bit(PG_dcache_dirty, &page->flags); 380 if (dirty) { 381 int dirty_cpu = dcache_dirty_cpu(page); 382 383 if (dirty_cpu == this_cpu) 384 goto out; 385 smp_flush_dcache_page_impl(page, dirty_cpu); 386 } 387 set_dcache_dirty(page, this_cpu); 388 } else { 389 /* We could delay the flush for the !page_mapping 390 * case too. But that case is for exec env/arg 391 * pages and those are %99 certainly going to get 392 * faulted into the tlb (and thus flushed) anyways. 393 */ 394 flush_dcache_page_impl(page); 395 } 396 397 out: 398 put_cpu(); 399 } 400 EXPORT_SYMBOL(flush_dcache_page); 401 402 void __kprobes flush_icache_range(unsigned long start, unsigned long end) 403 { 404 /* Cheetah and Hypervisor platform cpus have coherent I-cache. */ 405 if (tlb_type == spitfire) { 406 unsigned long kaddr; 407 408 /* This code only runs on Spitfire cpus so this is 409 * why we can assume _PAGE_PADDR_4U. 410 */ 411 for (kaddr = start; kaddr < end; kaddr += PAGE_SIZE) { 412 unsigned long paddr, mask = _PAGE_PADDR_4U; 413 414 if (kaddr >= PAGE_OFFSET) 415 paddr = kaddr & mask; 416 else { 417 pgd_t *pgdp = pgd_offset_k(kaddr); 418 pud_t *pudp = pud_offset(pgdp, kaddr); 419 pmd_t *pmdp = pmd_offset(pudp, kaddr); 420 pte_t *ptep = pte_offset_kernel(pmdp, kaddr); 421 422 paddr = pte_val(*ptep) & mask; 423 } 424 __flush_icache_page(paddr); 425 } 426 } 427 } 428 EXPORT_SYMBOL(flush_icache_range); 429 430 void mmu_info(struct seq_file *m) 431 { 432 static const char *pgsz_strings[] = { 433 "8K", "64K", "512K", "4MB", "32MB", 434 "256MB", "2GB", "16GB", 435 }; 436 int i, printed; 437 438 if (tlb_type == cheetah) 439 seq_printf(m, "MMU Type\t: Cheetah\n"); 440 else if (tlb_type == cheetah_plus) 441 seq_printf(m, "MMU Type\t: Cheetah+\n"); 442 else if (tlb_type == spitfire) 443 seq_printf(m, "MMU Type\t: Spitfire\n"); 444 else if (tlb_type == hypervisor) 445 seq_printf(m, "MMU Type\t: Hypervisor (sun4v)\n"); 446 else 447 seq_printf(m, "MMU Type\t: ???\n"); 448 449 seq_printf(m, "MMU PGSZs\t: "); 450 printed = 0; 451 for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) { 452 if (cpu_pgsz_mask & (1UL << i)) { 453 seq_printf(m, "%s%s", 454 printed ? "," : "", pgsz_strings[i]); 455 printed++; 456 } 457 } 458 seq_putc(m, '\n'); 459 460 #ifdef CONFIG_DEBUG_DCFLUSH 461 seq_printf(m, "DCPageFlushes\t: %d\n", 462 atomic_read(&dcpage_flushes)); 463 #ifdef CONFIG_SMP 464 seq_printf(m, "DCPageFlushesXC\t: %d\n", 465 atomic_read(&dcpage_flushes_xcall)); 466 #endif /* CONFIG_SMP */ 467 #endif /* CONFIG_DEBUG_DCFLUSH */ 468 } 469 470 struct linux_prom_translation prom_trans[512] __read_mostly; 471 unsigned int prom_trans_ents __read_mostly; 472 473 unsigned long kern_locked_tte_data; 474 475 /* The obp translations are saved based on 8k pagesize, since obp can 476 * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS -> 477 * HI_OBP_ADDRESS range are handled in ktlb.S. 478 */ 479 static inline int in_obp_range(unsigned long vaddr) 480 { 481 return (vaddr >= LOW_OBP_ADDRESS && 482 vaddr < HI_OBP_ADDRESS); 483 } 484 485 static int cmp_ptrans(const void *a, const void *b) 486 { 487 const struct linux_prom_translation *x = a, *y = b; 488 489 if (x->virt > y->virt) 490 return 1; 491 if (x->virt < y->virt) 492 return -1; 493 return 0; 494 } 495 496 /* Read OBP translations property into 'prom_trans[]'. 
*/ 497 static void __init read_obp_translations(void) 498 { 499 int n, node, ents, first, last, i; 500 501 node = prom_finddevice("/virtual-memory"); 502 n = prom_getproplen(node, "translations"); 503 if (unlikely(n == 0 || n == -1)) { 504 prom_printf("prom_mappings: Couldn't get size.\n"); 505 prom_halt(); 506 } 507 if (unlikely(n > sizeof(prom_trans))) { 508 prom_printf("prom_mappings: Size %d is too big.\n", n); 509 prom_halt(); 510 } 511 512 if ((n = prom_getproperty(node, "translations", 513 (char *)&prom_trans[0], 514 sizeof(prom_trans))) == -1) { 515 prom_printf("prom_mappings: Couldn't get property.\n"); 516 prom_halt(); 517 } 518 519 n = n / sizeof(struct linux_prom_translation); 520 521 ents = n; 522 523 sort(prom_trans, ents, sizeof(struct linux_prom_translation), 524 cmp_ptrans, NULL); 525 526 /* Now kick out all the non-OBP entries. */ 527 for (i = 0; i < ents; i++) { 528 if (in_obp_range(prom_trans[i].virt)) 529 break; 530 } 531 first = i; 532 for (; i < ents; i++) { 533 if (!in_obp_range(prom_trans[i].virt)) 534 break; 535 } 536 last = i; 537 538 for (i = 0; i < (last - first); i++) { 539 struct linux_prom_translation *src = &prom_trans[i + first]; 540 struct linux_prom_translation *dest = &prom_trans[i]; 541 542 *dest = *src; 543 } 544 for (; i < ents; i++) { 545 struct linux_prom_translation *dest = &prom_trans[i]; 546 dest->virt = dest->size = dest->data = 0x0UL; 547 } 548 549 prom_trans_ents = last - first; 550 551 if (tlb_type == spitfire) { 552 /* Clear diag TTE bits. */ 553 for (i = 0; i < prom_trans_ents; i++) 554 prom_trans[i].data &= ~0x0003fe0000000000UL; 555 } 556 557 /* Force execute bit on. */ 558 for (i = 0; i < prom_trans_ents; i++) 559 prom_trans[i].data |= (tlb_type == hypervisor ? 560 _PAGE_EXEC_4V : _PAGE_EXEC_4U); 561 } 562 563 static void __init hypervisor_tlb_lock(unsigned long vaddr, 564 unsigned long pte, 565 unsigned long mmu) 566 { 567 unsigned long ret = sun4v_mmu_map_perm_addr(vaddr, 0, pte, mmu); 568 569 if (ret != 0) { 570 prom_printf("hypervisor_tlb_lock[%lx:%x:%lx:%lx]: " 571 "errors with %lx\n", vaddr, 0, pte, mmu, ret); 572 prom_halt(); 573 } 574 } 575 576 static unsigned long kern_large_tte(unsigned long paddr); 577 578 static void __init remap_kernel(void) 579 { 580 unsigned long phys_page, tte_vaddr, tte_data; 581 int i, tlb_ent = sparc64_highest_locked_tlbent(); 582 583 tte_vaddr = (unsigned long) KERNBASE; 584 phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB; 585 tte_data = kern_large_tte(phys_page); 586 587 kern_locked_tte_data = tte_data; 588 589 /* Now lock us into the TLBs via Hypervisor or OBP. */ 590 if (tlb_type == hypervisor) { 591 for (i = 0; i < num_kernel_image_mappings; i++) { 592 hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU); 593 hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU); 594 tte_vaddr += 0x400000; 595 tte_data += 0x400000; 596 } 597 } else { 598 for (i = 0; i < num_kernel_image_mappings; i++) { 599 prom_dtlb_load(tlb_ent - i, tte_data, tte_vaddr); 600 prom_itlb_load(tlb_ent - i, tte_data, tte_vaddr); 601 tte_vaddr += 0x400000; 602 tte_data += 0x400000; 603 } 604 sparc64_highest_unlocked_tlb_ent = tlb_ent - i; 605 } 606 if (tlb_type == cheetah_plus) { 607 sparc64_kern_pri_context = (CTX_CHEETAH_PLUS_CTX0 | 608 CTX_CHEETAH_PLUS_NUC); 609 sparc64_kern_pri_nuc_bits = CTX_CHEETAH_PLUS_NUC; 610 sparc64_kern_sec_context = CTX_CHEETAH_PLUS_CTX0; 611 } 612 } 613 614 615 static void __init inherit_prom_mappings(void) 616 { 617 /* Now fixup OBP's idea about where we really are mapped. 
*/ 618 printk("Remapping the kernel... "); 619 remap_kernel(); 620 printk("done.\n"); 621 } 622 623 void prom_world(int enter) 624 { 625 if (!enter) 626 set_fs(get_fs()); 627 628 __asm__ __volatile__("flushw"); 629 } 630 631 void __flush_dcache_range(unsigned long start, unsigned long end) 632 { 633 unsigned long va; 634 635 if (tlb_type == spitfire) { 636 int n = 0; 637 638 for (va = start; va < end; va += 32) { 639 spitfire_put_dcache_tag(va & 0x3fe0, 0x0); 640 if (++n >= 512) 641 break; 642 } 643 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 644 start = __pa(start); 645 end = __pa(end); 646 for (va = start; va < end; va += 32) 647 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 648 "membar #Sync" 649 : /* no outputs */ 650 : "r" (va), 651 "i" (ASI_DCACHE_INVALIDATE)); 652 } 653 } 654 EXPORT_SYMBOL(__flush_dcache_range); 655 656 /* get_new_mmu_context() uses "cache + 1". */ 657 DEFINE_SPINLOCK(ctx_alloc_lock); 658 unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1; 659 #define MAX_CTX_NR (1UL << CTX_NR_BITS) 660 #define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR) 661 DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); 662 663 /* Caller does TLB context flushing on local CPU if necessary. 664 * The caller also ensures that CTX_VALID(mm->context) is false. 665 * 666 * We must be careful about boundary cases so that we never 667 * let the user have CTX 0 (nucleus) or we ever use a CTX 668 * version of zero (and thus NO_CONTEXT would not be caught 669 * by version mis-match tests in mmu_context.h). 670 * 671 * Always invoked with interrupts disabled. 672 */ 673 void get_new_mmu_context(struct mm_struct *mm) 674 { 675 unsigned long ctx, new_ctx; 676 unsigned long orig_pgsz_bits; 677 int new_version; 678 679 spin_lock(&ctx_alloc_lock); 680 orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); 681 ctx = (tlb_context_cache + 1) & CTX_NR_MASK; 682 new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); 683 new_version = 0; 684 if (new_ctx >= (1 << CTX_NR_BITS)) { 685 new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); 686 if (new_ctx >= ctx) { 687 int i; 688 new_ctx = (tlb_context_cache & CTX_VERSION_MASK) + 689 CTX_FIRST_VERSION; 690 if (new_ctx == 1) 691 new_ctx = CTX_FIRST_VERSION; 692 693 /* Don't call memset, for 16 entries that's just 694 * plain silly... 695 */ 696 mmu_context_bmap[0] = 3; 697 mmu_context_bmap[1] = 0; 698 mmu_context_bmap[2] = 0; 699 mmu_context_bmap[3] = 0; 700 for (i = 4; i < CTX_BMAP_SLOTS; i += 4) { 701 mmu_context_bmap[i + 0] = 0; 702 mmu_context_bmap[i + 1] = 0; 703 mmu_context_bmap[i + 2] = 0; 704 mmu_context_bmap[i + 3] = 0; 705 } 706 new_version = 1; 707 goto out; 708 } 709 } 710 mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63)); 711 new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); 712 out: 713 tlb_context_cache = new_ctx; 714 mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; 715 spin_unlock(&ctx_alloc_lock); 716 717 if (unlikely(new_version)) 718 smp_new_mmu_context_version(); 719 } 720 721 static int numa_enabled = 1; 722 static int numa_debug; 723 724 static int __init early_numa(char *p) 725 { 726 if (!p) 727 return 0; 728 729 if (strstr(p, "off")) 730 numa_enabled = 0; 731 732 if (strstr(p, "debug")) 733 numa_debug = 1; 734 735 return 0; 736 } 737 early_param("numa", early_numa); 738 739 #define numadbg(f, a...) 
\ 740 do { if (numa_debug) \ 741 printk(KERN_INFO f, ## a); \ 742 } while (0) 743 744 static void __init find_ramdisk(unsigned long phys_base) 745 { 746 #ifdef CONFIG_BLK_DEV_INITRD 747 if (sparc_ramdisk_image || sparc_ramdisk_image64) { 748 unsigned long ramdisk_image; 749 750 /* Older versions of the bootloader only supported a 751 * 32-bit physical address for the ramdisk image 752 * location, stored at sparc_ramdisk_image. Newer 753 * SILO versions set sparc_ramdisk_image to zero and 754 * provide a full 64-bit physical address at 755 * sparc_ramdisk_image64. 756 */ 757 ramdisk_image = sparc_ramdisk_image; 758 if (!ramdisk_image) 759 ramdisk_image = sparc_ramdisk_image64; 760 761 /* Another bootloader quirk. The bootloader normalizes 762 * the physical address to KERNBASE, so we have to 763 * factor that back out and add in the lowest valid 764 * physical page address to get the true physical address. 765 */ 766 ramdisk_image -= KERNBASE; 767 ramdisk_image += phys_base; 768 769 numadbg("Found ramdisk at physical address 0x%lx, size %u\n", 770 ramdisk_image, sparc_ramdisk_size); 771 772 initrd_start = ramdisk_image; 773 initrd_end = ramdisk_image + sparc_ramdisk_size; 774 775 memblock_reserve(initrd_start, sparc_ramdisk_size); 776 777 initrd_start += PAGE_OFFSET; 778 initrd_end += PAGE_OFFSET; 779 } 780 #endif 781 } 782 783 struct node_mem_mask { 784 unsigned long mask; 785 unsigned long val; 786 }; 787 static struct node_mem_mask node_masks[MAX_NUMNODES]; 788 static int num_node_masks; 789 790 #ifdef CONFIG_NEED_MULTIPLE_NODES 791 792 int numa_cpu_lookup_table[NR_CPUS]; 793 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; 794 795 struct mdesc_mblock { 796 u64 base; 797 u64 size; 798 u64 offset; /* RA-to-PA */ 799 }; 800 static struct mdesc_mblock *mblocks; 801 static int num_mblocks; 802 803 static unsigned long ra_to_pa(unsigned long addr) 804 { 805 int i; 806 807 for (i = 0; i < num_mblocks; i++) { 808 struct mdesc_mblock *m = &mblocks[i]; 809 810 if (addr >= m->base && 811 addr < (m->base + m->size)) { 812 addr += m->offset; 813 break; 814 } 815 } 816 return addr; 817 } 818 819 static int find_node(unsigned long addr) 820 { 821 int i; 822 823 addr = ra_to_pa(addr); 824 for (i = 0; i < num_node_masks; i++) { 825 struct node_mem_mask *p = &node_masks[i]; 826 827 if ((addr & p->mask) == p->val) 828 return i; 829 } 830 /* The following condition has been observed on LDOM guests.*/ 831 WARN_ONCE(1, "find_node: A physical address doesn't match a NUMA node" 832 " rule. Some physical memory will be owned by node 0."); 833 return 0; 834 } 835 836 static u64 memblock_nid_range(u64 start, u64 end, int *nid) 837 { 838 *nid = find_node(start); 839 start += PAGE_SIZE; 840 while (start < end) { 841 int n = find_node(start); 842 843 if (n != *nid) 844 break; 845 start += PAGE_SIZE; 846 } 847 848 if (start > end) 849 start = end; 850 851 return start; 852 } 853 #endif 854 855 /* This must be invoked after performing all of the necessary 856 * memblock_set_node() calls for 'nid'. We need to be able to get 857 * correct data from get_pfn_range_for_nid(). 
858 */ 859 static void __init allocate_node_data(int nid) 860 { 861 struct pglist_data *p; 862 unsigned long start_pfn, end_pfn; 863 #ifdef CONFIG_NEED_MULTIPLE_NODES 864 unsigned long paddr; 865 866 paddr = memblock_alloc_try_nid(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid); 867 if (!paddr) { 868 prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid); 869 prom_halt(); 870 } 871 NODE_DATA(nid) = __va(paddr); 872 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 873 874 NODE_DATA(nid)->node_id = nid; 875 #endif 876 877 p = NODE_DATA(nid); 878 879 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 880 p->node_start_pfn = start_pfn; 881 p->node_spanned_pages = end_pfn - start_pfn; 882 } 883 884 static void init_node_masks_nonnuma(void) 885 { 886 #ifdef CONFIG_NEED_MULTIPLE_NODES 887 int i; 888 #endif 889 890 numadbg("Initializing tables for non-numa.\n"); 891 892 node_masks[0].mask = node_masks[0].val = 0; 893 num_node_masks = 1; 894 895 #ifdef CONFIG_NEED_MULTIPLE_NODES 896 for (i = 0; i < NR_CPUS; i++) 897 numa_cpu_lookup_table[i] = 0; 898 899 cpumask_setall(&numa_cpumask_lookup_table[0]); 900 #endif 901 } 902 903 #ifdef CONFIG_NEED_MULTIPLE_NODES 904 struct pglist_data *node_data[MAX_NUMNODES]; 905 906 EXPORT_SYMBOL(numa_cpu_lookup_table); 907 EXPORT_SYMBOL(numa_cpumask_lookup_table); 908 EXPORT_SYMBOL(node_data); 909 910 struct mdesc_mlgroup { 911 u64 node; 912 u64 latency; 913 u64 match; 914 u64 mask; 915 }; 916 static struct mdesc_mlgroup *mlgroups; 917 static int num_mlgroups; 918 919 static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio, 920 u32 cfg_handle) 921 { 922 u64 arc; 923 924 mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) { 925 u64 target = mdesc_arc_target(md, arc); 926 const u64 *val; 927 928 val = mdesc_get_property(md, target, 929 "cfg-handle", NULL); 930 if (val && *val == cfg_handle) 931 return 0; 932 } 933 return -ENODEV; 934 } 935 936 static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp, 937 u32 cfg_handle) 938 { 939 u64 arc, candidate, best_latency = ~(u64)0; 940 941 candidate = MDESC_NODE_NULL; 942 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 943 u64 target = mdesc_arc_target(md, arc); 944 const char *name = mdesc_node_name(md, target); 945 const u64 *val; 946 947 if (strcmp(name, "pio-latency-group")) 948 continue; 949 950 val = mdesc_get_property(md, target, "latency", NULL); 951 if (!val) 952 continue; 953 954 if (*val < best_latency) { 955 candidate = target; 956 best_latency = *val; 957 } 958 } 959 960 if (candidate == MDESC_NODE_NULL) 961 return -ENODEV; 962 963 return scan_pio_for_cfg_handle(md, candidate, cfg_handle); 964 } 965 966 int of_node_to_nid(struct device_node *dp) 967 { 968 const struct linux_prom64_registers *regs; 969 struct mdesc_handle *md; 970 u32 cfg_handle; 971 int count, nid; 972 u64 grp; 973 974 /* This is the right thing to do on currently supported 975 * SUN4U NUMA platforms as well, as the PCI controller does 976 * not sit behind any particular memory controller. 
977 */ 978 if (!mlgroups) 979 return -1; 980 981 regs = of_get_property(dp, "reg", NULL); 982 if (!regs) 983 return -1; 984 985 cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff; 986 987 md = mdesc_grab(); 988 989 count = 0; 990 nid = -1; 991 mdesc_for_each_node_by_name(md, grp, "group") { 992 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { 993 nid = count; 994 break; 995 } 996 count++; 997 } 998 999 mdesc_release(md); 1000 1001 return nid; 1002 } 1003 1004 static void __init add_node_ranges(void) 1005 { 1006 struct memblock_region *reg; 1007 1008 for_each_memblock(memory, reg) { 1009 unsigned long size = reg->size; 1010 unsigned long start, end; 1011 1012 start = reg->base; 1013 end = start + size; 1014 while (start < end) { 1015 unsigned long this_end; 1016 int nid; 1017 1018 this_end = memblock_nid_range(start, end, &nid); 1019 1020 numadbg("Setting memblock NUMA node nid[%d] " 1021 "start[%lx] end[%lx]\n", 1022 nid, start, this_end); 1023 1024 memblock_set_node(start, this_end - start, 1025 &memblock.memory, nid); 1026 start = this_end; 1027 } 1028 } 1029 } 1030 1031 static int __init grab_mlgroups(struct mdesc_handle *md) 1032 { 1033 unsigned long paddr; 1034 int count = 0; 1035 u64 node; 1036 1037 mdesc_for_each_node_by_name(md, node, "memory-latency-group") 1038 count++; 1039 if (!count) 1040 return -ENOENT; 1041 1042 paddr = memblock_alloc(count * sizeof(struct mdesc_mlgroup), 1043 SMP_CACHE_BYTES); 1044 if (!paddr) 1045 return -ENOMEM; 1046 1047 mlgroups = __va(paddr); 1048 num_mlgroups = count; 1049 1050 count = 0; 1051 mdesc_for_each_node_by_name(md, node, "memory-latency-group") { 1052 struct mdesc_mlgroup *m = &mlgroups[count++]; 1053 const u64 *val; 1054 1055 m->node = node; 1056 1057 val = mdesc_get_property(md, node, "latency", NULL); 1058 m->latency = *val; 1059 val = mdesc_get_property(md, node, "address-match", NULL); 1060 m->match = *val; 1061 val = mdesc_get_property(md, node, "address-mask", NULL); 1062 m->mask = *val; 1063 1064 numadbg("MLGROUP[%d]: node[%llx] latency[%llx] " 1065 "match[%llx] mask[%llx]\n", 1066 count - 1, m->node, m->latency, m->match, m->mask); 1067 } 1068 1069 return 0; 1070 } 1071 1072 static int __init grab_mblocks(struct mdesc_handle *md) 1073 { 1074 unsigned long paddr; 1075 int count = 0; 1076 u64 node; 1077 1078 mdesc_for_each_node_by_name(md, node, "mblock") 1079 count++; 1080 if (!count) 1081 return -ENOENT; 1082 1083 paddr = memblock_alloc(count * sizeof(struct mdesc_mblock), 1084 SMP_CACHE_BYTES); 1085 if (!paddr) 1086 return -ENOMEM; 1087 1088 mblocks = __va(paddr); 1089 num_mblocks = count; 1090 1091 count = 0; 1092 mdesc_for_each_node_by_name(md, node, "mblock") { 1093 struct mdesc_mblock *m = &mblocks[count++]; 1094 const u64 *val; 1095 1096 val = mdesc_get_property(md, node, "base", NULL); 1097 m->base = *val; 1098 val = mdesc_get_property(md, node, "size", NULL); 1099 m->size = *val; 1100 val = mdesc_get_property(md, node, 1101 "address-congruence-offset", NULL); 1102 1103 /* The address-congruence-offset property is optional. 1104 * Explicity zero it be identifty this. 
1105 */ 1106 if (val) 1107 m->offset = *val; 1108 else 1109 m->offset = 0UL; 1110 1111 numadbg("MBLOCK[%d]: base[%llx] size[%llx] offset[%llx]\n", 1112 count - 1, m->base, m->size, m->offset); 1113 } 1114 1115 return 0; 1116 } 1117 1118 static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md, 1119 u64 grp, cpumask_t *mask) 1120 { 1121 u64 arc; 1122 1123 cpumask_clear(mask); 1124 1125 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) { 1126 u64 target = mdesc_arc_target(md, arc); 1127 const char *name = mdesc_node_name(md, target); 1128 const u64 *id; 1129 1130 if (strcmp(name, "cpu")) 1131 continue; 1132 id = mdesc_get_property(md, target, "id", NULL); 1133 if (*id < nr_cpu_ids) 1134 cpumask_set_cpu(*id, mask); 1135 } 1136 } 1137 1138 static struct mdesc_mlgroup * __init find_mlgroup(u64 node) 1139 { 1140 int i; 1141 1142 for (i = 0; i < num_mlgroups; i++) { 1143 struct mdesc_mlgroup *m = &mlgroups[i]; 1144 if (m->node == node) 1145 return m; 1146 } 1147 return NULL; 1148 } 1149 1150 int __node_distance(int from, int to) 1151 { 1152 if ((from >= MAX_NUMNODES) || (to >= MAX_NUMNODES)) { 1153 pr_warn("Returning default NUMA distance value for %d->%d\n", 1154 from, to); 1155 return (from == to) ? LOCAL_DISTANCE : REMOTE_DISTANCE; 1156 } 1157 return numa_latency[from][to]; 1158 } 1159 1160 static int find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp) 1161 { 1162 int i; 1163 1164 for (i = 0; i < MAX_NUMNODES; i++) { 1165 struct node_mem_mask *n = &node_masks[i]; 1166 1167 if ((grp->mask == n->mask) && (grp->match == n->val)) 1168 break; 1169 } 1170 return i; 1171 } 1172 1173 static void find_numa_latencies_for_group(struct mdesc_handle *md, u64 grp, 1174 int index) 1175 { 1176 u64 arc; 1177 1178 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1179 int tnode; 1180 u64 target = mdesc_arc_target(md, arc); 1181 struct mdesc_mlgroup *m = find_mlgroup(target); 1182 1183 if (!m) 1184 continue; 1185 tnode = find_best_numa_node_for_mlgroup(m); 1186 if (tnode == MAX_NUMNODES) 1187 continue; 1188 numa_latency[index][tnode] = m->latency; 1189 } 1190 } 1191 1192 static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp, 1193 int index) 1194 { 1195 struct mdesc_mlgroup *candidate = NULL; 1196 u64 arc, best_latency = ~(u64)0; 1197 struct node_mem_mask *n; 1198 1199 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1200 u64 target = mdesc_arc_target(md, arc); 1201 struct mdesc_mlgroup *m = find_mlgroup(target); 1202 if (!m) 1203 continue; 1204 if (m->latency < best_latency) { 1205 candidate = m; 1206 best_latency = m->latency; 1207 } 1208 } 1209 if (!candidate) 1210 return -ENOENT; 1211 1212 if (num_node_masks != index) { 1213 printk(KERN_ERR "Inconsistent NUMA state, " 1214 "index[%d] != num_node_masks[%d]\n", 1215 index, num_node_masks); 1216 return -EINVAL; 1217 } 1218 1219 n = &node_masks[num_node_masks++]; 1220 1221 n->mask = candidate->mask; 1222 n->val = candidate->match; 1223 1224 numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n", 1225 index, n->mask, n->val, candidate->latency); 1226 1227 return 0; 1228 } 1229 1230 static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp, 1231 int index) 1232 { 1233 cpumask_t mask; 1234 int cpu; 1235 1236 numa_parse_mdesc_group_cpus(md, grp, &mask); 1237 1238 for_each_cpu(cpu, &mask) 1239 numa_cpu_lookup_table[cpu] = index; 1240 cpumask_copy(&numa_cpumask_lookup_table[index], &mask); 1241 1242 if (numa_debug) { 1243 printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index); 1244 
for_each_cpu(cpu, &mask) 1245 printk("%d ", cpu); 1246 printk("]\n"); 1247 } 1248 1249 return numa_attach_mlgroup(md, grp, index); 1250 } 1251 1252 static int __init numa_parse_mdesc(void) 1253 { 1254 struct mdesc_handle *md = mdesc_grab(); 1255 int i, j, err, count; 1256 u64 node; 1257 1258 node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups"); 1259 if (node == MDESC_NODE_NULL) { 1260 mdesc_release(md); 1261 return -ENOENT; 1262 } 1263 1264 err = grab_mblocks(md); 1265 if (err < 0) 1266 goto out; 1267 1268 err = grab_mlgroups(md); 1269 if (err < 0) 1270 goto out; 1271 1272 count = 0; 1273 mdesc_for_each_node_by_name(md, node, "group") { 1274 err = numa_parse_mdesc_group(md, node, count); 1275 if (err < 0) 1276 break; 1277 count++; 1278 } 1279 1280 count = 0; 1281 mdesc_for_each_node_by_name(md, node, "group") { 1282 find_numa_latencies_for_group(md, node, count); 1283 count++; 1284 } 1285 1286 /* Normalize numa latency matrix according to ACPI SLIT spec. */ 1287 for (i = 0; i < MAX_NUMNODES; i++) { 1288 u64 self_latency = numa_latency[i][i]; 1289 1290 for (j = 0; j < MAX_NUMNODES; j++) { 1291 numa_latency[i][j] = 1292 (numa_latency[i][j] * LOCAL_DISTANCE) / 1293 self_latency; 1294 } 1295 } 1296 1297 add_node_ranges(); 1298 1299 for (i = 0; i < num_node_masks; i++) { 1300 allocate_node_data(i); 1301 node_set_online(i); 1302 } 1303 1304 err = 0; 1305 out: 1306 mdesc_release(md); 1307 return err; 1308 } 1309 1310 static int __init numa_parse_jbus(void) 1311 { 1312 unsigned long cpu, index; 1313 1314 /* NUMA node id is encoded in bits 36 and higher, and there is 1315 * a 1-to-1 mapping from CPU ID to NUMA node ID. 1316 */ 1317 index = 0; 1318 for_each_present_cpu(cpu) { 1319 numa_cpu_lookup_table[cpu] = index; 1320 cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu)); 1321 node_masks[index].mask = ~((1UL << 36UL) - 1UL); 1322 node_masks[index].val = cpu << 36UL; 1323 1324 index++; 1325 } 1326 num_node_masks = index; 1327 1328 add_node_ranges(); 1329 1330 for (index = 0; index < num_node_masks; index++) { 1331 allocate_node_data(index); 1332 node_set_online(index); 1333 } 1334 1335 return 0; 1336 } 1337 1338 static int __init numa_parse_sun4u(void) 1339 { 1340 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 1341 unsigned long ver; 1342 1343 __asm__ ("rdpr %%ver, %0" : "=r" (ver)); 1344 if ((ver >> 32UL) == __JALAPENO_ID || 1345 (ver >> 32UL) == __SERRANO_ID) 1346 return numa_parse_jbus(); 1347 } 1348 return -1; 1349 } 1350 1351 static int __init bootmem_init_numa(void) 1352 { 1353 int i, j; 1354 int err = -1; 1355 1356 numadbg("bootmem_init_numa()\n"); 1357 1358 /* Some sane defaults for numa latency values */ 1359 for (i = 0; i < MAX_NUMNODES; i++) { 1360 for (j = 0; j < MAX_NUMNODES; j++) 1361 numa_latency[i][j] = (i == j) ? 
1362 LOCAL_DISTANCE : REMOTE_DISTANCE; 1363 } 1364 1365 if (numa_enabled) { 1366 if (tlb_type == hypervisor) 1367 err = numa_parse_mdesc(); 1368 else 1369 err = numa_parse_sun4u(); 1370 } 1371 return err; 1372 } 1373 1374 #else 1375 1376 static int bootmem_init_numa(void) 1377 { 1378 return -1; 1379 } 1380 1381 #endif 1382 1383 static void __init bootmem_init_nonnuma(void) 1384 { 1385 unsigned long top_of_ram = memblock_end_of_DRAM(); 1386 unsigned long total_ram = memblock_phys_mem_size(); 1387 1388 numadbg("bootmem_init_nonnuma()\n"); 1389 1390 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", 1391 top_of_ram, total_ram); 1392 printk(KERN_INFO "Memory hole size: %ldMB\n", 1393 (top_of_ram - total_ram) >> 20); 1394 1395 init_node_masks_nonnuma(); 1396 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); 1397 allocate_node_data(0); 1398 node_set_online(0); 1399 } 1400 1401 static unsigned long __init bootmem_init(unsigned long phys_base) 1402 { 1403 unsigned long end_pfn; 1404 1405 end_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1406 max_pfn = max_low_pfn = end_pfn; 1407 min_low_pfn = (phys_base >> PAGE_SHIFT); 1408 1409 if (bootmem_init_numa() < 0) 1410 bootmem_init_nonnuma(); 1411 1412 /* Dump memblock with node info. */ 1413 memblock_dump_all(); 1414 1415 /* XXX cpu notifier XXX */ 1416 1417 sparse_memory_present_with_active_regions(MAX_NUMNODES); 1418 sparse_init(); 1419 1420 return end_pfn; 1421 } 1422 1423 static struct linux_prom64_registers pall[MAX_BANKS] __initdata; 1424 static int pall_ents __initdata; 1425 1426 static unsigned long max_phys_bits = 40; 1427 1428 bool kern_addr_valid(unsigned long addr) 1429 { 1430 pgd_t *pgd; 1431 pud_t *pud; 1432 pmd_t *pmd; 1433 pte_t *pte; 1434 1435 if ((long)addr < 0L) { 1436 unsigned long pa = __pa(addr); 1437 1438 if ((addr >> max_phys_bits) != 0UL) 1439 return false; 1440 1441 return pfn_valid(pa >> PAGE_SHIFT); 1442 } 1443 1444 if (addr >= (unsigned long) KERNBASE && 1445 addr < (unsigned long)&_end) 1446 return true; 1447 1448 pgd = pgd_offset_k(addr); 1449 if (pgd_none(*pgd)) 1450 return 0; 1451 1452 pud = pud_offset(pgd, addr); 1453 if (pud_none(*pud)) 1454 return 0; 1455 1456 if (pud_large(*pud)) 1457 return pfn_valid(pud_pfn(*pud)); 1458 1459 pmd = pmd_offset(pud, addr); 1460 if (pmd_none(*pmd)) 1461 return 0; 1462 1463 if (pmd_large(*pmd)) 1464 return pfn_valid(pmd_pfn(*pmd)); 1465 1466 pte = pte_offset_kernel(pmd, addr); 1467 if (pte_none(*pte)) 1468 return 0; 1469 1470 return pfn_valid(pte_pfn(*pte)); 1471 } 1472 EXPORT_SYMBOL(kern_addr_valid); 1473 1474 static unsigned long __ref kernel_map_hugepud(unsigned long vstart, 1475 unsigned long vend, 1476 pud_t *pud) 1477 { 1478 const unsigned long mask16gb = (1UL << 34) - 1UL; 1479 u64 pte_val = vstart; 1480 1481 /* Each PUD is 8GB */ 1482 if ((vstart & mask16gb) || 1483 (vend - vstart <= mask16gb)) { 1484 pte_val ^= kern_linear_pte_xor[2]; 1485 pud_val(*pud) = pte_val | _PAGE_PUD_HUGE; 1486 1487 return vstart + PUD_SIZE; 1488 } 1489 1490 pte_val ^= kern_linear_pte_xor[3]; 1491 pte_val |= _PAGE_PUD_HUGE; 1492 1493 vend = vstart + mask16gb + 1UL; 1494 while (vstart < vend) { 1495 pud_val(*pud) = pte_val; 1496 1497 pte_val += PUD_SIZE; 1498 vstart += PUD_SIZE; 1499 pud++; 1500 } 1501 return vstart; 1502 } 1503 1504 static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend, 1505 bool guard) 1506 { 1507 if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= PUD_SIZE) 1508 return true; 1509 1510 return false; 1511 } 1512 1513 static unsigned long 
__ref kernel_map_hugepmd(unsigned long vstart, 1514 unsigned long vend, 1515 pmd_t *pmd) 1516 { 1517 const unsigned long mask256mb = (1UL << 28) - 1UL; 1518 const unsigned long mask2gb = (1UL << 31) - 1UL; 1519 u64 pte_val = vstart; 1520 1521 /* Each PMD is 8MB */ 1522 if ((vstart & mask256mb) || 1523 (vend - vstart <= mask256mb)) { 1524 pte_val ^= kern_linear_pte_xor[0]; 1525 pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE; 1526 1527 return vstart + PMD_SIZE; 1528 } 1529 1530 if ((vstart & mask2gb) || 1531 (vend - vstart <= mask2gb)) { 1532 pte_val ^= kern_linear_pte_xor[1]; 1533 pte_val |= _PAGE_PMD_HUGE; 1534 vend = vstart + mask256mb + 1UL; 1535 } else { 1536 pte_val ^= kern_linear_pte_xor[2]; 1537 pte_val |= _PAGE_PMD_HUGE; 1538 vend = vstart + mask2gb + 1UL; 1539 } 1540 1541 while (vstart < vend) { 1542 pmd_val(*pmd) = pte_val; 1543 1544 pte_val += PMD_SIZE; 1545 vstart += PMD_SIZE; 1546 pmd++; 1547 } 1548 1549 return vstart; 1550 } 1551 1552 static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend, 1553 bool guard) 1554 { 1555 if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE) 1556 return true; 1557 1558 return false; 1559 } 1560 1561 static unsigned long __ref kernel_map_range(unsigned long pstart, 1562 unsigned long pend, pgprot_t prot, 1563 bool use_huge) 1564 { 1565 unsigned long vstart = PAGE_OFFSET + pstart; 1566 unsigned long vend = PAGE_OFFSET + pend; 1567 unsigned long alloc_bytes = 0UL; 1568 1569 if ((vstart & ~PAGE_MASK) || (vend & ~PAGE_MASK)) { 1570 prom_printf("kernel_map: Unaligned physmem[%lx:%lx]\n", 1571 vstart, vend); 1572 prom_halt(); 1573 } 1574 1575 while (vstart < vend) { 1576 unsigned long this_end, paddr = __pa(vstart); 1577 pgd_t *pgd = pgd_offset_k(vstart); 1578 pud_t *pud; 1579 pmd_t *pmd; 1580 pte_t *pte; 1581 1582 if (pgd_none(*pgd)) { 1583 pud_t *new; 1584 1585 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); 1586 alloc_bytes += PAGE_SIZE; 1587 pgd_populate(&init_mm, pgd, new); 1588 } 1589 pud = pud_offset(pgd, vstart); 1590 if (pud_none(*pud)) { 1591 pmd_t *new; 1592 1593 if (kernel_can_map_hugepud(vstart, vend, use_huge)) { 1594 vstart = kernel_map_hugepud(vstart, vend, pud); 1595 continue; 1596 } 1597 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); 1598 alloc_bytes += PAGE_SIZE; 1599 pud_populate(&init_mm, pud, new); 1600 } 1601 1602 pmd = pmd_offset(pud, vstart); 1603 if (pmd_none(*pmd)) { 1604 pte_t *new; 1605 1606 if (kernel_can_map_hugepmd(vstart, vend, use_huge)) { 1607 vstart = kernel_map_hugepmd(vstart, vend, pmd); 1608 continue; 1609 } 1610 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); 1611 alloc_bytes += PAGE_SIZE; 1612 pmd_populate_kernel(&init_mm, pmd, new); 1613 } 1614 1615 pte = pte_offset_kernel(pmd, vstart); 1616 this_end = (vstart + PMD_SIZE) & PMD_MASK; 1617 if (this_end > vend) 1618 this_end = vend; 1619 1620 while (vstart < this_end) { 1621 pte_val(*pte) = (paddr | pgprot_val(prot)); 1622 1623 vstart += PAGE_SIZE; 1624 paddr += PAGE_SIZE; 1625 pte++; 1626 } 1627 } 1628 1629 return alloc_bytes; 1630 } 1631 1632 static void __init flush_all_kernel_tsbs(void) 1633 { 1634 int i; 1635 1636 for (i = 0; i < KERNEL_TSB_NENTRIES; i++) { 1637 struct tsb *ent = &swapper_tsb[i]; 1638 1639 ent->tag = (1UL << TSB_TAG_INVALID_BIT); 1640 } 1641 #ifndef CONFIG_DEBUG_PAGEALLOC 1642 for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) { 1643 struct tsb *ent = &swapper_4m_tsb[i]; 1644 1645 ent->tag = (1UL << TSB_TAG_INVALID_BIT); 1646 } 1647 #endif 1648 } 1649 1650 extern unsigned int kvmap_linear_patch[1]; 
1651 1652 static void __init kernel_physical_mapping_init(void) 1653 { 1654 unsigned long i, mem_alloced = 0UL; 1655 bool use_huge = true; 1656 1657 #ifdef CONFIG_DEBUG_PAGEALLOC 1658 use_huge = false; 1659 #endif 1660 for (i = 0; i < pall_ents; i++) { 1661 unsigned long phys_start, phys_end; 1662 1663 phys_start = pall[i].phys_addr; 1664 phys_end = phys_start + pall[i].reg_size; 1665 1666 mem_alloced += kernel_map_range(phys_start, phys_end, 1667 PAGE_KERNEL, use_huge); 1668 } 1669 1670 printk("Allocated %ld bytes for kernel page tables.\n", 1671 mem_alloced); 1672 1673 kvmap_linear_patch[0] = 0x01000000; /* nop */ 1674 flushi(&kvmap_linear_patch[0]); 1675 1676 flush_all_kernel_tsbs(); 1677 1678 __flush_tlb_all(); 1679 } 1680 1681 #ifdef CONFIG_DEBUG_PAGEALLOC 1682 void __kernel_map_pages(struct page *page, int numpages, int enable) 1683 { 1684 unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; 1685 unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); 1686 1687 kernel_map_range(phys_start, phys_end, 1688 (enable ? PAGE_KERNEL : __pgprot(0)), false); 1689 1690 flush_tsb_kernel_range(PAGE_OFFSET + phys_start, 1691 PAGE_OFFSET + phys_end); 1692 1693 /* we should perform an IPI and flush all tlbs, 1694 * but that can deadlock->flush only current cpu. 1695 */ 1696 __flush_tlb_kernel_range(PAGE_OFFSET + phys_start, 1697 PAGE_OFFSET + phys_end); 1698 } 1699 #endif 1700 1701 unsigned long __init find_ecache_flush_span(unsigned long size) 1702 { 1703 int i; 1704 1705 for (i = 0; i < pavail_ents; i++) { 1706 if (pavail[i].reg_size >= size) 1707 return pavail[i].phys_addr; 1708 } 1709 1710 return ~0UL; 1711 } 1712 1713 unsigned long PAGE_OFFSET; 1714 EXPORT_SYMBOL(PAGE_OFFSET); 1715 1716 unsigned long VMALLOC_END = 0x0000010000000000UL; 1717 EXPORT_SYMBOL(VMALLOC_END); 1718 1719 unsigned long sparc64_va_hole_top = 0xfffff80000000000UL; 1720 unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL; 1721 1722 static void __init setup_page_offset(void) 1723 { 1724 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 1725 /* Cheetah/Panther support a full 64-bit virtual 1726 * address, so we can use all that our page tables 1727 * support. 1728 */ 1729 sparc64_va_hole_top = 0xfff0000000000000UL; 1730 sparc64_va_hole_bottom = 0x0010000000000000UL; 1731 1732 max_phys_bits = 42; 1733 } else if (tlb_type == hypervisor) { 1734 switch (sun4v_chip_type) { 1735 case SUN4V_CHIP_NIAGARA1: 1736 case SUN4V_CHIP_NIAGARA2: 1737 /* T1 and T2 support 48-bit virtual addresses. */ 1738 sparc64_va_hole_top = 0xffff800000000000UL; 1739 sparc64_va_hole_bottom = 0x0000800000000000UL; 1740 1741 max_phys_bits = 39; 1742 break; 1743 case SUN4V_CHIP_NIAGARA3: 1744 /* T3 supports 48-bit virtual addresses. */ 1745 sparc64_va_hole_top = 0xffff800000000000UL; 1746 sparc64_va_hole_bottom = 0x0000800000000000UL; 1747 1748 max_phys_bits = 43; 1749 break; 1750 case SUN4V_CHIP_NIAGARA4: 1751 case SUN4V_CHIP_NIAGARA5: 1752 case SUN4V_CHIP_SPARC64X: 1753 case SUN4V_CHIP_SPARC_M6: 1754 /* T4 and later support 52-bit virtual addresses. */ 1755 sparc64_va_hole_top = 0xfff8000000000000UL; 1756 sparc64_va_hole_bottom = 0x0008000000000000UL; 1757 max_phys_bits = 47; 1758 break; 1759 case SUN4V_CHIP_SPARC_M7: 1760 case SUN4V_CHIP_SPARC_SN: 1761 default: 1762 /* M7 and later support 52-bit virtual addresses. 
*/ 1763 sparc64_va_hole_top = 0xfff8000000000000UL; 1764 sparc64_va_hole_bottom = 0x0008000000000000UL; 1765 max_phys_bits = 49; 1766 break; 1767 } 1768 } 1769 1770 if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) { 1771 prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n", 1772 max_phys_bits); 1773 prom_halt(); 1774 } 1775 1776 PAGE_OFFSET = sparc64_va_hole_top; 1777 VMALLOC_END = ((sparc64_va_hole_bottom >> 1) + 1778 (sparc64_va_hole_bottom >> 2)); 1779 1780 pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n", 1781 PAGE_OFFSET, max_phys_bits); 1782 pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n", 1783 VMALLOC_START, VMALLOC_END); 1784 pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n", 1785 VMEMMAP_BASE, VMEMMAP_BASE << 1); 1786 } 1787 1788 static void __init tsb_phys_patch(void) 1789 { 1790 struct tsb_ldquad_phys_patch_entry *pquad; 1791 struct tsb_phys_patch_entry *p; 1792 1793 pquad = &__tsb_ldquad_phys_patch; 1794 while (pquad < &__tsb_ldquad_phys_patch_end) { 1795 unsigned long addr = pquad->addr; 1796 1797 if (tlb_type == hypervisor) 1798 *(unsigned int *) addr = pquad->sun4v_insn; 1799 else 1800 *(unsigned int *) addr = pquad->sun4u_insn; 1801 wmb(); 1802 __asm__ __volatile__("flush %0" 1803 : /* no outputs */ 1804 : "r" (addr)); 1805 1806 pquad++; 1807 } 1808 1809 p = &__tsb_phys_patch; 1810 while (p < &__tsb_phys_patch_end) { 1811 unsigned long addr = p->addr; 1812 1813 *(unsigned int *) addr = p->insn; 1814 wmb(); 1815 __asm__ __volatile__("flush %0" 1816 : /* no outputs */ 1817 : "r" (addr)); 1818 1819 p++; 1820 } 1821 } 1822 1823 /* Don't mark as init, we give this to the Hypervisor. */ 1824 #ifndef CONFIG_DEBUG_PAGEALLOC 1825 #define NUM_KTSB_DESCR 2 1826 #else 1827 #define NUM_KTSB_DESCR 1 1828 #endif 1829 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR]; 1830 1831 /* The swapper TSBs are loaded with a base sequence of: 1832 * 1833 * sethi %uhi(SYMBOL), REG1 1834 * sethi %hi(SYMBOL), REG2 1835 * or REG1, %ulo(SYMBOL), REG1 1836 * or REG2, %lo(SYMBOL), REG2 1837 * sllx REG1, 32, REG1 1838 * or REG1, REG2, REG1 1839 * 1840 * When we use physical addressing for the TSB accesses, we patch the 1841 * first four instructions in the above sequence. 
1842 */ 1843 1844 static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa) 1845 { 1846 unsigned long high_bits, low_bits; 1847 1848 high_bits = (pa >> 32) & 0xffffffff; 1849 low_bits = (pa >> 0) & 0xffffffff; 1850 1851 while (start < end) { 1852 unsigned int *ia = (unsigned int *)(unsigned long)*start; 1853 1854 ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10); 1855 __asm__ __volatile__("flush %0" : : "r" (ia)); 1856 1857 ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10); 1858 __asm__ __volatile__("flush %0" : : "r" (ia + 1)); 1859 1860 ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff); 1861 __asm__ __volatile__("flush %0" : : "r" (ia + 2)); 1862 1863 ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff); 1864 __asm__ __volatile__("flush %0" : : "r" (ia + 3)); 1865 1866 start++; 1867 } 1868 } 1869 1870 static void ktsb_phys_patch(void) 1871 { 1872 extern unsigned int __swapper_tsb_phys_patch; 1873 extern unsigned int __swapper_tsb_phys_patch_end; 1874 unsigned long ktsb_pa; 1875 1876 ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE); 1877 patch_one_ktsb_phys(&__swapper_tsb_phys_patch, 1878 &__swapper_tsb_phys_patch_end, ktsb_pa); 1879 #ifndef CONFIG_DEBUG_PAGEALLOC 1880 { 1881 extern unsigned int __swapper_4m_tsb_phys_patch; 1882 extern unsigned int __swapper_4m_tsb_phys_patch_end; 1883 ktsb_pa = (kern_base + 1884 ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); 1885 patch_one_ktsb_phys(&__swapper_4m_tsb_phys_patch, 1886 &__swapper_4m_tsb_phys_patch_end, ktsb_pa); 1887 } 1888 #endif 1889 } 1890 1891 static void __init sun4v_ktsb_init(void) 1892 { 1893 unsigned long ktsb_pa; 1894 1895 /* First KTSB for PAGE_SIZE mappings. */ 1896 ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE); 1897 1898 switch (PAGE_SIZE) { 1899 case 8 * 1024: 1900 default: 1901 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_8K; 1902 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_8K; 1903 break; 1904 1905 case 64 * 1024: 1906 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_64K; 1907 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_64K; 1908 break; 1909 1910 case 512 * 1024: 1911 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_512K; 1912 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_512K; 1913 break; 1914 1915 case 4 * 1024 * 1024: 1916 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_4MB; 1917 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_4MB; 1918 break; 1919 } 1920 1921 ktsb_descr[0].assoc = 1; 1922 ktsb_descr[0].num_ttes = KERNEL_TSB_NENTRIES; 1923 ktsb_descr[0].ctx_idx = 0; 1924 ktsb_descr[0].tsb_base = ktsb_pa; 1925 ktsb_descr[0].resv = 0; 1926 1927 #ifndef CONFIG_DEBUG_PAGEALLOC 1928 /* Second KTSB for 4MB/256MB/2GB/16GB mappings. 
*/ 1929 ktsb_pa = (kern_base + 1930 ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); 1931 1932 ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB; 1933 ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB | 1934 HV_PGSZ_MASK_256MB | 1935 HV_PGSZ_MASK_2GB | 1936 HV_PGSZ_MASK_16GB) & 1937 cpu_pgsz_mask); 1938 ktsb_descr[1].assoc = 1; 1939 ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES; 1940 ktsb_descr[1].ctx_idx = 0; 1941 ktsb_descr[1].tsb_base = ktsb_pa; 1942 ktsb_descr[1].resv = 0; 1943 #endif 1944 } 1945 1946 void sun4v_ktsb_register(void) 1947 { 1948 unsigned long pa, ret; 1949 1950 pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE); 1951 1952 ret = sun4v_mmu_tsb_ctx0(NUM_KTSB_DESCR, pa); 1953 if (ret != 0) { 1954 prom_printf("hypervisor_mmu_tsb_ctx0[%lx]: " 1955 "errors with %lx\n", pa, ret); 1956 prom_halt(); 1957 } 1958 } 1959 1960 static void __init sun4u_linear_pte_xor_finalize(void) 1961 { 1962 #ifndef CONFIG_DEBUG_PAGEALLOC 1963 /* This is where we would add Panther support for 1964 * 32MB and 256MB pages. 1965 */ 1966 #endif 1967 } 1968 1969 static void __init sun4v_linear_pte_xor_finalize(void) 1970 { 1971 unsigned long pagecv_flag; 1972 1973 /* Bit 9 of TTE is no longer CV bit on M7 processor and it instead 1974 * enables MCD error. Do not set bit 9 on M7 processor. 1975 */ 1976 switch (sun4v_chip_type) { 1977 case SUN4V_CHIP_SPARC_M7: 1978 case SUN4V_CHIP_SPARC_SN: 1979 pagecv_flag = 0x00; 1980 break; 1981 default: 1982 pagecv_flag = _PAGE_CV_4V; 1983 break; 1984 } 1985 #ifndef CONFIG_DEBUG_PAGEALLOC 1986 if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) { 1987 kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ 1988 PAGE_OFFSET; 1989 kern_linear_pte_xor[1] |= (_PAGE_CP_4V | pagecv_flag | 1990 _PAGE_P_4V | _PAGE_W_4V); 1991 } else { 1992 kern_linear_pte_xor[1] = kern_linear_pte_xor[0]; 1993 } 1994 1995 if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) { 1996 kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^ 1997 PAGE_OFFSET; 1998 kern_linear_pte_xor[2] |= (_PAGE_CP_4V | pagecv_flag | 1999 _PAGE_P_4V | _PAGE_W_4V); 2000 } else { 2001 kern_linear_pte_xor[2] = kern_linear_pte_xor[1]; 2002 } 2003 2004 if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) { 2005 kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^ 2006 PAGE_OFFSET; 2007 kern_linear_pte_xor[3] |= (_PAGE_CP_4V | pagecv_flag | 2008 _PAGE_P_4V | _PAGE_W_4V); 2009 } else { 2010 kern_linear_pte_xor[3] = kern_linear_pte_xor[2]; 2011 } 2012 #endif 2013 } 2014 2015 /* paging_init() sets up the page tables */ 2016 2017 static unsigned long last_valid_pfn; 2018 2019 static void sun4u_pgprot_init(void); 2020 static void sun4v_pgprot_init(void); 2021 2022 static phys_addr_t __init available_memory(void) 2023 { 2024 phys_addr_t available = 0ULL; 2025 phys_addr_t pa_start, pa_end; 2026 u64 i; 2027 2028 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, 2029 &pa_end, NULL) 2030 available = available + (pa_end - pa_start); 2031 2032 return available; 2033 } 2034 2035 #define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U) 2036 #define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V) 2037 #define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U) 2038 #define __DIRTY_BITS_4V (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V) 2039 #define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R) 2040 #define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R) 2041 2042 /* We need to exclude reserved regions. This exclusion will include 2043 * vmlinux and initrd. 
To be more precise the initrd size could be used to 2044 * compute a new lower limit because it is freed later during initialization. 2045 */ 2046 static void __init reduce_memory(phys_addr_t limit_ram) 2047 { 2048 phys_addr_t avail_ram = available_memory(); 2049 phys_addr_t pa_start, pa_end; 2050 u64 i; 2051 2052 if (limit_ram >= avail_ram) 2053 return; 2054 2055 for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, 2056 &pa_end, NULL) { 2057 phys_addr_t region_size = pa_end - pa_start; 2058 phys_addr_t clip_start = pa_start; 2059 2060 avail_ram = avail_ram - region_size; 2061 /* Are we consuming too much? */ 2062 if (avail_ram < limit_ram) { 2063 phys_addr_t give_back = limit_ram - avail_ram; 2064 2065 region_size = region_size - give_back; 2066 clip_start = clip_start + give_back; 2067 } 2068 2069 memblock_remove(clip_start, region_size); 2070 2071 if (avail_ram <= limit_ram) 2072 break; 2073 i = 0UL; 2074 } 2075 } 2076 2077 void __init paging_init(void) 2078 { 2079 unsigned long end_pfn, shift, phys_base; 2080 unsigned long real_end, i; 2081 int node; 2082 2083 setup_page_offset(); 2084 2085 /* These build time checkes make sure that the dcache_dirty_cpu() 2086 * page->flags usage will work. 2087 * 2088 * When a page gets marked as dcache-dirty, we store the 2089 * cpu number starting at bit 32 in the page->flags. Also, 2090 * functions like clear_dcache_dirty_cpu use the cpu mask 2091 * in 13-bit signed-immediate instruction fields. 2092 */ 2093 2094 /* 2095 * Page flags must not reach into upper 32 bits that are used 2096 * for the cpu number 2097 */ 2098 BUILD_BUG_ON(NR_PAGEFLAGS > 32); 2099 2100 /* 2101 * The bit fields placed in the high range must not reach below 2102 * the 32 bit boundary. Otherwise we cannot place the cpu field 2103 * at the 32 bit boundary. 2104 */ 2105 BUILD_BUG_ON(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + 2106 ilog2(roundup_pow_of_two(NR_CPUS)) > 32); 2107 2108 BUILD_BUG_ON(NR_CPUS > 4096); 2109 2110 kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB; 2111 kern_size = (unsigned long)&_end - (unsigned long)KERNBASE; 2112 2113 /* Invalidate both kernel TSBs. */ 2114 memset(swapper_tsb, 0x40, sizeof(swapper_tsb)); 2115 #ifndef CONFIG_DEBUG_PAGEALLOC 2116 memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); 2117 #endif 2118 2119 /* TTE.cv bit on sparc v9 occupies the same position as TTE.mcde 2120 * bit on M7 processor. This is a conflicting usage of the same 2121 * bit. Enabling TTE.cv on M7 would turn on Memory Corruption 2122 * Detection error on all pages and this will lead to problems 2123 * later. Kernel does not run with MCD enabled and hence rest 2124 * of the required steps to fully configure memory corruption 2125 * detection are not taken. We need to ensure TTE.mcde is not 2126 * set on M7 processor. Compute the value of cacheability 2127 * flag for use later taking this into consideration. 2128 */ 2129 switch (sun4v_chip_type) { 2130 case SUN4V_CHIP_SPARC_M7: 2131 case SUN4V_CHIP_SPARC_SN: 2132 page_cache4v_flag = _PAGE_CP_4V; 2133 break; 2134 default: 2135 page_cache4v_flag = _PAGE_CACHE_4V; 2136 break; 2137 } 2138 2139 if (tlb_type == hypervisor) 2140 sun4v_pgprot_init(); 2141 else 2142 sun4u_pgprot_init(); 2143 2144 if (tlb_type == cheetah_plus || 2145 tlb_type == hypervisor) { 2146 tsb_phys_patch(); 2147 ktsb_phys_patch(); 2148 } 2149 2150 if (tlb_type == hypervisor) 2151 sun4v_patch_tlb_handlers(); 2152 2153 /* Find available physical memory... 
2154 * 2155 * Read it twice in order to work around a bug in openfirmware. 2156 * The call to grab this table itself can cause openfirmware to 2157 * allocate memory, which in turn can take away some space from 2158 * the list of available memory. Reading it twice makes sure 2159 * we really do get the final value. 2160 */ 2161 read_obp_translations(); 2162 read_obp_memory("reg", &pall[0], &pall_ents); 2163 read_obp_memory("available", &pavail[0], &pavail_ents); 2164 read_obp_memory("available", &pavail[0], &pavail_ents); 2165 2166 phys_base = 0xffffffffffffffffUL; 2167 for (i = 0; i < pavail_ents; i++) { 2168 phys_base = min(phys_base, pavail[i].phys_addr); 2169 memblock_add(pavail[i].phys_addr, pavail[i].reg_size); 2170 } 2171 2172 memblock_reserve(kern_base, kern_size); 2173 2174 find_ramdisk(phys_base); 2175 2176 if (cmdline_memory_size) 2177 reduce_memory(cmdline_memory_size); 2178 2179 memblock_allow_resize(); 2180 memblock_dump_all(); 2181 2182 set_bit(0, mmu_context_bmap); 2183 2184 shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE); 2185 2186 real_end = (unsigned long)_end; 2187 num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB); 2188 printk("Kernel: Using %d locked TLB entries for main kernel image.\n", 2189 num_kernel_image_mappings); 2190 2191 /* Set kernel pgd to upper alias so physical page computations 2192 * work. 2193 */ 2194 init_mm.pgd += ((shift) / (sizeof(pgd_t))); 2195 2196 memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); 2197 2198 inherit_prom_mappings(); 2199 2200 /* Ok, we can use our TLB miss and window trap handlers safely. */ 2201 setup_tba(); 2202 2203 __flush_tlb_all(); 2204 2205 prom_build_devicetree(); 2206 of_populate_present_mask(); 2207 #ifndef CONFIG_SMP 2208 of_fill_in_cpu_data(); 2209 #endif 2210 2211 if (tlb_type == hypervisor) { 2212 sun4v_mdesc_init(); 2213 mdesc_populate_present_mask(cpu_all_mask); 2214 #ifndef CONFIG_SMP 2215 mdesc_fill_in_cpu_data(cpu_all_mask); 2216 #endif 2217 mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask); 2218 2219 sun4v_linear_pte_xor_finalize(); 2220 2221 sun4v_ktsb_init(); 2222 sun4v_ktsb_register(); 2223 } else { 2224 unsigned long impl, ver; 2225 2226 cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K | 2227 HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB); 2228 2229 __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver)); 2230 impl = ((ver >> 32) & 0xffff); 2231 if (impl == PANTHER_IMPL) 2232 cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB | 2233 HV_PGSZ_MASK_256MB); 2234 2235 sun4u_linear_pte_xor_finalize(); 2236 } 2237 2238 /* Flush the TLBs and the 4M TSB so that the updated linear 2239 * pte XOR settings are realized for all mappings. 2240 */ 2241 __flush_tlb_all(); 2242 #ifndef CONFIG_DEBUG_PAGEALLOC 2243 memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); 2244 #endif 2245 __flush_tlb_all(); 2246 2247 /* Setup bootmem... */ 2248 last_valid_pfn = end_pfn = bootmem_init(phys_base); 2249 2250 /* Once the OF device tree and MDESC have been setup, we know 2251 * the list of possible cpus. Therefore we can allocate the 2252 * IRQ stacks. 
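 * Each stack is THREAD_SIZE bytes long, THREAD_SIZE aligned, and is
 * allocated from bootmem on the cpu's home NUMA node.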
2253 */ 2254 for_each_possible_cpu(i) { 2255 node = cpu_to_node(i); 2256 2257 softirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node), 2258 THREAD_SIZE, 2259 THREAD_SIZE, 0); 2260 hardirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node), 2261 THREAD_SIZE, 2262 THREAD_SIZE, 0); 2263 } 2264 2265 kernel_physical_mapping_init(); 2266 2267 { 2268 unsigned long max_zone_pfns[MAX_NR_ZONES]; 2269 2270 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 2271 2272 max_zone_pfns[ZONE_NORMAL] = end_pfn; 2273 2274 free_area_init_nodes(max_zone_pfns); 2275 } 2276 2277 printk("Booting Linux...\n"); 2278 } 2279 2280 int page_in_phys_avail(unsigned long paddr) 2281 { 2282 int i; 2283 2284 paddr &= PAGE_MASK; 2285 2286 for (i = 0; i < pavail_ents; i++) { 2287 unsigned long start, end; 2288 2289 start = pavail[i].phys_addr; 2290 end = start + pavail[i].reg_size; 2291 2292 if (paddr >= start && paddr < end) 2293 return 1; 2294 } 2295 if (paddr >= kern_base && paddr < (kern_base + kern_size)) 2296 return 1; 2297 #ifdef CONFIG_BLK_DEV_INITRD 2298 if (paddr >= __pa(initrd_start) && 2299 paddr < __pa(PAGE_ALIGN(initrd_end))) 2300 return 1; 2301 #endif 2302 2303 return 0; 2304 } 2305 2306 static void __init register_page_bootmem_info(void) 2307 { 2308 #ifdef CONFIG_NEED_MULTIPLE_NODES 2309 int i; 2310 2311 for_each_online_node(i) 2312 if (NODE_DATA(i)->node_spanned_pages) 2313 register_page_bootmem_info_node(NODE_DATA(i)); 2314 #endif 2315 } 2316 void __init mem_init(void) 2317 { 2318 high_memory = __va(last_valid_pfn << PAGE_SHIFT); 2319 2320 register_page_bootmem_info(); 2321 free_all_bootmem(); 2322 2323 /* 2324 * Set up the zero page, mark it reserved, so that page count 2325 * is not manipulated when freeing the page from user ptes. 2326 */ 2327 mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); 2328 if (mem_map_zero == NULL) { 2329 prom_printf("paging_init: Cannot alloc zero page.\n"); 2330 prom_halt(); 2331 } 2332 mark_page_reserved(mem_map_zero); 2333 2334 mem_init_print_info(NULL); 2335 2336 if (tlb_type == cheetah || tlb_type == cheetah_plus) 2337 cheetah_ecache_flush_init(); 2338 } 2339 2340 void free_initmem(void) 2341 { 2342 unsigned long addr, initend; 2343 int do_free = 1; 2344 2345 /* If the physical memory maps were trimmed by kernel command 2346 * line options, don't even try freeing this initmem stuff up. 2347 * The kernel image could have been in the trimmed out region 2348 * and if so the freeing below will free invalid page structs. 2349 */ 2350 if (cmdline_memory_size) 2351 do_free = 0; 2352 2353 /* 2354 * The init section is aligned to 8k in vmlinux.lds. Page align for >8k pagesizes. 
2355 */ 2356 addr = PAGE_ALIGN((unsigned long)(__init_begin)); 2357 initend = (unsigned long)(__init_end) & PAGE_MASK; 2358 for (; addr < initend; addr += PAGE_SIZE) { 2359 unsigned long page; 2360 2361 page = (addr + 2362 ((unsigned long) __va(kern_base)) - 2363 ((unsigned long) KERNBASE)); 2364 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); 2365 2366 if (do_free) 2367 free_reserved_page(virt_to_page(page)); 2368 } 2369 } 2370 2371 #ifdef CONFIG_BLK_DEV_INITRD 2372 void free_initrd_mem(unsigned long start, unsigned long end) 2373 { 2374 free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, 2375 "initrd"); 2376 } 2377 #endif 2378 2379 pgprot_t PAGE_KERNEL __read_mostly; 2380 EXPORT_SYMBOL(PAGE_KERNEL); 2381 2382 pgprot_t PAGE_KERNEL_LOCKED __read_mostly; 2383 pgprot_t PAGE_COPY __read_mostly; 2384 2385 pgprot_t PAGE_SHARED __read_mostly; 2386 EXPORT_SYMBOL(PAGE_SHARED); 2387 2388 unsigned long pg_iobits __read_mostly; 2389 2390 unsigned long _PAGE_IE __read_mostly; 2391 EXPORT_SYMBOL(_PAGE_IE); 2392 2393 unsigned long _PAGE_E __read_mostly; 2394 EXPORT_SYMBOL(_PAGE_E); 2395 2396 unsigned long _PAGE_CACHE __read_mostly; 2397 EXPORT_SYMBOL(_PAGE_CACHE); 2398 2399 #ifdef CONFIG_SPARSEMEM_VMEMMAP 2400 int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, 2401 int node) 2402 { 2403 unsigned long pte_base; 2404 2405 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U | 2406 _PAGE_CP_4U | _PAGE_CV_4U | 2407 _PAGE_P_4U | _PAGE_W_4U); 2408 if (tlb_type == hypervisor) 2409 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V | 2410 page_cache4v_flag | _PAGE_P_4V | _PAGE_W_4V); 2411 2412 pte_base |= _PAGE_PMD_HUGE; 2413 2414 vstart = vstart & PMD_MASK; 2415 vend = ALIGN(vend, PMD_SIZE); 2416 for (; vstart < vend; vstart += PMD_SIZE) { 2417 pgd_t *pgd = pgd_offset_k(vstart); 2418 unsigned long pte; 2419 pud_t *pud; 2420 pmd_t *pmd; 2421 2422 if (pgd_none(*pgd)) { 2423 pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node); 2424 2425 if (!new) 2426 return -ENOMEM; 2427 pgd_populate(&init_mm, pgd, new); 2428 } 2429 2430 pud = pud_offset(pgd, vstart); 2431 if (pud_none(*pud)) { 2432 pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node); 2433 2434 if (!new) 2435 return -ENOMEM; 2436 pud_populate(&init_mm, pud, new); 2437 } 2438 2439 pmd = pmd_offset(pud, vstart); 2440 2441 pte = pmd_val(*pmd); 2442 if (!(pte & _PAGE_VALID)) { 2443 void *block = vmemmap_alloc_block(PMD_SIZE, node); 2444 2445 if (!block) 2446 return -ENOMEM; 2447 2448 pmd_val(*pmd) = pte_base | __pa(block); 2449 } 2450 } 2451 2452 return 0; 2453 } 2454 2455 void vmemmap_free(unsigned long start, unsigned long end) 2456 { 2457 } 2458 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 2459 2460 static void prot_init_common(unsigned long page_none, 2461 unsigned long page_shared, 2462 unsigned long page_copy, 2463 unsigned long page_readonly, 2464 unsigned long page_exec_bit) 2465 { 2466 PAGE_COPY = __pgprot(page_copy); 2467 PAGE_SHARED = __pgprot(page_shared); 2468 2469 protection_map[0x0] = __pgprot(page_none); 2470 protection_map[0x1] = __pgprot(page_readonly & ~page_exec_bit); 2471 protection_map[0x2] = __pgprot(page_copy & ~page_exec_bit); 2472 protection_map[0x3] = __pgprot(page_copy & ~page_exec_bit); 2473 protection_map[0x4] = __pgprot(page_readonly); 2474 protection_map[0x5] = __pgprot(page_readonly); 2475 protection_map[0x6] = __pgprot(page_copy); 2476 protection_map[0x7] = __pgprot(page_copy); 2477 protection_map[0x8] = __pgprot(page_none); 2478 protection_map[0x9] = __pgprot(page_readonly & ~page_exec_bit); 2479 protection_map[0xa] = 
__pgprot(page_shared & ~page_exec_bit); 2480 protection_map[0xb] = __pgprot(page_shared & ~page_exec_bit); 2481 protection_map[0xc] = __pgprot(page_readonly); 2482 protection_map[0xd] = __pgprot(page_readonly); 2483 protection_map[0xe] = __pgprot(page_shared); 2484 protection_map[0xf] = __pgprot(page_shared); 2485 } 2486 2487 static void __init sun4u_pgprot_init(void) 2488 { 2489 unsigned long page_none, page_shared, page_copy, page_readonly; 2490 unsigned long page_exec_bit; 2491 int i; 2492 2493 PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID | 2494 _PAGE_CACHE_4U | _PAGE_P_4U | 2495 __ACCESS_BITS_4U | __DIRTY_BITS_4U | 2496 _PAGE_EXEC_4U); 2497 PAGE_KERNEL_LOCKED = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID | 2498 _PAGE_CACHE_4U | _PAGE_P_4U | 2499 __ACCESS_BITS_4U | __DIRTY_BITS_4U | 2500 _PAGE_EXEC_4U | _PAGE_L_4U); 2501 2502 _PAGE_IE = _PAGE_IE_4U; 2503 _PAGE_E = _PAGE_E_4U; 2504 _PAGE_CACHE = _PAGE_CACHE_4U; 2505 2506 pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U | 2507 __ACCESS_BITS_4U | _PAGE_E_4U); 2508 2509 #ifdef CONFIG_DEBUG_PAGEALLOC 2510 kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; 2511 #else 2512 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ 2513 PAGE_OFFSET; 2514 #endif 2515 kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U | 2516 _PAGE_P_4U | _PAGE_W_4U); 2517 2518 for (i = 1; i < 4; i++) 2519 kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; 2520 2521 _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | 2522 _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | 2523 _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); 2524 2525 2526 page_none = _PAGE_PRESENT_4U | _PAGE_ACCESSED_4U | _PAGE_CACHE_4U; 2527 page_shared = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | 2528 __ACCESS_BITS_4U | _PAGE_WRITE_4U | _PAGE_EXEC_4U); 2529 page_copy = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | 2530 __ACCESS_BITS_4U | _PAGE_EXEC_4U); 2531 page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | 2532 __ACCESS_BITS_4U | _PAGE_EXEC_4U); 2533 2534 page_exec_bit = _PAGE_EXEC_4U; 2535 2536 prot_init_common(page_none, page_shared, page_copy, page_readonly, 2537 page_exec_bit); 2538 } 2539 2540 static void __init sun4v_pgprot_init(void) 2541 { 2542 unsigned long page_none, page_shared, page_copy, page_readonly; 2543 unsigned long page_exec_bit; 2544 int i; 2545 2546 PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID | 2547 page_cache4v_flag | _PAGE_P_4V | 2548 __ACCESS_BITS_4V | __DIRTY_BITS_4V | 2549 _PAGE_EXEC_4V); 2550 PAGE_KERNEL_LOCKED = PAGE_KERNEL; 2551 2552 _PAGE_IE = _PAGE_IE_4V; 2553 _PAGE_E = _PAGE_E_4V; 2554 _PAGE_CACHE = page_cache4v_flag; 2555 2556 #ifdef CONFIG_DEBUG_PAGEALLOC 2557 kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; 2558 #else 2559 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ 2560 PAGE_OFFSET; 2561 #endif 2562 kern_linear_pte_xor[0] |= (page_cache4v_flag | _PAGE_P_4V | 2563 _PAGE_W_4V); 2564 2565 for (i = 1; i < 4; i++) 2566 kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; 2567 2568 pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | 2569 __ACCESS_BITS_4V | _PAGE_E_4V); 2570 2571 _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | 2572 _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | 2573 _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | 2574 _PAGE_SZ64K_4V | _PAGE_SZ8K_4V); 2575 2576 page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | page_cache4v_flag; 2577 page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | 2578 __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V); 2579 page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V 
| page_cache4v_flag | 2580 __ACCESS_BITS_4V | _PAGE_EXEC_4V); 2581 page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | 2582 __ACCESS_BITS_4V | _PAGE_EXEC_4V); 2583 2584 page_exec_bit = _PAGE_EXEC_4V; 2585 2586 prot_init_common(page_none, page_shared, page_copy, page_readonly, 2587 page_exec_bit); 2588 } 2589 2590 unsigned long pte_sz_bits(unsigned long sz) 2591 { 2592 if (tlb_type == hypervisor) { 2593 switch (sz) { 2594 case 8 * 1024: 2595 default: 2596 return _PAGE_SZ8K_4V; 2597 case 64 * 1024: 2598 return _PAGE_SZ64K_4V; 2599 case 512 * 1024: 2600 return _PAGE_SZ512K_4V; 2601 case 4 * 1024 * 1024: 2602 return _PAGE_SZ4MB_4V; 2603 } 2604 } else { 2605 switch (sz) { 2606 case 8 * 1024: 2607 default: 2608 return _PAGE_SZ8K_4U; 2609 case 64 * 1024: 2610 return _PAGE_SZ64K_4U; 2611 case 512 * 1024: 2612 return _PAGE_SZ512K_4U; 2613 case 4 * 1024 * 1024: 2614 return _PAGE_SZ4MB_4U; 2615 } 2616 } 2617 } 2618 2619 pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space, unsigned long page_size) 2620 { 2621 pte_t pte; 2622 2623 pte_val(pte) = page | pgprot_val(pgprot_noncached(prot)); 2624 pte_val(pte) |= (((unsigned long)space) << 32); 2625 pte_val(pte) |= pte_sz_bits(page_size); 2626 2627 return pte; 2628 } 2629 2630 static unsigned long kern_large_tte(unsigned long paddr) 2631 { 2632 unsigned long val; 2633 2634 val = (_PAGE_VALID | _PAGE_SZ4MB_4U | 2635 _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U | 2636 _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U); 2637 if (tlb_type == hypervisor) 2638 val = (_PAGE_VALID | _PAGE_SZ4MB_4V | 2639 page_cache4v_flag | _PAGE_P_4V | 2640 _PAGE_EXEC_4V | _PAGE_W_4V); 2641 2642 return val | paddr; 2643 } 2644 2645 /* If not locked, zap it. */ 2646 void __flush_tlb_all(void) 2647 { 2648 unsigned long pstate; 2649 int i; 2650 2651 __asm__ __volatile__("flushw\n\t" 2652 "rdpr %%pstate, %0\n\t" 2653 "wrpr %0, %1, %%pstate" 2654 : "=r" (pstate) 2655 : "i" (PSTATE_IE)); 2656 if (tlb_type == hypervisor) { 2657 sun4v_mmu_demap_all(); 2658 } else if (tlb_type == spitfire) { 2659 for (i = 0; i < 64; i++) { 2660 /* Spitfire Errata #32 workaround */ 2661 /* NOTE: Always runs on spitfire, so no 2662 * cheetah+ page size encodings. 2663 */ 2664 __asm__ __volatile__("stxa %0, [%1] %2\n\t" 2665 "flush %%g6" 2666 : /* No outputs */ 2667 : "r" (0), 2668 "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); 2669 2670 if (!(spitfire_get_dtlb_data(i) & _PAGE_L_4U)) { 2671 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 2672 "membar #Sync" 2673 : /* no outputs */ 2674 : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); 2675 spitfire_put_dtlb_data(i, 0x0UL); 2676 } 2677 2678 /* Spitfire Errata #32 workaround */ 2679 /* NOTE: Always runs on spitfire, so no 2680 * cheetah+ page size encodings. 
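 * The workaround is the dummy store below, which zeroes the primary
 * context register (followed by a flush) before the diagnostic ITLB
 * access.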
2681 */ 2682 __asm__ __volatile__("stxa %0, [%1] %2\n\t" 2683 "flush %%g6" 2684 : /* No outputs */ 2685 : "r" (0), 2686 "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); 2687 2688 if (!(spitfire_get_itlb_data(i) & _PAGE_L_4U)) { 2689 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 2690 "membar #Sync" 2691 : /* no outputs */ 2692 : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU)); 2693 spitfire_put_itlb_data(i, 0x0UL); 2694 } 2695 } 2696 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 2697 cheetah_flush_dtlb_all(); 2698 cheetah_flush_itlb_all(); 2699 } 2700 __asm__ __volatile__("wrpr %0, 0, %%pstate" 2701 : : "r" (pstate)); 2702 } 2703 2704 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, 2705 unsigned long address) 2706 { 2707 struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | 2708 __GFP_REPEAT | __GFP_ZERO); 2709 pte_t *pte = NULL; 2710 2711 if (page) 2712 pte = (pte_t *) page_address(page); 2713 2714 return pte; 2715 } 2716 2717 pgtable_t pte_alloc_one(struct mm_struct *mm, 2718 unsigned long address) 2719 { 2720 struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | 2721 __GFP_REPEAT | __GFP_ZERO); 2722 if (!page) 2723 return NULL; 2724 if (!pgtable_page_ctor(page)) { 2725 free_hot_cold_page(page, 0); 2726 return NULL; 2727 } 2728 return (pte_t *) page_address(page); 2729 } 2730 2731 void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 2732 { 2733 free_page((unsigned long)pte); 2734 } 2735 2736 static void __pte_free(pgtable_t pte) 2737 { 2738 struct page *page = virt_to_page(pte); 2739 2740 pgtable_page_dtor(page); 2741 __free_page(page); 2742 } 2743 2744 void pte_free(struct mm_struct *mm, pgtable_t pte) 2745 { 2746 __pte_free(pte); 2747 } 2748 2749 void pgtable_free(void *table, bool is_page) 2750 { 2751 if (is_page) 2752 __pte_free(table); 2753 else 2754 kmem_cache_free(pgtable_cache, table); 2755 } 2756 2757 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2758 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 2759 pmd_t *pmd) 2760 { 2761 unsigned long pte, flags; 2762 struct mm_struct *mm; 2763 pmd_t entry = *pmd; 2764 2765 if (!pmd_large(entry) || !pmd_young(entry)) 2766 return; 2767 2768 pte = pmd_val(entry); 2769 2770 /* Don't insert a non-valid PMD into the TSB, we'll deadlock. */ 2771 if (!(pte & _PAGE_VALID)) 2772 return; 2773 2774 /* We are fabricating 8MB pages using 4MB real hw pages. 
*/ 2775 pte |= (addr & (1UL << REAL_HPAGE_SHIFT)); 2776 2777 mm = vma->vm_mm; 2778 2779 spin_lock_irqsave(&mm->context.lock, flags); 2780 2781 if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) 2782 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT, 2783 addr, pte); 2784 2785 spin_unlock_irqrestore(&mm->context.lock, flags); 2786 } 2787 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2788 2789 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 2790 static void context_reload(void *__data) 2791 { 2792 struct mm_struct *mm = __data; 2793 2794 if (mm == current->mm) 2795 load_secondary_context(mm); 2796 } 2797 2798 void hugetlb_setup(struct pt_regs *regs) 2799 { 2800 struct mm_struct *mm = current->mm; 2801 struct tsb_config *tp; 2802 2803 if (faulthandler_disabled() || !mm) { 2804 const struct exception_table_entry *entry; 2805 2806 entry = search_exception_tables(regs->tpc); 2807 if (entry) { 2808 regs->tpc = entry->fixup; 2809 regs->tnpc = regs->tpc + 4; 2810 return; 2811 } 2812 pr_alert("Unexpected HugeTLB setup in atomic context.\n"); 2813 die_if_kernel("HugeTSB in atomic", regs); 2814 } 2815 2816 tp = &mm->context.tsb_block[MM_TSB_HUGE]; 2817 if (likely(tp->tsb == NULL)) 2818 tsb_grow(mm, MM_TSB_HUGE, 0); 2819 2820 tsb_context_switch(mm); 2821 smp_tsb_sync(mm); 2822 2823 /* On UltraSPARC-III+ and later, configure the second half of 2824 * the Data-TLB for huge pages. 2825 */ 2826 if (tlb_type == cheetah_plus) { 2827 unsigned long ctx; 2828 2829 spin_lock(&ctx_alloc_lock); 2830 ctx = mm->context.sparc64_ctx_val; 2831 ctx &= ~CTX_PGSZ_MASK; 2832 ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; 2833 ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; 2834 2835 if (ctx != mm->context.sparc64_ctx_val) { 2836 /* When changing the page size fields, we 2837 * must perform a context flush so that no 2838 * stale entries match. This flush must 2839 * occur with the original context register 2840 * settings. 2841 */ 2842 do_flush_tlb_mm(mm); 2843 2844 /* Reload the context register of all processors 2845 * also executing in this address space. 
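 * on_each_cpu() below invokes context_reload(), which reloads the
 * secondary context register on every cpu currently running this mm.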
2846 */ 2847 mm->context.sparc64_ctx_val = ctx; 2848 on_each_cpu(context_reload, mm, 0); 2849 } 2850 spin_unlock(&ctx_alloc_lock); 2851 } 2852 } 2853 #endif 2854 2855 static struct resource code_resource = { 2856 .name = "Kernel code", 2857 .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM 2858 }; 2859 2860 static struct resource data_resource = { 2861 .name = "Kernel data", 2862 .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM 2863 }; 2864 2865 static struct resource bss_resource = { 2866 .name = "Kernel bss", 2867 .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM 2868 }; 2869 2870 static inline resource_size_t compute_kern_paddr(void *addr) 2871 { 2872 return (resource_size_t) (addr - KERNBASE + kern_base); 2873 } 2874 2875 static void __init kernel_lds_init(void) 2876 { 2877 code_resource.start = compute_kern_paddr(_text); 2878 code_resource.end = compute_kern_paddr(_etext - 1); 2879 data_resource.start = compute_kern_paddr(_etext); 2880 data_resource.end = compute_kern_paddr(_edata - 1); 2881 bss_resource.start = compute_kern_paddr(__bss_start); 2882 bss_resource.end = compute_kern_paddr(_end - 1); 2883 } 2884 2885 static int __init report_memory(void) 2886 { 2887 int i; 2888 struct resource *res; 2889 2890 kernel_lds_init(); 2891 2892 for (i = 0; i < pavail_ents; i++) { 2893 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 2894 2895 if (!res) { 2896 pr_warn("Failed to allocate source.\n"); 2897 break; 2898 } 2899 2900 res->name = "System RAM"; 2901 res->start = pavail[i].phys_addr; 2902 res->end = pavail[i].phys_addr + pavail[i].reg_size - 1; 2903 res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; 2904 2905 if (insert_resource(&iomem_resource, res) < 0) { 2906 pr_warn("Resource insertion failed.\n"); 2907 break; 2908 } 2909 2910 insert_resource(res, &code_resource); 2911 insert_resource(res, &data_resource); 2912 insert_resource(res, &bss_resource); 2913 } 2914 2915 return 0; 2916 } 2917 arch_initcall(report_memory); 2918 2919 #ifdef CONFIG_SMP 2920 #define do_flush_tlb_kernel_range smp_flush_tlb_kernel_range 2921 #else 2922 #define do_flush_tlb_kernel_range __flush_tlb_kernel_range 2923 #endif 2924 2925 void flush_tlb_kernel_range(unsigned long start, unsigned long end) 2926 { 2927 if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) { 2928 if (start < LOW_OBP_ADDRESS) { 2929 flush_tsb_kernel_range(start, LOW_OBP_ADDRESS); 2930 do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS); 2931 } 2932 if (end > HI_OBP_ADDRESS) { 2933 flush_tsb_kernel_range(HI_OBP_ADDRESS, end); 2934 do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end); 2935 } 2936 } else { 2937 flush_tsb_kernel_range(start, end); 2938 do_flush_tlb_kernel_range(start, end); 2939 } 2940 } 2941