1 /* 2 * arch/sparc64/mm/init.c 3 * 4 * Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu) 5 * Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 6 */ 7 8 #include <linux/extable.h> 9 #include <linux/kernel.h> 10 #include <linux/sched.h> 11 #include <linux/string.h> 12 #include <linux/init.h> 13 #include <linux/bootmem.h> 14 #include <linux/mm.h> 15 #include <linux/hugetlb.h> 16 #include <linux/initrd.h> 17 #include <linux/swap.h> 18 #include <linux/pagemap.h> 19 #include <linux/poison.h> 20 #include <linux/fs.h> 21 #include <linux/seq_file.h> 22 #include <linux/kprobes.h> 23 #include <linux/cache.h> 24 #include <linux/sort.h> 25 #include <linux/ioport.h> 26 #include <linux/percpu.h> 27 #include <linux/memblock.h> 28 #include <linux/mmzone.h> 29 #include <linux/gfp.h> 30 31 #include <asm/head.h> 32 #include <asm/page.h> 33 #include <asm/pgalloc.h> 34 #include <asm/pgtable.h> 35 #include <asm/oplib.h> 36 #include <asm/iommu.h> 37 #include <asm/io.h> 38 #include <linux/uaccess.h> 39 #include <asm/mmu_context.h> 40 #include <asm/tlbflush.h> 41 #include <asm/dma.h> 42 #include <asm/starfire.h> 43 #include <asm/tlb.h> 44 #include <asm/spitfire.h> 45 #include <asm/sections.h> 46 #include <asm/tsb.h> 47 #include <asm/hypervisor.h> 48 #include <asm/prom.h> 49 #include <asm/mdesc.h> 50 #include <asm/cpudata.h> 51 #include <asm/setup.h> 52 #include <asm/irq.h> 53 54 #include "init_64.h" 55 56 unsigned long kern_linear_pte_xor[4] __read_mostly; 57 static unsigned long page_cache4v_flag; 58 59 /* A bitmap, two bits for every 256MB of physical memory. These two 60 * bits determine what page size we use for kernel linear 61 * translations. They form an index into kern_linear_pte_xor[]. The 62 * value in the indexed slot is XOR'd with the TLB miss virtual 63 * address to form the resulting TTE. The mapping is: 64 * 65 * 0 ==> 4MB 66 * 1 ==> 256MB 67 * 2 ==> 2GB 68 * 3 ==> 16GB 69 * 70 * All sun4v chips support 256MB pages. Only SPARC-T4 and later 71 * support 2GB pages, and hopefully future cpus will support the 16GB 72 * pages as well. For slots 2 and 3, we encode a 256MB TTE xor there 73 * if these larger page sizes are not supported by the cpu. 74 * 75 * It would be nice to determine this from the machine description 76 * 'cpu' properties, but we need to have this table setup before the 77 * MDESC is initialized. 78 */ 79 80 #ifndef CONFIG_DEBUG_PAGEALLOC 81 /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings. 
 * Space is allocated for this right after the trap table in
 * arch/sparc64/kernel/head.S
 */
extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
#endif
extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];

static unsigned long cpu_pgsz_mask;

#define MAX_BANKS	1024

static struct linux_prom64_registers pavail[MAX_BANKS];
static int pavail_ents;

u64 numa_latency[MAX_NUMNODES][MAX_NUMNODES];

static int cmp_p64(const void *a, const void *b)
{
	const struct linux_prom64_registers *x = a, *y = b;

	if (x->phys_addr > y->phys_addr)
		return 1;
	if (x->phys_addr < y->phys_addr)
		return -1;
	return 0;
}

static void __init read_obp_memory(const char *property,
				   struct linux_prom64_registers *regs,
				   int *num_ents)
{
	phandle node = prom_finddevice("/memory");
	int prop_size = prom_getproplen(node, property);
	int ents, ret, i;

	ents = prop_size / sizeof(struct linux_prom64_registers);
	if (ents > MAX_BANKS) {
		prom_printf("The machine has more %s property entries than "
			    "this kernel can support (%d).\n",
			    property, MAX_BANKS);
		prom_halt();
	}

	ret = prom_getproperty(node, property, (char *) regs, prop_size);
	if (ret == -1) {
		prom_printf("Couldn't get %s property from /memory.\n",
			    property);
		prom_halt();
	}

	/* Sanitize what we got from the firmware, by page aligning
	 * everything.
	 */
	for (i = 0; i < ents; i++) {
		unsigned long base, size;

		base = regs[i].phys_addr;
		size = regs[i].reg_size;

		size &= PAGE_MASK;
		if (base & ~PAGE_MASK) {
			unsigned long new_base = PAGE_ALIGN(base);

			size -= new_base - base;
			if ((long) size < 0L)
				size = 0UL;
			base = new_base;
		}
		if (size == 0UL) {
			/* If it is empty, simply get rid of it.
			 * This simplifies the logic of the other
			 * functions that process these arrays.
			 */
			memmove(&regs[i], &regs[i + 1],
				(ents - i - 1) * sizeof(regs[0]));
			i--;
			ents--;
			continue;
		}
		regs[i].phys_addr = base;
		regs[i].reg_size = size;
	}

	*num_ents = ents;

	sort(regs, ents, sizeof(struct linux_prom64_registers),
	     cmp_p64, NULL);
}

/* Kernel physical address base and size in bytes.
*/ 172 unsigned long kern_base __read_mostly; 173 unsigned long kern_size __read_mostly; 174 175 /* Initial ramdisk setup */ 176 extern unsigned long sparc_ramdisk_image64; 177 extern unsigned int sparc_ramdisk_image; 178 extern unsigned int sparc_ramdisk_size; 179 180 struct page *mem_map_zero __read_mostly; 181 EXPORT_SYMBOL(mem_map_zero); 182 183 unsigned int sparc64_highest_unlocked_tlb_ent __read_mostly; 184 185 unsigned long sparc64_kern_pri_context __read_mostly; 186 unsigned long sparc64_kern_pri_nuc_bits __read_mostly; 187 unsigned long sparc64_kern_sec_context __read_mostly; 188 189 int num_kernel_image_mappings; 190 191 #ifdef CONFIG_DEBUG_DCFLUSH 192 atomic_t dcpage_flushes = ATOMIC_INIT(0); 193 #ifdef CONFIG_SMP 194 atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0); 195 #endif 196 #endif 197 198 inline void flush_dcache_page_impl(struct page *page) 199 { 200 BUG_ON(tlb_type == hypervisor); 201 #ifdef CONFIG_DEBUG_DCFLUSH 202 atomic_inc(&dcpage_flushes); 203 #endif 204 205 #ifdef DCACHE_ALIASING_POSSIBLE 206 __flush_dcache_page(page_address(page), 207 ((tlb_type == spitfire) && 208 page_mapping(page) != NULL)); 209 #else 210 if (page_mapping(page) != NULL && 211 tlb_type == spitfire) 212 __flush_icache_page(__pa(page_address(page))); 213 #endif 214 } 215 216 #define PG_dcache_dirty PG_arch_1 217 #define PG_dcache_cpu_shift 32UL 218 #define PG_dcache_cpu_mask \ 219 ((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL) 220 221 #define dcache_dirty_cpu(page) \ 222 (((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) 223 224 static inline void set_dcache_dirty(struct page *page, int this_cpu) 225 { 226 unsigned long mask = this_cpu; 227 unsigned long non_cpu_bits; 228 229 non_cpu_bits = ~(PG_dcache_cpu_mask << PG_dcache_cpu_shift); 230 mask = (mask << PG_dcache_cpu_shift) | (1UL << PG_dcache_dirty); 231 232 __asm__ __volatile__("1:\n\t" 233 "ldx [%2], %%g7\n\t" 234 "and %%g7, %1, %%g1\n\t" 235 "or %%g1, %0, %%g1\n\t" 236 "casx [%2], %%g7, %%g1\n\t" 237 "cmp %%g7, %%g1\n\t" 238 "bne,pn %%xcc, 1b\n\t" 239 " nop" 240 : /* no outputs */ 241 : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) 242 : "g1", "g7"); 243 } 244 245 static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) 246 { 247 unsigned long mask = (1UL << PG_dcache_dirty); 248 249 __asm__ __volatile__("! 
test_and_clear_dcache_dirty\n" 250 "1:\n\t" 251 "ldx [%2], %%g7\n\t" 252 "srlx %%g7, %4, %%g1\n\t" 253 "and %%g1, %3, %%g1\n\t" 254 "cmp %%g1, %0\n\t" 255 "bne,pn %%icc, 2f\n\t" 256 " andn %%g7, %1, %%g1\n\t" 257 "casx [%2], %%g7, %%g1\n\t" 258 "cmp %%g7, %%g1\n\t" 259 "bne,pn %%xcc, 1b\n\t" 260 " nop\n" 261 "2:" 262 : /* no outputs */ 263 : "r" (cpu), "r" (mask), "r" (&page->flags), 264 "i" (PG_dcache_cpu_mask), 265 "i" (PG_dcache_cpu_shift) 266 : "g1", "g7"); 267 } 268 269 static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long pte) 270 { 271 unsigned long tsb_addr = (unsigned long) ent; 272 273 if (tlb_type == cheetah_plus || tlb_type == hypervisor) 274 tsb_addr = __pa(tsb_addr); 275 276 __tsb_insert(tsb_addr, tag, pte); 277 } 278 279 unsigned long _PAGE_ALL_SZ_BITS __read_mostly; 280 281 static void flush_dcache(unsigned long pfn) 282 { 283 struct page *page; 284 285 page = pfn_to_page(pfn); 286 if (page) { 287 unsigned long pg_flags; 288 289 pg_flags = page->flags; 290 if (pg_flags & (1UL << PG_dcache_dirty)) { 291 int cpu = ((pg_flags >> PG_dcache_cpu_shift) & 292 PG_dcache_cpu_mask); 293 int this_cpu = get_cpu(); 294 295 /* This is just to optimize away some function calls 296 * in the SMP case. 297 */ 298 if (cpu == this_cpu) 299 flush_dcache_page_impl(page); 300 else 301 smp_flush_dcache_page_impl(page, cpu); 302 303 clear_dcache_dirty_cpu(page, cpu); 304 305 put_cpu(); 306 } 307 } 308 } 309 310 /* mm->context.lock must be held */ 311 static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index, 312 unsigned long tsb_hash_shift, unsigned long address, 313 unsigned long tte) 314 { 315 struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb; 316 unsigned long tag; 317 318 if (unlikely(!tsb)) 319 return; 320 321 tsb += ((address >> tsb_hash_shift) & 322 (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); 323 tag = (address >> 22UL); 324 tsb_insert(tsb, tag, tte); 325 } 326 327 #ifdef CONFIG_HUGETLB_PAGE 328 static void __init add_huge_page_size(unsigned long size) 329 { 330 unsigned int order; 331 332 if (size_to_hstate(size)) 333 return; 334 335 order = ilog2(size) - PAGE_SHIFT; 336 hugetlb_add_hstate(order); 337 } 338 339 static int __init hugetlbpage_init(void) 340 { 341 add_huge_page_size(1UL << HPAGE_64K_SHIFT); 342 add_huge_page_size(1UL << HPAGE_SHIFT); 343 add_huge_page_size(1UL << HPAGE_256MB_SHIFT); 344 add_huge_page_size(1UL << HPAGE_2GB_SHIFT); 345 346 return 0; 347 } 348 349 arch_initcall(hugetlbpage_init); 350 351 static void __init pud_huge_patch(void) 352 { 353 struct pud_huge_patch_entry *p; 354 unsigned long addr; 355 356 p = &__pud_huge_patch; 357 addr = p->addr; 358 *(unsigned int *)addr = p->insn; 359 360 __asm__ __volatile__("flush %0" : : "r" (addr)); 361 } 362 363 static int __init setup_hugepagesz(char *string) 364 { 365 unsigned long long hugepage_size; 366 unsigned int hugepage_shift; 367 unsigned short hv_pgsz_idx; 368 unsigned int hv_pgsz_mask; 369 int rc = 0; 370 371 hugepage_size = memparse(string, &string); 372 hugepage_shift = ilog2(hugepage_size); 373 374 switch (hugepage_shift) { 375 case HPAGE_16GB_SHIFT: 376 hv_pgsz_mask = HV_PGSZ_MASK_16GB; 377 hv_pgsz_idx = HV_PGSZ_IDX_16GB; 378 pud_huge_patch(); 379 break; 380 case HPAGE_2GB_SHIFT: 381 hv_pgsz_mask = HV_PGSZ_MASK_2GB; 382 hv_pgsz_idx = HV_PGSZ_IDX_2GB; 383 break; 384 case HPAGE_256MB_SHIFT: 385 hv_pgsz_mask = HV_PGSZ_MASK_256MB; 386 hv_pgsz_idx = HV_PGSZ_IDX_256MB; 387 break; 388 case HPAGE_SHIFT: 389 hv_pgsz_mask = HV_PGSZ_MASK_4MB; 390 
hv_pgsz_idx = HV_PGSZ_IDX_4MB; 391 break; 392 case HPAGE_64K_SHIFT: 393 hv_pgsz_mask = HV_PGSZ_MASK_64K; 394 hv_pgsz_idx = HV_PGSZ_IDX_64K; 395 break; 396 default: 397 hv_pgsz_mask = 0; 398 } 399 400 if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) { 401 hugetlb_bad_size(); 402 pr_err("hugepagesz=%llu not supported by MMU.\n", 403 hugepage_size); 404 goto out; 405 } 406 407 add_huge_page_size(hugepage_size); 408 rc = 1; 409 410 out: 411 return rc; 412 } 413 __setup("hugepagesz=", setup_hugepagesz); 414 #endif /* CONFIG_HUGETLB_PAGE */ 415 416 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) 417 { 418 struct mm_struct *mm; 419 unsigned long flags; 420 bool is_huge_tsb; 421 pte_t pte = *ptep; 422 423 if (tlb_type != hypervisor) { 424 unsigned long pfn = pte_pfn(pte); 425 426 if (pfn_valid(pfn)) 427 flush_dcache(pfn); 428 } 429 430 mm = vma->vm_mm; 431 432 /* Don't insert a non-valid PTE into the TSB, we'll deadlock. */ 433 if (!pte_accessible(mm, pte)) 434 return; 435 436 spin_lock_irqsave(&mm->context.lock, flags); 437 438 is_huge_tsb = false; 439 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 440 if (mm->context.hugetlb_pte_count || mm->context.thp_pte_count) { 441 unsigned long hugepage_size = PAGE_SIZE; 442 443 if (is_vm_hugetlb_page(vma)) 444 hugepage_size = huge_page_size(hstate_vma(vma)); 445 446 if (hugepage_size >= PUD_SIZE) { 447 unsigned long mask = 0x1ffc00000UL; 448 449 /* Transfer bits [32:22] from address to resolve 450 * at 4M granularity. 451 */ 452 pte_val(pte) &= ~mask; 453 pte_val(pte) |= (address & mask); 454 } else if (hugepage_size >= PMD_SIZE) { 455 /* We are fabricating 8MB pages using 4MB 456 * real hw pages. 457 */ 458 pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT)); 459 } 460 461 if (hugepage_size >= PMD_SIZE) { 462 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, 463 REAL_HPAGE_SHIFT, address, pte_val(pte)); 464 is_huge_tsb = true; 465 } 466 } 467 #endif 468 if (!is_huge_tsb) 469 __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, 470 address, pte_val(pte)); 471 472 spin_unlock_irqrestore(&mm->context.lock, flags); 473 } 474 475 void flush_dcache_page(struct page *page) 476 { 477 struct address_space *mapping; 478 int this_cpu; 479 480 if (tlb_type == hypervisor) 481 return; 482 483 /* Do not bother with the expensive D-cache flush if it 484 * is merely the zero page. The 'bigcore' testcase in GDB 485 * causes this case to run millions of times. 486 */ 487 if (page == ZERO_PAGE(0)) 488 return; 489 490 this_cpu = get_cpu(); 491 492 mapping = page_mapping(page); 493 if (mapping && !mapping_mapped(mapping)) { 494 int dirty = test_bit(PG_dcache_dirty, &page->flags); 495 if (dirty) { 496 int dirty_cpu = dcache_dirty_cpu(page); 497 498 if (dirty_cpu == this_cpu) 499 goto out; 500 smp_flush_dcache_page_impl(page, dirty_cpu); 501 } 502 set_dcache_dirty(page, this_cpu); 503 } else { 504 /* We could delay the flush for the !page_mapping 505 * case too. But that case is for exec env/arg 506 * pages and those are %99 certainly going to get 507 * faulted into the tlb (and thus flushed) anyways. 508 */ 509 flush_dcache_page_impl(page); 510 } 511 512 out: 513 put_cpu(); 514 } 515 EXPORT_SYMBOL(flush_dcache_page); 516 517 void __kprobes flush_icache_range(unsigned long start, unsigned long end) 518 { 519 /* Cheetah and Hypervisor platform cpus have coherent I-cache. */ 520 if (tlb_type == spitfire) { 521 unsigned long kaddr; 522 523 /* This code only runs on Spitfire cpus so this is 524 * why we can assume _PAGE_PADDR_4U. 
525 */ 526 for (kaddr = start; kaddr < end; kaddr += PAGE_SIZE) { 527 unsigned long paddr, mask = _PAGE_PADDR_4U; 528 529 if (kaddr >= PAGE_OFFSET) 530 paddr = kaddr & mask; 531 else { 532 pgd_t *pgdp = pgd_offset_k(kaddr); 533 pud_t *pudp = pud_offset(pgdp, kaddr); 534 pmd_t *pmdp = pmd_offset(pudp, kaddr); 535 pte_t *ptep = pte_offset_kernel(pmdp, kaddr); 536 537 paddr = pte_val(*ptep) & mask; 538 } 539 __flush_icache_page(paddr); 540 } 541 } 542 } 543 EXPORT_SYMBOL(flush_icache_range); 544 545 void mmu_info(struct seq_file *m) 546 { 547 static const char *pgsz_strings[] = { 548 "8K", "64K", "512K", "4MB", "32MB", 549 "256MB", "2GB", "16GB", 550 }; 551 int i, printed; 552 553 if (tlb_type == cheetah) 554 seq_printf(m, "MMU Type\t: Cheetah\n"); 555 else if (tlb_type == cheetah_plus) 556 seq_printf(m, "MMU Type\t: Cheetah+\n"); 557 else if (tlb_type == spitfire) 558 seq_printf(m, "MMU Type\t: Spitfire\n"); 559 else if (tlb_type == hypervisor) 560 seq_printf(m, "MMU Type\t: Hypervisor (sun4v)\n"); 561 else 562 seq_printf(m, "MMU Type\t: ???\n"); 563 564 seq_printf(m, "MMU PGSZs\t: "); 565 printed = 0; 566 for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) { 567 if (cpu_pgsz_mask & (1UL << i)) { 568 seq_printf(m, "%s%s", 569 printed ? "," : "", pgsz_strings[i]); 570 printed++; 571 } 572 } 573 seq_putc(m, '\n'); 574 575 #ifdef CONFIG_DEBUG_DCFLUSH 576 seq_printf(m, "DCPageFlushes\t: %d\n", 577 atomic_read(&dcpage_flushes)); 578 #ifdef CONFIG_SMP 579 seq_printf(m, "DCPageFlushesXC\t: %d\n", 580 atomic_read(&dcpage_flushes_xcall)); 581 #endif /* CONFIG_SMP */ 582 #endif /* CONFIG_DEBUG_DCFLUSH */ 583 } 584 585 struct linux_prom_translation prom_trans[512] __read_mostly; 586 unsigned int prom_trans_ents __read_mostly; 587 588 unsigned long kern_locked_tte_data; 589 590 /* The obp translations are saved based on 8k pagesize, since obp can 591 * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS -> 592 * HI_OBP_ADDRESS range are handled in ktlb.S. 593 */ 594 static inline int in_obp_range(unsigned long vaddr) 595 { 596 return (vaddr >= LOW_OBP_ADDRESS && 597 vaddr < HI_OBP_ADDRESS); 598 } 599 600 static int cmp_ptrans(const void *a, const void *b) 601 { 602 const struct linux_prom_translation *x = a, *y = b; 603 604 if (x->virt > y->virt) 605 return 1; 606 if (x->virt < y->virt) 607 return -1; 608 return 0; 609 } 610 611 /* Read OBP translations property into 'prom_trans[]'. */ 612 static void __init read_obp_translations(void) 613 { 614 int n, node, ents, first, last, i; 615 616 node = prom_finddevice("/virtual-memory"); 617 n = prom_getproplen(node, "translations"); 618 if (unlikely(n == 0 || n == -1)) { 619 prom_printf("prom_mappings: Couldn't get size.\n"); 620 prom_halt(); 621 } 622 if (unlikely(n > sizeof(prom_trans))) { 623 prom_printf("prom_mappings: Size %d is too big.\n", n); 624 prom_halt(); 625 } 626 627 if ((n = prom_getproperty(node, "translations", 628 (char *)&prom_trans[0], 629 sizeof(prom_trans))) == -1) { 630 prom_printf("prom_mappings: Couldn't get property.\n"); 631 prom_halt(); 632 } 633 634 n = n / sizeof(struct linux_prom_translation); 635 636 ents = n; 637 638 sort(prom_trans, ents, sizeof(struct linux_prom_translation), 639 cmp_ptrans, NULL); 640 641 /* Now kick out all the non-OBP entries. 
*/ 642 for (i = 0; i < ents; i++) { 643 if (in_obp_range(prom_trans[i].virt)) 644 break; 645 } 646 first = i; 647 for (; i < ents; i++) { 648 if (!in_obp_range(prom_trans[i].virt)) 649 break; 650 } 651 last = i; 652 653 for (i = 0; i < (last - first); i++) { 654 struct linux_prom_translation *src = &prom_trans[i + first]; 655 struct linux_prom_translation *dest = &prom_trans[i]; 656 657 *dest = *src; 658 } 659 for (; i < ents; i++) { 660 struct linux_prom_translation *dest = &prom_trans[i]; 661 dest->virt = dest->size = dest->data = 0x0UL; 662 } 663 664 prom_trans_ents = last - first; 665 666 if (tlb_type == spitfire) { 667 /* Clear diag TTE bits. */ 668 for (i = 0; i < prom_trans_ents; i++) 669 prom_trans[i].data &= ~0x0003fe0000000000UL; 670 } 671 672 /* Force execute bit on. */ 673 for (i = 0; i < prom_trans_ents; i++) 674 prom_trans[i].data |= (tlb_type == hypervisor ? 675 _PAGE_EXEC_4V : _PAGE_EXEC_4U); 676 } 677 678 static void __init hypervisor_tlb_lock(unsigned long vaddr, 679 unsigned long pte, 680 unsigned long mmu) 681 { 682 unsigned long ret = sun4v_mmu_map_perm_addr(vaddr, 0, pte, mmu); 683 684 if (ret != 0) { 685 prom_printf("hypervisor_tlb_lock[%lx:%x:%lx:%lx]: " 686 "errors with %lx\n", vaddr, 0, pte, mmu, ret); 687 prom_halt(); 688 } 689 } 690 691 static unsigned long kern_large_tte(unsigned long paddr); 692 693 static void __init remap_kernel(void) 694 { 695 unsigned long phys_page, tte_vaddr, tte_data; 696 int i, tlb_ent = sparc64_highest_locked_tlbent(); 697 698 tte_vaddr = (unsigned long) KERNBASE; 699 phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB; 700 tte_data = kern_large_tte(phys_page); 701 702 kern_locked_tte_data = tte_data; 703 704 /* Now lock us into the TLBs via Hypervisor or OBP. */ 705 if (tlb_type == hypervisor) { 706 for (i = 0; i < num_kernel_image_mappings; i++) { 707 hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU); 708 hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU); 709 tte_vaddr += 0x400000; 710 tte_data += 0x400000; 711 } 712 } else { 713 for (i = 0; i < num_kernel_image_mappings; i++) { 714 prom_dtlb_load(tlb_ent - i, tte_data, tte_vaddr); 715 prom_itlb_load(tlb_ent - i, tte_data, tte_vaddr); 716 tte_vaddr += 0x400000; 717 tte_data += 0x400000; 718 } 719 sparc64_highest_unlocked_tlb_ent = tlb_ent - i; 720 } 721 if (tlb_type == cheetah_plus) { 722 sparc64_kern_pri_context = (CTX_CHEETAH_PLUS_CTX0 | 723 CTX_CHEETAH_PLUS_NUC); 724 sparc64_kern_pri_nuc_bits = CTX_CHEETAH_PLUS_NUC; 725 sparc64_kern_sec_context = CTX_CHEETAH_PLUS_CTX0; 726 } 727 } 728 729 730 static void __init inherit_prom_mappings(void) 731 { 732 /* Now fixup OBP's idea about where we really are mapped. */ 733 printk("Remapping the kernel... 
"); 734 remap_kernel(); 735 printk("done.\n"); 736 } 737 738 void prom_world(int enter) 739 { 740 if (!enter) 741 set_fs(get_fs()); 742 743 __asm__ __volatile__("flushw"); 744 } 745 746 void __flush_dcache_range(unsigned long start, unsigned long end) 747 { 748 unsigned long va; 749 750 if (tlb_type == spitfire) { 751 int n = 0; 752 753 for (va = start; va < end; va += 32) { 754 spitfire_put_dcache_tag(va & 0x3fe0, 0x0); 755 if (++n >= 512) 756 break; 757 } 758 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 759 start = __pa(start); 760 end = __pa(end); 761 for (va = start; va < end; va += 32) 762 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 763 "membar #Sync" 764 : /* no outputs */ 765 : "r" (va), 766 "i" (ASI_DCACHE_INVALIDATE)); 767 } 768 } 769 EXPORT_SYMBOL(__flush_dcache_range); 770 771 /* get_new_mmu_context() uses "cache + 1". */ 772 DEFINE_SPINLOCK(ctx_alloc_lock); 773 unsigned long tlb_context_cache = CTX_FIRST_VERSION; 774 #define MAX_CTX_NR (1UL << CTX_NR_BITS) 775 #define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR) 776 DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); 777 DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0}; 778 779 static void mmu_context_wrap(void) 780 { 781 unsigned long old_ver = tlb_context_cache & CTX_VERSION_MASK; 782 unsigned long new_ver, new_ctx, old_ctx; 783 struct mm_struct *mm; 784 int cpu; 785 786 bitmap_zero(mmu_context_bmap, 1 << CTX_NR_BITS); 787 788 /* Reserve kernel context */ 789 set_bit(0, mmu_context_bmap); 790 791 new_ver = (tlb_context_cache & CTX_VERSION_MASK) + CTX_FIRST_VERSION; 792 if (unlikely(new_ver == 0)) 793 new_ver = CTX_FIRST_VERSION; 794 tlb_context_cache = new_ver; 795 796 /* 797 * Make sure that any new mm that are added into per_cpu_secondary_mm, 798 * are going to go through get_new_mmu_context() path. 799 */ 800 mb(); 801 802 /* 803 * Updated versions to current on those CPUs that had valid secondary 804 * contexts 805 */ 806 for_each_online_cpu(cpu) { 807 /* 808 * If a new mm is stored after we took this mm from the array, 809 * it will go into get_new_mmu_context() path, because we 810 * already bumped the version in tlb_context_cache. 811 */ 812 mm = per_cpu(per_cpu_secondary_mm, cpu); 813 814 if (unlikely(!mm || mm == &init_mm)) 815 continue; 816 817 old_ctx = mm->context.sparc64_ctx_val; 818 if (likely((old_ctx & CTX_VERSION_MASK) == old_ver)) { 819 new_ctx = (old_ctx & ~CTX_VERSION_MASK) | new_ver; 820 set_bit(new_ctx & CTX_NR_MASK, mmu_context_bmap); 821 mm->context.sparc64_ctx_val = new_ctx; 822 } 823 } 824 } 825 826 /* Caller does TLB context flushing on local CPU if necessary. 827 * The caller also ensures that CTX_VALID(mm->context) is false. 828 * 829 * We must be careful about boundary cases so that we never 830 * let the user have CTX 0 (nucleus) or we ever use a CTX 831 * version of zero (and thus NO_CONTEXT would not be caught 832 * by version mis-match tests in mmu_context.h). 833 * 834 * Always invoked with interrupts disabled. 
835 */ 836 void get_new_mmu_context(struct mm_struct *mm) 837 { 838 unsigned long ctx, new_ctx; 839 unsigned long orig_pgsz_bits; 840 841 spin_lock(&ctx_alloc_lock); 842 retry: 843 /* wrap might have happened, test again if our context became valid */ 844 if (unlikely(CTX_VALID(mm->context))) 845 goto out; 846 orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); 847 ctx = (tlb_context_cache + 1) & CTX_NR_MASK; 848 new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); 849 if (new_ctx >= (1 << CTX_NR_BITS)) { 850 new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); 851 if (new_ctx >= ctx) { 852 mmu_context_wrap(); 853 goto retry; 854 } 855 } 856 if (mm->context.sparc64_ctx_val) 857 cpumask_clear(mm_cpumask(mm)); 858 mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63)); 859 new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); 860 tlb_context_cache = new_ctx; 861 mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; 862 out: 863 spin_unlock(&ctx_alloc_lock); 864 } 865 866 static int numa_enabled = 1; 867 static int numa_debug; 868 869 static int __init early_numa(char *p) 870 { 871 if (!p) 872 return 0; 873 874 if (strstr(p, "off")) 875 numa_enabled = 0; 876 877 if (strstr(p, "debug")) 878 numa_debug = 1; 879 880 return 0; 881 } 882 early_param("numa", early_numa); 883 884 #define numadbg(f, a...) \ 885 do { if (numa_debug) \ 886 printk(KERN_INFO f, ## a); \ 887 } while (0) 888 889 static void __init find_ramdisk(unsigned long phys_base) 890 { 891 #ifdef CONFIG_BLK_DEV_INITRD 892 if (sparc_ramdisk_image || sparc_ramdisk_image64) { 893 unsigned long ramdisk_image; 894 895 /* Older versions of the bootloader only supported a 896 * 32-bit physical address for the ramdisk image 897 * location, stored at sparc_ramdisk_image. Newer 898 * SILO versions set sparc_ramdisk_image to zero and 899 * provide a full 64-bit physical address at 900 * sparc_ramdisk_image64. 901 */ 902 ramdisk_image = sparc_ramdisk_image; 903 if (!ramdisk_image) 904 ramdisk_image = sparc_ramdisk_image64; 905 906 /* Another bootloader quirk. The bootloader normalizes 907 * the physical address to KERNBASE, so we have to 908 * factor that back out and add in the lowest valid 909 * physical page address to get the true physical address. 
910 */ 911 ramdisk_image -= KERNBASE; 912 ramdisk_image += phys_base; 913 914 numadbg("Found ramdisk at physical address 0x%lx, size %u\n", 915 ramdisk_image, sparc_ramdisk_size); 916 917 initrd_start = ramdisk_image; 918 initrd_end = ramdisk_image + sparc_ramdisk_size; 919 920 memblock_reserve(initrd_start, sparc_ramdisk_size); 921 922 initrd_start += PAGE_OFFSET; 923 initrd_end += PAGE_OFFSET; 924 } 925 #endif 926 } 927 928 struct node_mem_mask { 929 unsigned long mask; 930 unsigned long match; 931 }; 932 static struct node_mem_mask node_masks[MAX_NUMNODES]; 933 static int num_node_masks; 934 935 #ifdef CONFIG_NEED_MULTIPLE_NODES 936 937 struct mdesc_mlgroup { 938 u64 node; 939 u64 latency; 940 u64 match; 941 u64 mask; 942 }; 943 944 static struct mdesc_mlgroup *mlgroups; 945 static int num_mlgroups; 946 947 int numa_cpu_lookup_table[NR_CPUS]; 948 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; 949 950 struct mdesc_mblock { 951 u64 base; 952 u64 size; 953 u64 offset; /* RA-to-PA */ 954 }; 955 static struct mdesc_mblock *mblocks; 956 static int num_mblocks; 957 958 static struct mdesc_mblock * __init addr_to_mblock(unsigned long addr) 959 { 960 struct mdesc_mblock *m = NULL; 961 int i; 962 963 for (i = 0; i < num_mblocks; i++) { 964 m = &mblocks[i]; 965 966 if (addr >= m->base && 967 addr < (m->base + m->size)) { 968 break; 969 } 970 } 971 972 return m; 973 } 974 975 static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid) 976 { 977 int prev_nid, new_nid; 978 979 prev_nid = -1; 980 for ( ; start < end; start += PAGE_SIZE) { 981 for (new_nid = 0; new_nid < num_node_masks; new_nid++) { 982 struct node_mem_mask *p = &node_masks[new_nid]; 983 984 if ((start & p->mask) == p->match) { 985 if (prev_nid == -1) 986 prev_nid = new_nid; 987 break; 988 } 989 } 990 991 if (new_nid == num_node_masks) { 992 prev_nid = 0; 993 WARN_ONCE(1, "addr[%Lx] doesn't match a NUMA node rule. Some memory will be owned by node 0.", 994 start); 995 break; 996 } 997 998 if (prev_nid != new_nid) 999 break; 1000 } 1001 *nid = prev_nid; 1002 1003 return start > end ? 
end : start; 1004 } 1005 1006 static u64 __init memblock_nid_range(u64 start, u64 end, int *nid) 1007 { 1008 u64 ret_end, pa_start, m_mask, m_match, m_end; 1009 struct mdesc_mblock *mblock; 1010 int _nid, i; 1011 1012 if (tlb_type != hypervisor) 1013 return memblock_nid_range_sun4u(start, end, nid); 1014 1015 mblock = addr_to_mblock(start); 1016 if (!mblock) { 1017 WARN_ONCE(1, "memblock_nid_range: Can't find mblock addr[%Lx]", 1018 start); 1019 1020 _nid = 0; 1021 ret_end = end; 1022 goto done; 1023 } 1024 1025 pa_start = start + mblock->offset; 1026 m_match = 0; 1027 m_mask = 0; 1028 1029 for (_nid = 0; _nid < num_node_masks; _nid++) { 1030 struct node_mem_mask *const m = &node_masks[_nid]; 1031 1032 if ((pa_start & m->mask) == m->match) { 1033 m_match = m->match; 1034 m_mask = m->mask; 1035 break; 1036 } 1037 } 1038 1039 if (num_node_masks == _nid) { 1040 /* We could not find NUMA group, so default to 0, but lets 1041 * search for latency group, so we could calculate the correct 1042 * end address that we return 1043 */ 1044 _nid = 0; 1045 1046 for (i = 0; i < num_mlgroups; i++) { 1047 struct mdesc_mlgroup *const m = &mlgroups[i]; 1048 1049 if ((pa_start & m->mask) == m->match) { 1050 m_match = m->match; 1051 m_mask = m->mask; 1052 break; 1053 } 1054 } 1055 1056 if (i == num_mlgroups) { 1057 WARN_ONCE(1, "memblock_nid_range: Can't find latency group addr[%Lx]", 1058 start); 1059 1060 ret_end = end; 1061 goto done; 1062 } 1063 } 1064 1065 /* 1066 * Each latency group has match and mask, and each memory block has an 1067 * offset. An address belongs to a latency group if its address matches 1068 * the following formula: ((addr + offset) & mask) == match 1069 * It is, however, slow to check every single page if it matches a 1070 * particular latency group. As optimization we calculate end value by 1071 * using bit arithmetics. 1072 */ 1073 m_end = m_match + (1ul << __ffs(m_mask)) - mblock->offset; 1074 m_end += pa_start & ~((1ul << fls64(m_mask)) - 1); 1075 ret_end = m_end > end ? end : m_end; 1076 1077 done: 1078 *nid = _nid; 1079 return ret_end; 1080 } 1081 #endif 1082 1083 /* This must be invoked after performing all of the necessary 1084 * memblock_set_node() calls for 'nid'. We need to be able to get 1085 * correct data from get_pfn_range_for_nid(). 
1086 */ 1087 static void __init allocate_node_data(int nid) 1088 { 1089 struct pglist_data *p; 1090 unsigned long start_pfn, end_pfn; 1091 #ifdef CONFIG_NEED_MULTIPLE_NODES 1092 unsigned long paddr; 1093 1094 paddr = memblock_alloc_try_nid(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid); 1095 if (!paddr) { 1096 prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid); 1097 prom_halt(); 1098 } 1099 NODE_DATA(nid) = __va(paddr); 1100 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 1101 1102 NODE_DATA(nid)->node_id = nid; 1103 #endif 1104 1105 p = NODE_DATA(nid); 1106 1107 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 1108 p->node_start_pfn = start_pfn; 1109 p->node_spanned_pages = end_pfn - start_pfn; 1110 } 1111 1112 static void init_node_masks_nonnuma(void) 1113 { 1114 #ifdef CONFIG_NEED_MULTIPLE_NODES 1115 int i; 1116 #endif 1117 1118 numadbg("Initializing tables for non-numa.\n"); 1119 1120 node_masks[0].mask = 0; 1121 node_masks[0].match = 0; 1122 num_node_masks = 1; 1123 1124 #ifdef CONFIG_NEED_MULTIPLE_NODES 1125 for (i = 0; i < NR_CPUS; i++) 1126 numa_cpu_lookup_table[i] = 0; 1127 1128 cpumask_setall(&numa_cpumask_lookup_table[0]); 1129 #endif 1130 } 1131 1132 #ifdef CONFIG_NEED_MULTIPLE_NODES 1133 struct pglist_data *node_data[MAX_NUMNODES]; 1134 1135 EXPORT_SYMBOL(numa_cpu_lookup_table); 1136 EXPORT_SYMBOL(numa_cpumask_lookup_table); 1137 EXPORT_SYMBOL(node_data); 1138 1139 static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio, 1140 u32 cfg_handle) 1141 { 1142 u64 arc; 1143 1144 mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) { 1145 u64 target = mdesc_arc_target(md, arc); 1146 const u64 *val; 1147 1148 val = mdesc_get_property(md, target, 1149 "cfg-handle", NULL); 1150 if (val && *val == cfg_handle) 1151 return 0; 1152 } 1153 return -ENODEV; 1154 } 1155 1156 static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp, 1157 u32 cfg_handle) 1158 { 1159 u64 arc, candidate, best_latency = ~(u64)0; 1160 1161 candidate = MDESC_NODE_NULL; 1162 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1163 u64 target = mdesc_arc_target(md, arc); 1164 const char *name = mdesc_node_name(md, target); 1165 const u64 *val; 1166 1167 if (strcmp(name, "pio-latency-group")) 1168 continue; 1169 1170 val = mdesc_get_property(md, target, "latency", NULL); 1171 if (!val) 1172 continue; 1173 1174 if (*val < best_latency) { 1175 candidate = target; 1176 best_latency = *val; 1177 } 1178 } 1179 1180 if (candidate == MDESC_NODE_NULL) 1181 return -ENODEV; 1182 1183 return scan_pio_for_cfg_handle(md, candidate, cfg_handle); 1184 } 1185 1186 int of_node_to_nid(struct device_node *dp) 1187 { 1188 const struct linux_prom64_registers *regs; 1189 struct mdesc_handle *md; 1190 u32 cfg_handle; 1191 int count, nid; 1192 u64 grp; 1193 1194 /* This is the right thing to do on currently supported 1195 * SUN4U NUMA platforms as well, as the PCI controller does 1196 * not sit behind any particular memory controller. 
1197 */ 1198 if (!mlgroups) 1199 return -1; 1200 1201 regs = of_get_property(dp, "reg", NULL); 1202 if (!regs) 1203 return -1; 1204 1205 cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff; 1206 1207 md = mdesc_grab(); 1208 1209 count = 0; 1210 nid = -1; 1211 mdesc_for_each_node_by_name(md, grp, "group") { 1212 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { 1213 nid = count; 1214 break; 1215 } 1216 count++; 1217 } 1218 1219 mdesc_release(md); 1220 1221 return nid; 1222 } 1223 1224 static void __init add_node_ranges(void) 1225 { 1226 struct memblock_region *reg; 1227 unsigned long prev_max; 1228 1229 memblock_resized: 1230 prev_max = memblock.memory.max; 1231 1232 for_each_memblock(memory, reg) { 1233 unsigned long size = reg->size; 1234 unsigned long start, end; 1235 1236 start = reg->base; 1237 end = start + size; 1238 while (start < end) { 1239 unsigned long this_end; 1240 int nid; 1241 1242 this_end = memblock_nid_range(start, end, &nid); 1243 1244 numadbg("Setting memblock NUMA node nid[%d] " 1245 "start[%lx] end[%lx]\n", 1246 nid, start, this_end); 1247 1248 memblock_set_node(start, this_end - start, 1249 &memblock.memory, nid); 1250 if (memblock.memory.max != prev_max) 1251 goto memblock_resized; 1252 start = this_end; 1253 } 1254 } 1255 } 1256 1257 static int __init grab_mlgroups(struct mdesc_handle *md) 1258 { 1259 unsigned long paddr; 1260 int count = 0; 1261 u64 node; 1262 1263 mdesc_for_each_node_by_name(md, node, "memory-latency-group") 1264 count++; 1265 if (!count) 1266 return -ENOENT; 1267 1268 paddr = memblock_alloc(count * sizeof(struct mdesc_mlgroup), 1269 SMP_CACHE_BYTES); 1270 if (!paddr) 1271 return -ENOMEM; 1272 1273 mlgroups = __va(paddr); 1274 num_mlgroups = count; 1275 1276 count = 0; 1277 mdesc_for_each_node_by_name(md, node, "memory-latency-group") { 1278 struct mdesc_mlgroup *m = &mlgroups[count++]; 1279 const u64 *val; 1280 1281 m->node = node; 1282 1283 val = mdesc_get_property(md, node, "latency", NULL); 1284 m->latency = *val; 1285 val = mdesc_get_property(md, node, "address-match", NULL); 1286 m->match = *val; 1287 val = mdesc_get_property(md, node, "address-mask", NULL); 1288 m->mask = *val; 1289 1290 numadbg("MLGROUP[%d]: node[%llx] latency[%llx] " 1291 "match[%llx] mask[%llx]\n", 1292 count - 1, m->node, m->latency, m->match, m->mask); 1293 } 1294 1295 return 0; 1296 } 1297 1298 static int __init grab_mblocks(struct mdesc_handle *md) 1299 { 1300 unsigned long paddr; 1301 int count = 0; 1302 u64 node; 1303 1304 mdesc_for_each_node_by_name(md, node, "mblock") 1305 count++; 1306 if (!count) 1307 return -ENOENT; 1308 1309 paddr = memblock_alloc(count * sizeof(struct mdesc_mblock), 1310 SMP_CACHE_BYTES); 1311 if (!paddr) 1312 return -ENOMEM; 1313 1314 mblocks = __va(paddr); 1315 num_mblocks = count; 1316 1317 count = 0; 1318 mdesc_for_each_node_by_name(md, node, "mblock") { 1319 struct mdesc_mblock *m = &mblocks[count++]; 1320 const u64 *val; 1321 1322 val = mdesc_get_property(md, node, "base", NULL); 1323 m->base = *val; 1324 val = mdesc_get_property(md, node, "size", NULL); 1325 m->size = *val; 1326 val = mdesc_get_property(md, node, 1327 "address-congruence-offset", NULL); 1328 1329 /* The address-congruence-offset property is optional. 1330 * Explicity zero it be identifty this. 
1331 */ 1332 if (val) 1333 m->offset = *val; 1334 else 1335 m->offset = 0UL; 1336 1337 numadbg("MBLOCK[%d]: base[%llx] size[%llx] offset[%llx]\n", 1338 count - 1, m->base, m->size, m->offset); 1339 } 1340 1341 return 0; 1342 } 1343 1344 static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md, 1345 u64 grp, cpumask_t *mask) 1346 { 1347 u64 arc; 1348 1349 cpumask_clear(mask); 1350 1351 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) { 1352 u64 target = mdesc_arc_target(md, arc); 1353 const char *name = mdesc_node_name(md, target); 1354 const u64 *id; 1355 1356 if (strcmp(name, "cpu")) 1357 continue; 1358 id = mdesc_get_property(md, target, "id", NULL); 1359 if (*id < nr_cpu_ids) 1360 cpumask_set_cpu(*id, mask); 1361 } 1362 } 1363 1364 static struct mdesc_mlgroup * __init find_mlgroup(u64 node) 1365 { 1366 int i; 1367 1368 for (i = 0; i < num_mlgroups; i++) { 1369 struct mdesc_mlgroup *m = &mlgroups[i]; 1370 if (m->node == node) 1371 return m; 1372 } 1373 return NULL; 1374 } 1375 1376 int __node_distance(int from, int to) 1377 { 1378 if ((from >= MAX_NUMNODES) || (to >= MAX_NUMNODES)) { 1379 pr_warn("Returning default NUMA distance value for %d->%d\n", 1380 from, to); 1381 return (from == to) ? LOCAL_DISTANCE : REMOTE_DISTANCE; 1382 } 1383 return numa_latency[from][to]; 1384 } 1385 1386 static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp) 1387 { 1388 int i; 1389 1390 for (i = 0; i < MAX_NUMNODES; i++) { 1391 struct node_mem_mask *n = &node_masks[i]; 1392 1393 if ((grp->mask == n->mask) && (grp->match == n->match)) 1394 break; 1395 } 1396 return i; 1397 } 1398 1399 static void __init find_numa_latencies_for_group(struct mdesc_handle *md, 1400 u64 grp, int index) 1401 { 1402 u64 arc; 1403 1404 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1405 int tnode; 1406 u64 target = mdesc_arc_target(md, arc); 1407 struct mdesc_mlgroup *m = find_mlgroup(target); 1408 1409 if (!m) 1410 continue; 1411 tnode = find_best_numa_node_for_mlgroup(m); 1412 if (tnode == MAX_NUMNODES) 1413 continue; 1414 numa_latency[index][tnode] = m->latency; 1415 } 1416 } 1417 1418 static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp, 1419 int index) 1420 { 1421 struct mdesc_mlgroup *candidate = NULL; 1422 u64 arc, best_latency = ~(u64)0; 1423 struct node_mem_mask *n; 1424 1425 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1426 u64 target = mdesc_arc_target(md, arc); 1427 struct mdesc_mlgroup *m = find_mlgroup(target); 1428 if (!m) 1429 continue; 1430 if (m->latency < best_latency) { 1431 candidate = m; 1432 best_latency = m->latency; 1433 } 1434 } 1435 if (!candidate) 1436 return -ENOENT; 1437 1438 if (num_node_masks != index) { 1439 printk(KERN_ERR "Inconsistent NUMA state, " 1440 "index[%d] != num_node_masks[%d]\n", 1441 index, num_node_masks); 1442 return -EINVAL; 1443 } 1444 1445 n = &node_masks[num_node_masks++]; 1446 1447 n->mask = candidate->mask; 1448 n->match = candidate->match; 1449 1450 numadbg("NUMA NODE[%d]: mask[%lx] match[%lx] (latency[%llx])\n", 1451 index, n->mask, n->match, candidate->latency); 1452 1453 return 0; 1454 } 1455 1456 static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp, 1457 int index) 1458 { 1459 cpumask_t mask; 1460 int cpu; 1461 1462 numa_parse_mdesc_group_cpus(md, grp, &mask); 1463 1464 for_each_cpu(cpu, &mask) 1465 numa_cpu_lookup_table[cpu] = index; 1466 cpumask_copy(&numa_cpumask_lookup_table[index], &mask); 1467 1468 if (numa_debug) { 1469 printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index); 
1470 for_each_cpu(cpu, &mask) 1471 printk("%d ", cpu); 1472 printk("]\n"); 1473 } 1474 1475 return numa_attach_mlgroup(md, grp, index); 1476 } 1477 1478 static int __init numa_parse_mdesc(void) 1479 { 1480 struct mdesc_handle *md = mdesc_grab(); 1481 int i, j, err, count; 1482 u64 node; 1483 1484 node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups"); 1485 if (node == MDESC_NODE_NULL) { 1486 mdesc_release(md); 1487 return -ENOENT; 1488 } 1489 1490 err = grab_mblocks(md); 1491 if (err < 0) 1492 goto out; 1493 1494 err = grab_mlgroups(md); 1495 if (err < 0) 1496 goto out; 1497 1498 count = 0; 1499 mdesc_for_each_node_by_name(md, node, "group") { 1500 err = numa_parse_mdesc_group(md, node, count); 1501 if (err < 0) 1502 break; 1503 count++; 1504 } 1505 1506 count = 0; 1507 mdesc_for_each_node_by_name(md, node, "group") { 1508 find_numa_latencies_for_group(md, node, count); 1509 count++; 1510 } 1511 1512 /* Normalize numa latency matrix according to ACPI SLIT spec. */ 1513 for (i = 0; i < MAX_NUMNODES; i++) { 1514 u64 self_latency = numa_latency[i][i]; 1515 1516 for (j = 0; j < MAX_NUMNODES; j++) { 1517 numa_latency[i][j] = 1518 (numa_latency[i][j] * LOCAL_DISTANCE) / 1519 self_latency; 1520 } 1521 } 1522 1523 add_node_ranges(); 1524 1525 for (i = 0; i < num_node_masks; i++) { 1526 allocate_node_data(i); 1527 node_set_online(i); 1528 } 1529 1530 err = 0; 1531 out: 1532 mdesc_release(md); 1533 return err; 1534 } 1535 1536 static int __init numa_parse_jbus(void) 1537 { 1538 unsigned long cpu, index; 1539 1540 /* NUMA node id is encoded in bits 36 and higher, and there is 1541 * a 1-to-1 mapping from CPU ID to NUMA node ID. 1542 */ 1543 index = 0; 1544 for_each_present_cpu(cpu) { 1545 numa_cpu_lookup_table[cpu] = index; 1546 cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu)); 1547 node_masks[index].mask = ~((1UL << 36UL) - 1UL); 1548 node_masks[index].match = cpu << 36UL; 1549 1550 index++; 1551 } 1552 num_node_masks = index; 1553 1554 add_node_ranges(); 1555 1556 for (index = 0; index < num_node_masks; index++) { 1557 allocate_node_data(index); 1558 node_set_online(index); 1559 } 1560 1561 return 0; 1562 } 1563 1564 static int __init numa_parse_sun4u(void) 1565 { 1566 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 1567 unsigned long ver; 1568 1569 __asm__ ("rdpr %%ver, %0" : "=r" (ver)); 1570 if ((ver >> 32UL) == __JALAPENO_ID || 1571 (ver >> 32UL) == __SERRANO_ID) 1572 return numa_parse_jbus(); 1573 } 1574 return -1; 1575 } 1576 1577 static int __init bootmem_init_numa(void) 1578 { 1579 int i, j; 1580 int err = -1; 1581 1582 numadbg("bootmem_init_numa()\n"); 1583 1584 /* Some sane defaults for numa latency values */ 1585 for (i = 0; i < MAX_NUMNODES; i++) { 1586 for (j = 0; j < MAX_NUMNODES; j++) 1587 numa_latency[i][j] = (i == j) ? 
1588 LOCAL_DISTANCE : REMOTE_DISTANCE; 1589 } 1590 1591 if (numa_enabled) { 1592 if (tlb_type == hypervisor) 1593 err = numa_parse_mdesc(); 1594 else 1595 err = numa_parse_sun4u(); 1596 } 1597 return err; 1598 } 1599 1600 #else 1601 1602 static int bootmem_init_numa(void) 1603 { 1604 return -1; 1605 } 1606 1607 #endif 1608 1609 static void __init bootmem_init_nonnuma(void) 1610 { 1611 unsigned long top_of_ram = memblock_end_of_DRAM(); 1612 unsigned long total_ram = memblock_phys_mem_size(); 1613 1614 numadbg("bootmem_init_nonnuma()\n"); 1615 1616 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", 1617 top_of_ram, total_ram); 1618 printk(KERN_INFO "Memory hole size: %ldMB\n", 1619 (top_of_ram - total_ram) >> 20); 1620 1621 init_node_masks_nonnuma(); 1622 memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); 1623 allocate_node_data(0); 1624 node_set_online(0); 1625 } 1626 1627 static unsigned long __init bootmem_init(unsigned long phys_base) 1628 { 1629 unsigned long end_pfn; 1630 1631 end_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1632 max_pfn = max_low_pfn = end_pfn; 1633 min_low_pfn = (phys_base >> PAGE_SHIFT); 1634 1635 if (bootmem_init_numa() < 0) 1636 bootmem_init_nonnuma(); 1637 1638 /* Dump memblock with node info. */ 1639 memblock_dump_all(); 1640 1641 /* XXX cpu notifier XXX */ 1642 1643 sparse_memory_present_with_active_regions(MAX_NUMNODES); 1644 sparse_init(); 1645 1646 return end_pfn; 1647 } 1648 1649 static struct linux_prom64_registers pall[MAX_BANKS] __initdata; 1650 static int pall_ents __initdata; 1651 1652 static unsigned long max_phys_bits = 40; 1653 1654 bool kern_addr_valid(unsigned long addr) 1655 { 1656 pgd_t *pgd; 1657 pud_t *pud; 1658 pmd_t *pmd; 1659 pte_t *pte; 1660 1661 if ((long)addr < 0L) { 1662 unsigned long pa = __pa(addr); 1663 1664 if ((pa >> max_phys_bits) != 0UL) 1665 return false; 1666 1667 return pfn_valid(pa >> PAGE_SHIFT); 1668 } 1669 1670 if (addr >= (unsigned long) KERNBASE && 1671 addr < (unsigned long)&_end) 1672 return true; 1673 1674 pgd = pgd_offset_k(addr); 1675 if (pgd_none(*pgd)) 1676 return 0; 1677 1678 pud = pud_offset(pgd, addr); 1679 if (pud_none(*pud)) 1680 return 0; 1681 1682 if (pud_large(*pud)) 1683 return pfn_valid(pud_pfn(*pud)); 1684 1685 pmd = pmd_offset(pud, addr); 1686 if (pmd_none(*pmd)) 1687 return 0; 1688 1689 if (pmd_large(*pmd)) 1690 return pfn_valid(pmd_pfn(*pmd)); 1691 1692 pte = pte_offset_kernel(pmd, addr); 1693 if (pte_none(*pte)) 1694 return 0; 1695 1696 return pfn_valid(pte_pfn(*pte)); 1697 } 1698 EXPORT_SYMBOL(kern_addr_valid); 1699 1700 static unsigned long __ref kernel_map_hugepud(unsigned long vstart, 1701 unsigned long vend, 1702 pud_t *pud) 1703 { 1704 const unsigned long mask16gb = (1UL << 34) - 1UL; 1705 u64 pte_val = vstart; 1706 1707 /* Each PUD is 8GB */ 1708 if ((vstart & mask16gb) || 1709 (vend - vstart <= mask16gb)) { 1710 pte_val ^= kern_linear_pte_xor[2]; 1711 pud_val(*pud) = pte_val | _PAGE_PUD_HUGE; 1712 1713 return vstart + PUD_SIZE; 1714 } 1715 1716 pte_val ^= kern_linear_pte_xor[3]; 1717 pte_val |= _PAGE_PUD_HUGE; 1718 1719 vend = vstart + mask16gb + 1UL; 1720 while (vstart < vend) { 1721 pud_val(*pud) = pte_val; 1722 1723 pte_val += PUD_SIZE; 1724 vstart += PUD_SIZE; 1725 pud++; 1726 } 1727 return vstart; 1728 } 1729 1730 static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend, 1731 bool guard) 1732 { 1733 if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= PUD_SIZE) 1734 return true; 1735 1736 return false; 1737 } 1738 1739 static unsigned long 
__ref kernel_map_hugepmd(unsigned long vstart, 1740 unsigned long vend, 1741 pmd_t *pmd) 1742 { 1743 const unsigned long mask256mb = (1UL << 28) - 1UL; 1744 const unsigned long mask2gb = (1UL << 31) - 1UL; 1745 u64 pte_val = vstart; 1746 1747 /* Each PMD is 8MB */ 1748 if ((vstart & mask256mb) || 1749 (vend - vstart <= mask256mb)) { 1750 pte_val ^= kern_linear_pte_xor[0]; 1751 pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE; 1752 1753 return vstart + PMD_SIZE; 1754 } 1755 1756 if ((vstart & mask2gb) || 1757 (vend - vstart <= mask2gb)) { 1758 pte_val ^= kern_linear_pte_xor[1]; 1759 pte_val |= _PAGE_PMD_HUGE; 1760 vend = vstart + mask256mb + 1UL; 1761 } else { 1762 pte_val ^= kern_linear_pte_xor[2]; 1763 pte_val |= _PAGE_PMD_HUGE; 1764 vend = vstart + mask2gb + 1UL; 1765 } 1766 1767 while (vstart < vend) { 1768 pmd_val(*pmd) = pte_val; 1769 1770 pte_val += PMD_SIZE; 1771 vstart += PMD_SIZE; 1772 pmd++; 1773 } 1774 1775 return vstart; 1776 } 1777 1778 static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend, 1779 bool guard) 1780 { 1781 if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE) 1782 return true; 1783 1784 return false; 1785 } 1786 1787 static unsigned long __ref kernel_map_range(unsigned long pstart, 1788 unsigned long pend, pgprot_t prot, 1789 bool use_huge) 1790 { 1791 unsigned long vstart = PAGE_OFFSET + pstart; 1792 unsigned long vend = PAGE_OFFSET + pend; 1793 unsigned long alloc_bytes = 0UL; 1794 1795 if ((vstart & ~PAGE_MASK) || (vend & ~PAGE_MASK)) { 1796 prom_printf("kernel_map: Unaligned physmem[%lx:%lx]\n", 1797 vstart, vend); 1798 prom_halt(); 1799 } 1800 1801 while (vstart < vend) { 1802 unsigned long this_end, paddr = __pa(vstart); 1803 pgd_t *pgd = pgd_offset_k(vstart); 1804 pud_t *pud; 1805 pmd_t *pmd; 1806 pte_t *pte; 1807 1808 if (pgd_none(*pgd)) { 1809 pud_t *new; 1810 1811 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); 1812 alloc_bytes += PAGE_SIZE; 1813 pgd_populate(&init_mm, pgd, new); 1814 } 1815 pud = pud_offset(pgd, vstart); 1816 if (pud_none(*pud)) { 1817 pmd_t *new; 1818 1819 if (kernel_can_map_hugepud(vstart, vend, use_huge)) { 1820 vstart = kernel_map_hugepud(vstart, vend, pud); 1821 continue; 1822 } 1823 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); 1824 alloc_bytes += PAGE_SIZE; 1825 pud_populate(&init_mm, pud, new); 1826 } 1827 1828 pmd = pmd_offset(pud, vstart); 1829 if (pmd_none(*pmd)) { 1830 pte_t *new; 1831 1832 if (kernel_can_map_hugepmd(vstart, vend, use_huge)) { 1833 vstart = kernel_map_hugepmd(vstart, vend, pmd); 1834 continue; 1835 } 1836 new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); 1837 alloc_bytes += PAGE_SIZE; 1838 pmd_populate_kernel(&init_mm, pmd, new); 1839 } 1840 1841 pte = pte_offset_kernel(pmd, vstart); 1842 this_end = (vstart + PMD_SIZE) & PMD_MASK; 1843 if (this_end > vend) 1844 this_end = vend; 1845 1846 while (vstart < this_end) { 1847 pte_val(*pte) = (paddr | pgprot_val(prot)); 1848 1849 vstart += PAGE_SIZE; 1850 paddr += PAGE_SIZE; 1851 pte++; 1852 } 1853 } 1854 1855 return alloc_bytes; 1856 } 1857 1858 static void __init flush_all_kernel_tsbs(void) 1859 { 1860 int i; 1861 1862 for (i = 0; i < KERNEL_TSB_NENTRIES; i++) { 1863 struct tsb *ent = &swapper_tsb[i]; 1864 1865 ent->tag = (1UL << TSB_TAG_INVALID_BIT); 1866 } 1867 #ifndef CONFIG_DEBUG_PAGEALLOC 1868 for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) { 1869 struct tsb *ent = &swapper_4m_tsb[i]; 1870 1871 ent->tag = (1UL << TSB_TAG_INVALID_BIT); 1872 } 1873 #endif 1874 } 1875 1876 extern unsigned int kvmap_linear_patch[1]; 
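/* Overview of the linear-mapping helpers above: kernel_map_range() walks the
 * kernel page tables (pgd/pud/pmd/pte) for [PAGE_OFFSET + pstart,
 * PAGE_OFFSET + pend), allocating intermediate levels from bootmem as needed.
 * kernel_can_map_hugepmd()/kernel_can_map_hugepud() decide when a suitably
 * aligned and sized chunk may use a huge entry, and kernel_map_hugepmd()/
 * kernel_map_hugepud() then fabricate that entry directly from the virtual
 * address and the kern_linear_pte_xor[] slot for the chosen page size,
 * roughly:
 *
 *	pte_val = vstart ^ kern_linear_pte_xor[1];	(256MB slot)
 *	pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE;
 *
 * kernel_physical_mapping_init() below applies this to every physical range
 * recorded in pall[]; under CONFIG_DEBUG_PAGEALLOC huge mappings are disabled
 * so that __kernel_map_pages() can later map and unmap individual base pages.
 */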
1877 1878 static void __init kernel_physical_mapping_init(void) 1879 { 1880 unsigned long i, mem_alloced = 0UL; 1881 bool use_huge = true; 1882 1883 #ifdef CONFIG_DEBUG_PAGEALLOC 1884 use_huge = false; 1885 #endif 1886 for (i = 0; i < pall_ents; i++) { 1887 unsigned long phys_start, phys_end; 1888 1889 phys_start = pall[i].phys_addr; 1890 phys_end = phys_start + pall[i].reg_size; 1891 1892 mem_alloced += kernel_map_range(phys_start, phys_end, 1893 PAGE_KERNEL, use_huge); 1894 } 1895 1896 printk("Allocated %ld bytes for kernel page tables.\n", 1897 mem_alloced); 1898 1899 kvmap_linear_patch[0] = 0x01000000; /* nop */ 1900 flushi(&kvmap_linear_patch[0]); 1901 1902 flush_all_kernel_tsbs(); 1903 1904 __flush_tlb_all(); 1905 } 1906 1907 #ifdef CONFIG_DEBUG_PAGEALLOC 1908 void __kernel_map_pages(struct page *page, int numpages, int enable) 1909 { 1910 unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; 1911 unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); 1912 1913 kernel_map_range(phys_start, phys_end, 1914 (enable ? PAGE_KERNEL : __pgprot(0)), false); 1915 1916 flush_tsb_kernel_range(PAGE_OFFSET + phys_start, 1917 PAGE_OFFSET + phys_end); 1918 1919 /* we should perform an IPI and flush all tlbs, 1920 * but that can deadlock->flush only current cpu. 1921 */ 1922 __flush_tlb_kernel_range(PAGE_OFFSET + phys_start, 1923 PAGE_OFFSET + phys_end); 1924 } 1925 #endif 1926 1927 unsigned long __init find_ecache_flush_span(unsigned long size) 1928 { 1929 int i; 1930 1931 for (i = 0; i < pavail_ents; i++) { 1932 if (pavail[i].reg_size >= size) 1933 return pavail[i].phys_addr; 1934 } 1935 1936 return ~0UL; 1937 } 1938 1939 unsigned long PAGE_OFFSET; 1940 EXPORT_SYMBOL(PAGE_OFFSET); 1941 1942 unsigned long VMALLOC_END = 0x0000010000000000UL; 1943 EXPORT_SYMBOL(VMALLOC_END); 1944 1945 unsigned long sparc64_va_hole_top = 0xfffff80000000000UL; 1946 unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL; 1947 1948 static void __init setup_page_offset(void) 1949 { 1950 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 1951 /* Cheetah/Panther support a full 64-bit virtual 1952 * address, so we can use all that our page tables 1953 * support. 1954 */ 1955 sparc64_va_hole_top = 0xfff0000000000000UL; 1956 sparc64_va_hole_bottom = 0x0010000000000000UL; 1957 1958 max_phys_bits = 42; 1959 } else if (tlb_type == hypervisor) { 1960 switch (sun4v_chip_type) { 1961 case SUN4V_CHIP_NIAGARA1: 1962 case SUN4V_CHIP_NIAGARA2: 1963 /* T1 and T2 support 48-bit virtual addresses. */ 1964 sparc64_va_hole_top = 0xffff800000000000UL; 1965 sparc64_va_hole_bottom = 0x0000800000000000UL; 1966 1967 max_phys_bits = 39; 1968 break; 1969 case SUN4V_CHIP_NIAGARA3: 1970 /* T3 supports 48-bit virtual addresses. */ 1971 sparc64_va_hole_top = 0xffff800000000000UL; 1972 sparc64_va_hole_bottom = 0x0000800000000000UL; 1973 1974 max_phys_bits = 43; 1975 break; 1976 case SUN4V_CHIP_NIAGARA4: 1977 case SUN4V_CHIP_NIAGARA5: 1978 case SUN4V_CHIP_SPARC64X: 1979 case SUN4V_CHIP_SPARC_M6: 1980 /* T4 and later support 52-bit virtual addresses. */ 1981 sparc64_va_hole_top = 0xfff8000000000000UL; 1982 sparc64_va_hole_bottom = 0x0008000000000000UL; 1983 max_phys_bits = 47; 1984 break; 1985 case SUN4V_CHIP_SPARC_M7: 1986 case SUN4V_CHIP_SPARC_SN: 1987 /* M7 and later support 52-bit virtual addresses. 
*/ 1988 sparc64_va_hole_top = 0xfff8000000000000UL; 1989 sparc64_va_hole_bottom = 0x0008000000000000UL; 1990 max_phys_bits = 49; 1991 break; 1992 case SUN4V_CHIP_SPARC_M8: 1993 default: 1994 /* M8 and later support 54-bit virtual addresses. 1995 * However, restricting M8 and above VA bits to 53 1996 * as 4-level page table cannot support more than 1997 * 53 VA bits. 1998 */ 1999 sparc64_va_hole_top = 0xfff0000000000000UL; 2000 sparc64_va_hole_bottom = 0x0010000000000000UL; 2001 max_phys_bits = 51; 2002 break; 2003 } 2004 } 2005 2006 if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) { 2007 prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n", 2008 max_phys_bits); 2009 prom_halt(); 2010 } 2011 2012 PAGE_OFFSET = sparc64_va_hole_top; 2013 VMALLOC_END = ((sparc64_va_hole_bottom >> 1) + 2014 (sparc64_va_hole_bottom >> 2)); 2015 2016 pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n", 2017 PAGE_OFFSET, max_phys_bits); 2018 pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n", 2019 VMALLOC_START, VMALLOC_END); 2020 pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n", 2021 VMEMMAP_BASE, VMEMMAP_BASE << 1); 2022 } 2023 2024 static void __init tsb_phys_patch(void) 2025 { 2026 struct tsb_ldquad_phys_patch_entry *pquad; 2027 struct tsb_phys_patch_entry *p; 2028 2029 pquad = &__tsb_ldquad_phys_patch; 2030 while (pquad < &__tsb_ldquad_phys_patch_end) { 2031 unsigned long addr = pquad->addr; 2032 2033 if (tlb_type == hypervisor) 2034 *(unsigned int *) addr = pquad->sun4v_insn; 2035 else 2036 *(unsigned int *) addr = pquad->sun4u_insn; 2037 wmb(); 2038 __asm__ __volatile__("flush %0" 2039 : /* no outputs */ 2040 : "r" (addr)); 2041 2042 pquad++; 2043 } 2044 2045 p = &__tsb_phys_patch; 2046 while (p < &__tsb_phys_patch_end) { 2047 unsigned long addr = p->addr; 2048 2049 *(unsigned int *) addr = p->insn; 2050 wmb(); 2051 __asm__ __volatile__("flush %0" 2052 : /* no outputs */ 2053 : "r" (addr)); 2054 2055 p++; 2056 } 2057 } 2058 2059 /* Don't mark as init, we give this to the Hypervisor. */ 2060 #ifndef CONFIG_DEBUG_PAGEALLOC 2061 #define NUM_KTSB_DESCR 2 2062 #else 2063 #define NUM_KTSB_DESCR 1 2064 #endif 2065 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR]; 2066 2067 /* The swapper TSBs are loaded with a base sequence of: 2068 * 2069 * sethi %uhi(SYMBOL), REG1 2070 * sethi %hi(SYMBOL), REG2 2071 * or REG1, %ulo(SYMBOL), REG1 2072 * or REG2, %lo(SYMBOL), REG2 2073 * sllx REG1, 32, REG1 2074 * or REG1, REG2, REG1 2075 * 2076 * When we use physical addressing for the TSB accesses, we patch the 2077 * first four instructions in the above sequence. 
2078 */ 2079 2080 static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa) 2081 { 2082 unsigned long high_bits, low_bits; 2083 2084 high_bits = (pa >> 32) & 0xffffffff; 2085 low_bits = (pa >> 0) & 0xffffffff; 2086 2087 while (start < end) { 2088 unsigned int *ia = (unsigned int *)(unsigned long)*start; 2089 2090 ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10); 2091 __asm__ __volatile__("flush %0" : : "r" (ia)); 2092 2093 ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10); 2094 __asm__ __volatile__("flush %0" : : "r" (ia + 1)); 2095 2096 ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff); 2097 __asm__ __volatile__("flush %0" : : "r" (ia + 2)); 2098 2099 ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff); 2100 __asm__ __volatile__("flush %0" : : "r" (ia + 3)); 2101 2102 start++; 2103 } 2104 } 2105 2106 static void ktsb_phys_patch(void) 2107 { 2108 extern unsigned int __swapper_tsb_phys_patch; 2109 extern unsigned int __swapper_tsb_phys_patch_end; 2110 unsigned long ktsb_pa; 2111 2112 ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE); 2113 patch_one_ktsb_phys(&__swapper_tsb_phys_patch, 2114 &__swapper_tsb_phys_patch_end, ktsb_pa); 2115 #ifndef CONFIG_DEBUG_PAGEALLOC 2116 { 2117 extern unsigned int __swapper_4m_tsb_phys_patch; 2118 extern unsigned int __swapper_4m_tsb_phys_patch_end; 2119 ktsb_pa = (kern_base + 2120 ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); 2121 patch_one_ktsb_phys(&__swapper_4m_tsb_phys_patch, 2122 &__swapper_4m_tsb_phys_patch_end, ktsb_pa); 2123 } 2124 #endif 2125 } 2126 2127 static void __init sun4v_ktsb_init(void) 2128 { 2129 unsigned long ktsb_pa; 2130 2131 /* First KTSB for PAGE_SIZE mappings. */ 2132 ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE); 2133 2134 switch (PAGE_SIZE) { 2135 case 8 * 1024: 2136 default: 2137 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_8K; 2138 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_8K; 2139 break; 2140 2141 case 64 * 1024: 2142 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_64K; 2143 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_64K; 2144 break; 2145 2146 case 512 * 1024: 2147 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_512K; 2148 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_512K; 2149 break; 2150 2151 case 4 * 1024 * 1024: 2152 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_4MB; 2153 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_4MB; 2154 break; 2155 } 2156 2157 ktsb_descr[0].assoc = 1; 2158 ktsb_descr[0].num_ttes = KERNEL_TSB_NENTRIES; 2159 ktsb_descr[0].ctx_idx = 0; 2160 ktsb_descr[0].tsb_base = ktsb_pa; 2161 ktsb_descr[0].resv = 0; 2162 2163 #ifndef CONFIG_DEBUG_PAGEALLOC 2164 /* Second KTSB for 4MB/256MB/2GB/16GB mappings. 
	 */
	ktsb_pa = (kern_base +
		   ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));

	ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB;
	ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB |
				    HV_PGSZ_MASK_256MB |
				    HV_PGSZ_MASK_2GB |
				    HV_PGSZ_MASK_16GB) &
				   cpu_pgsz_mask);
	ktsb_descr[1].assoc = 1;
	ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES;
	ktsb_descr[1].ctx_idx = 0;
	ktsb_descr[1].tsb_base = ktsb_pa;
	ktsb_descr[1].resv = 0;
#endif
}

void sun4v_ktsb_register(void)
{
	unsigned long pa, ret;

	pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE);

	ret = sun4v_mmu_tsb_ctx0(NUM_KTSB_DESCR, pa);
	if (ret != 0) {
		prom_printf("hypervisor_mmu_tsb_ctx0[%lx]: "
			    "errors with %lx\n", pa, ret);
		prom_halt();
	}
}

static void __init sun4u_linear_pte_xor_finalize(void)
{
#ifndef CONFIG_DEBUG_PAGEALLOC
	/* This is where we would add Panther support for
	 * 32MB and 256MB pages.
	 */
#endif
}

static void __init sun4v_linear_pte_xor_finalize(void)
{
	unsigned long pagecv_flag;

	/* Bit 9 of TTE is no longer CV bit on M7 processor and it instead
	 * enables MCD error. Do not set bit 9 on M7 processor.
	 */
	switch (sun4v_chip_type) {
	case SUN4V_CHIP_SPARC_M7:
	case SUN4V_CHIP_SPARC_M8:
	case SUN4V_CHIP_SPARC_SN:
		pagecv_flag = 0x00;
		break;
	default:
		pagecv_flag = _PAGE_CV_4V;
		break;
	}
#ifndef CONFIG_DEBUG_PAGEALLOC
	if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
		kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
			PAGE_OFFSET;
		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | pagecv_flag |
					   _PAGE_P_4V | _PAGE_W_4V);
	} else {
		kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
	}

	if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
		kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
			PAGE_OFFSET;
		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | pagecv_flag |
					   _PAGE_P_4V | _PAGE_W_4V);
	} else {
		kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
	}

	if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
		kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
			PAGE_OFFSET;
		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | pagecv_flag |
					   _PAGE_P_4V | _PAGE_W_4V);
	} else {
		kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
	}
#endif
}

/* paging_init() sets up the page tables */

static unsigned long last_valid_pfn;

static void sun4u_pgprot_init(void);
static void sun4v_pgprot_init(void);

static phys_addr_t __init available_memory(void)
{
	phys_addr_t available = 0ULL;
	phys_addr_t pa_start, pa_end;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
				&pa_end, NULL)
		available = available + (pa_end - pa_start);

	return available;
}

#define _PAGE_CACHE_4U	(_PAGE_CP_4U | _PAGE_CV_4U)
#define _PAGE_CACHE_4V	(_PAGE_CP_4V | _PAGE_CV_4V)
#define __DIRTY_BITS_4U	(_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
#define __DIRTY_BITS_4V	(_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
#define __ACCESS_BITS_4U	(_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
#define __ACCESS_BITS_4V	(_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)

/* We need to exclude reserved regions. This exclusion will include
 * vmlinux and initrd.
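 * Every free memblock range is then clipped with memblock_remove() until
 * what is left fits under limit_ram.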
 * To be more precise, the initrd size could be used to compute a new
 * lower limit because it is freed later during initialization.
 */
static void __init reduce_memory(phys_addr_t limit_ram)
{
	phys_addr_t avail_ram = available_memory();
	phys_addr_t pa_start, pa_end;
	u64 i;

	if (limit_ram >= avail_ram)
		return;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
				&pa_end, NULL) {
		phys_addr_t region_size = pa_end - pa_start;
		phys_addr_t clip_start = pa_start;

		avail_ram = avail_ram - region_size;
		/* Are we consuming too much? */
		if (avail_ram < limit_ram) {
			phys_addr_t give_back = limit_ram - avail_ram;

			region_size = region_size - give_back;
			clip_start = clip_start + give_back;
		}

		memblock_remove(clip_start, region_size);

		if (avail_ram <= limit_ram)
			break;
		i = 0UL;
	}
}

void __init paging_init(void)
{
	unsigned long end_pfn, shift, phys_base;
	unsigned long real_end, i;

	setup_page_offset();

	/* These build time checks make sure that the dcache_dirty_cpu()
	 * page->flags usage will work.
	 *
	 * When a page gets marked as dcache-dirty, we store the
	 * cpu number starting at bit 32 in the page->flags.  Also,
	 * functions like clear_dcache_dirty_cpu use the cpu mask
	 * in 13-bit signed-immediate instruction fields.
	 */

	/*
	 * Page flags must not reach into upper 32 bits that are used
	 * for the cpu number.
	 */
	BUILD_BUG_ON(NR_PAGEFLAGS > 32);

	/*
	 * The bit fields placed in the high range must not reach below
	 * the 32 bit boundary. Otherwise we cannot place the cpu field
	 * at the 32 bit boundary.
	 */
	BUILD_BUG_ON(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH +
		ilog2(roundup_pow_of_two(NR_CPUS)) > 32);

	BUILD_BUG_ON(NR_CPUS > 4096);

	kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
	kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;

	/* Invalidate both kernel TSBs.  */
	memset(swapper_tsb, 0x40, sizeof(swapper_tsb));
#ifndef CONFIG_DEBUG_PAGEALLOC
	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
#endif

	/* TTE.cv bit on sparc v9 occupies the same position as TTE.mcde
	 * bit on M7 processor. This is a conflicting usage of the same
	 * bit. Enabling TTE.cv on M7 would turn on Memory Corruption
	 * Detection error on all pages and this will lead to problems
	 * later. Kernel does not run with MCD enabled and hence the rest
	 * of the required steps to fully configure memory corruption
	 * detection are not taken. We need to ensure TTE.mcde is not
	 * set on M7 processor. Compute the value of the cacheability
	 * flag for use later, taking this into consideration.
	 */
	switch (sun4v_chip_type) {
	case SUN4V_CHIP_SPARC_M7:
	case SUN4V_CHIP_SPARC_M8:
	case SUN4V_CHIP_SPARC_SN:
		page_cache4v_flag = _PAGE_CP_4V;
		break;
	default:
		page_cache4v_flag = _PAGE_CACHE_4V;
		break;
	}

	if (tlb_type == hypervisor)
		sun4v_pgprot_init();
	else
		sun4u_pgprot_init();

	if (tlb_type == cheetah_plus ||
	    tlb_type == hypervisor) {
		tsb_phys_patch();
		ktsb_phys_patch();
	}

	if (tlb_type == hypervisor)
		sun4v_patch_tlb_handlers();

	/* Find available physical memory...
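	 * The /memory node's "reg" property describes the installed banks,
	 * while "available" lists what the firmware has not claimed.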
	 *
	 * Read it twice in order to work around a bug in openfirmware.
	 * The call to grab this table itself can cause openfirmware to
	 * allocate memory, which in turn can take away some space from
	 * the list of available memory.  Reading it twice makes sure
	 * we really do get the final value.
	 */
	read_obp_translations();
	read_obp_memory("reg", &pall[0], &pall_ents);
	read_obp_memory("available", &pavail[0], &pavail_ents);
	read_obp_memory("available", &pavail[0], &pavail_ents);

	phys_base = 0xffffffffffffffffUL;
	for (i = 0; i < pavail_ents; i++) {
		phys_base = min(phys_base, pavail[i].phys_addr);
		memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
	}

	memblock_reserve(kern_base, kern_size);

	find_ramdisk(phys_base);

	if (cmdline_memory_size)
		reduce_memory(cmdline_memory_size);

	memblock_allow_resize();
	memblock_dump_all();

	set_bit(0, mmu_context_bmap);

	shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE);

	real_end = (unsigned long)_end;
	num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB);
	printk("Kernel: Using %d locked TLB entries for main kernel image.\n",
	       num_kernel_image_mappings);

	/* Set kernel pgd to upper alias so physical page computations
	 * work.
	 */
	init_mm.pgd += ((shift) / (sizeof(pgd_t)));

	memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));

	inherit_prom_mappings();

	/* Ok, we can use our TLB miss and window trap handlers safely.  */
	setup_tba();

	__flush_tlb_all();

	prom_build_devicetree();
	of_populate_present_mask();
#ifndef CONFIG_SMP
	of_fill_in_cpu_data();
#endif

	if (tlb_type == hypervisor) {
		sun4v_mdesc_init();
		mdesc_populate_present_mask(cpu_all_mask);
#ifndef CONFIG_SMP
		mdesc_fill_in_cpu_data(cpu_all_mask);
#endif
		mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask);

		sun4v_linear_pte_xor_finalize();

		sun4v_ktsb_init();
		sun4v_ktsb_register();
	} else {
		unsigned long impl, ver;

		cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K |
				 HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB);

		__asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
		impl = ((ver >> 32) & 0xffff);
		if (impl == PANTHER_IMPL)
			cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB |
					  HV_PGSZ_MASK_256MB);

		sun4u_linear_pte_xor_finalize();
	}

	/* Flush the TLBs and the 4M TSB so that the updated linear
	 * pte XOR settings are realized for all mappings.
	 */
	__flush_tlb_all();
#ifndef CONFIG_DEBUG_PAGEALLOC
	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
#endif
	__flush_tlb_all();

	/* Setup bootmem...
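	 * bootmem_init() hands back the highest valid pfn, which sizes
	 * ZONE_NORMAL below and later becomes high_memory in mem_init().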
	 */
	last_valid_pfn = end_pfn = bootmem_init(phys_base);

	kernel_physical_mapping_init();

	{
		unsigned long max_zone_pfns[MAX_NR_ZONES];

		memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

		max_zone_pfns[ZONE_NORMAL] = end_pfn;

		free_area_init_nodes(max_zone_pfns);
	}

	printk("Booting Linux...\n");
}

int page_in_phys_avail(unsigned long paddr)
{
	int i;

	paddr &= PAGE_MASK;

	for (i = 0; i < pavail_ents; i++) {
		unsigned long start, end;

		start = pavail[i].phys_addr;
		end = start + pavail[i].reg_size;

		if (paddr >= start && paddr < end)
			return 1;
	}
	if (paddr >= kern_base && paddr < (kern_base + kern_size))
		return 1;
#ifdef CONFIG_BLK_DEV_INITRD
	if (paddr >= __pa(initrd_start) &&
	    paddr < __pa(PAGE_ALIGN(initrd_end)))
		return 1;
#endif

	return 0;
}

static void __init register_page_bootmem_info(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int i;

	for_each_online_node(i)
		if (NODE_DATA(i)->node_spanned_pages)
			register_page_bootmem_info_node(NODE_DATA(i));
#endif
}

void __init mem_init(void)
{
	high_memory = __va(last_valid_pfn << PAGE_SHIFT);

	register_page_bootmem_info();
	free_all_bootmem();

	/*
	 * Set up the zero page, mark it reserved, so that page count
	 * is not manipulated when freeing the page from user ptes.
	 */
	mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0);
	if (mem_map_zero == NULL) {
		prom_printf("paging_init: Cannot alloc zero page.\n");
		prom_halt();
	}
	mark_page_reserved(mem_map_zero);

	mem_init_print_info(NULL);

	if (tlb_type == cheetah || tlb_type == cheetah_plus)
		cheetah_ecache_flush_init();
}

void free_initmem(void)
{
	unsigned long addr, initend;
	int do_free = 1;

	/* If the physical memory maps were trimmed by kernel command
	 * line options, don't even try freeing this initmem stuff up.
	 * The kernel image could have been in the trimmed out region
	 * and if so the freeing below will free invalid page structs.
	 */
	if (cmdline_memory_size)
		do_free = 0;

	/*
	 * The init section is aligned to 8k in vmlinux.lds. Page align
	 * for >8k pagesizes.
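	 * Each init page is poisoned, then converted to its linear-map
	 * alias so that virt_to_page() finds the right struct page to free.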
2577 */ 2578 addr = PAGE_ALIGN((unsigned long)(__init_begin)); 2579 initend = (unsigned long)(__init_end) & PAGE_MASK; 2580 for (; addr < initend; addr += PAGE_SIZE) { 2581 unsigned long page; 2582 2583 page = (addr + 2584 ((unsigned long) __va(kern_base)) - 2585 ((unsigned long) KERNBASE)); 2586 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); 2587 2588 if (do_free) 2589 free_reserved_page(virt_to_page(page)); 2590 } 2591 } 2592 2593 #ifdef CONFIG_BLK_DEV_INITRD 2594 void free_initrd_mem(unsigned long start, unsigned long end) 2595 { 2596 free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, 2597 "initrd"); 2598 } 2599 #endif 2600 2601 pgprot_t PAGE_KERNEL __read_mostly; 2602 EXPORT_SYMBOL(PAGE_KERNEL); 2603 2604 pgprot_t PAGE_KERNEL_LOCKED __read_mostly; 2605 pgprot_t PAGE_COPY __read_mostly; 2606 2607 pgprot_t PAGE_SHARED __read_mostly; 2608 EXPORT_SYMBOL(PAGE_SHARED); 2609 2610 unsigned long pg_iobits __read_mostly; 2611 2612 unsigned long _PAGE_IE __read_mostly; 2613 EXPORT_SYMBOL(_PAGE_IE); 2614 2615 unsigned long _PAGE_E __read_mostly; 2616 EXPORT_SYMBOL(_PAGE_E); 2617 2618 unsigned long _PAGE_CACHE __read_mostly; 2619 EXPORT_SYMBOL(_PAGE_CACHE); 2620 2621 #ifdef CONFIG_SPARSEMEM_VMEMMAP 2622 int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, 2623 int node) 2624 { 2625 unsigned long pte_base; 2626 2627 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U | 2628 _PAGE_CP_4U | _PAGE_CV_4U | 2629 _PAGE_P_4U | _PAGE_W_4U); 2630 if (tlb_type == hypervisor) 2631 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V | 2632 page_cache4v_flag | _PAGE_P_4V | _PAGE_W_4V); 2633 2634 pte_base |= _PAGE_PMD_HUGE; 2635 2636 vstart = vstart & PMD_MASK; 2637 vend = ALIGN(vend, PMD_SIZE); 2638 for (; vstart < vend; vstart += PMD_SIZE) { 2639 pgd_t *pgd = pgd_offset_k(vstart); 2640 unsigned long pte; 2641 pud_t *pud; 2642 pmd_t *pmd; 2643 2644 if (pgd_none(*pgd)) { 2645 pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node); 2646 2647 if (!new) 2648 return -ENOMEM; 2649 pgd_populate(&init_mm, pgd, new); 2650 } 2651 2652 pud = pud_offset(pgd, vstart); 2653 if (pud_none(*pud)) { 2654 pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node); 2655 2656 if (!new) 2657 return -ENOMEM; 2658 pud_populate(&init_mm, pud, new); 2659 } 2660 2661 pmd = pmd_offset(pud, vstart); 2662 2663 pte = pmd_val(*pmd); 2664 if (!(pte & _PAGE_VALID)) { 2665 void *block = vmemmap_alloc_block(PMD_SIZE, node); 2666 2667 if (!block) 2668 return -ENOMEM; 2669 2670 pmd_val(*pmd) = pte_base | __pa(block); 2671 } 2672 } 2673 2674 return 0; 2675 } 2676 2677 void vmemmap_free(unsigned long start, unsigned long end) 2678 { 2679 } 2680 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 2681 2682 static void prot_init_common(unsigned long page_none, 2683 unsigned long page_shared, 2684 unsigned long page_copy, 2685 unsigned long page_readonly, 2686 unsigned long page_exec_bit) 2687 { 2688 PAGE_COPY = __pgprot(page_copy); 2689 PAGE_SHARED = __pgprot(page_shared); 2690 2691 protection_map[0x0] = __pgprot(page_none); 2692 protection_map[0x1] = __pgprot(page_readonly & ~page_exec_bit); 2693 protection_map[0x2] = __pgprot(page_copy & ~page_exec_bit); 2694 protection_map[0x3] = __pgprot(page_copy & ~page_exec_bit); 2695 protection_map[0x4] = __pgprot(page_readonly); 2696 protection_map[0x5] = __pgprot(page_readonly); 2697 protection_map[0x6] = __pgprot(page_copy); 2698 protection_map[0x7] = __pgprot(page_copy); 2699 protection_map[0x8] = __pgprot(page_none); 2700 protection_map[0x9] = __pgprot(page_readonly & ~page_exec_bit); 2701 protection_map[0xa] = 
	protection_map[0xb] = __pgprot(page_shared & ~page_exec_bit);
	protection_map[0xc] = __pgprot(page_readonly);
	protection_map[0xd] = __pgprot(page_readonly);
	protection_map[0xe] = __pgprot(page_shared);
	protection_map[0xf] = __pgprot(page_shared);
}

static void __init sun4u_pgprot_init(void)
{
	unsigned long page_none, page_shared, page_copy, page_readonly;
	unsigned long page_exec_bit;
	int i;

	PAGE_KERNEL = __pgprot(_PAGE_PRESENT_4U | _PAGE_VALID |
			       _PAGE_CACHE_4U | _PAGE_P_4U |
			       __ACCESS_BITS_4U | __DIRTY_BITS_4U |
			       _PAGE_EXEC_4U);
	PAGE_KERNEL_LOCKED = __pgprot(_PAGE_PRESENT_4U | _PAGE_VALID |
				      _PAGE_CACHE_4U | _PAGE_P_4U |
				      __ACCESS_BITS_4U | __DIRTY_BITS_4U |
				      _PAGE_EXEC_4U | _PAGE_L_4U);

	_PAGE_IE = _PAGE_IE_4U;
	_PAGE_E = _PAGE_E_4U;
	_PAGE_CACHE = _PAGE_CACHE_4U;

	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U |
		     __ACCESS_BITS_4U | _PAGE_E_4U);

#ifdef CONFIG_DEBUG_PAGEALLOC
	kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
#else
	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
		PAGE_OFFSET;
#endif
	kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
				   _PAGE_P_4U | _PAGE_W_4U);

	for (i = 1; i < 4; i++)
		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];

	_PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U |
			     _PAGE_SZ64K_4U | _PAGE_SZ8K_4U |
			     _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U);

	page_none = _PAGE_PRESENT_4U | _PAGE_ACCESSED_4U | _PAGE_CACHE_4U;
	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
		       __ACCESS_BITS_4U | _PAGE_WRITE_4U | _PAGE_EXEC_4U);
	page_copy = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
		     __ACCESS_BITS_4U | _PAGE_EXEC_4U);
	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
			 __ACCESS_BITS_4U | _PAGE_EXEC_4U);

	page_exec_bit = _PAGE_EXEC_4U;

	prot_init_common(page_none, page_shared, page_copy, page_readonly,
			 page_exec_bit);
}

static void __init sun4v_pgprot_init(void)
{
	unsigned long page_none, page_shared, page_copy, page_readonly;
	unsigned long page_exec_bit;
	int i;

	PAGE_KERNEL = __pgprot(_PAGE_PRESENT_4V | _PAGE_VALID |
			       page_cache4v_flag | _PAGE_P_4V |
			       __ACCESS_BITS_4V | __DIRTY_BITS_4V |
			       _PAGE_EXEC_4V);
	PAGE_KERNEL_LOCKED = PAGE_KERNEL;

	_PAGE_IE = _PAGE_IE_4V;
	_PAGE_E = _PAGE_E_4V;
	_PAGE_CACHE = page_cache4v_flag;

#ifdef CONFIG_DEBUG_PAGEALLOC
	kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
#else
	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
		PAGE_OFFSET;
#endif
	kern_linear_pte_xor[0] |= (page_cache4v_flag | _PAGE_P_4V |
				   _PAGE_W_4V);

	for (i = 1; i < 4; i++)
		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];

	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V |
		     __ACCESS_BITS_4V | _PAGE_E_4V);

	_PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V |
			     _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V |
			     _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
			     _PAGE_SZ64K_4V | _PAGE_SZ8K_4V);

	page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | page_cache4v_flag;
	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
		       __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V);
	page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
		     __ACCESS_BITS_4V | _PAGE_EXEC_4V);
	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
			 __ACCESS_BITS_4V | _PAGE_EXEC_4V);

	page_exec_bit = _PAGE_EXEC_4V;

	prot_init_common(page_none, page_shared, page_copy, page_readonly,
			 page_exec_bit);
}

unsigned long pte_sz_bits(unsigned long sz)
{
	if (tlb_type == hypervisor) {
		switch (sz) {
		case 8 * 1024:
		default:
			return _PAGE_SZ8K_4V;
		case 64 * 1024:
			return _PAGE_SZ64K_4V;
		case 512 * 1024:
			return _PAGE_SZ512K_4V;
		case 4 * 1024 * 1024:
			return _PAGE_SZ4MB_4V;
		}
	} else {
		switch (sz) {
		case 8 * 1024:
		default:
			return _PAGE_SZ8K_4U;
		case 64 * 1024:
			return _PAGE_SZ64K_4U;
		case 512 * 1024:
			return _PAGE_SZ512K_4U;
		case 4 * 1024 * 1024:
			return _PAGE_SZ4MB_4U;
		}
	}
}

pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space, unsigned long page_size)
{
	pte_t pte;

	pte_val(pte) = page | pgprot_val(pgprot_noncached(prot));
	pte_val(pte) |= (((unsigned long)space) << 32);
	pte_val(pte) |= pte_sz_bits(page_size);

	return pte;
}

static unsigned long kern_large_tte(unsigned long paddr)
{
	unsigned long val;

	val = (_PAGE_VALID | _PAGE_SZ4MB_4U |
	       _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U |
	       _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U);
	if (tlb_type == hypervisor)
		val = (_PAGE_VALID | _PAGE_SZ4MB_4V |
		       page_cache4v_flag | _PAGE_P_4V |
		       _PAGE_EXEC_4V | _PAGE_W_4V);

	return val | paddr;
}

/* If not locked, zap it. */
void __flush_tlb_all(void)
{
	unsigned long pstate;
	int i;

	__asm__ __volatile__("flushw\n\t"
			     "rdpr %%pstate, %0\n\t"
			     "wrpr %0, %1, %%pstate"
			     : "=r" (pstate)
			     : "i" (PSTATE_IE));
	if (tlb_type == hypervisor) {
		sun4v_mmu_demap_all();
	} else if (tlb_type == spitfire) {
		for (i = 0; i < 64; i++) {
			/* Spitfire Errata #32 workaround */
			/* NOTE: Always runs on spitfire, so no
			 *       cheetah+ page size encodings.
			 */
			__asm__ __volatile__("stxa %0, [%1] %2\n\t"
					     "flush %%g6"
					     : /* No outputs */
					     : "r" (0),
					       "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));

			if (!(spitfire_get_dtlb_data(i) & _PAGE_L_4U)) {
				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
						     "membar #Sync"
						     : /* no outputs */
						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
				spitfire_put_dtlb_data(i, 0x0UL);
			}

			/* Spitfire Errata #32 workaround */
			/* NOTE: Always runs on spitfire, so no
			 *       cheetah+ page size encodings.
2903 */ 2904 __asm__ __volatile__("stxa %0, [%1] %2\n\t" 2905 "flush %%g6" 2906 : /* No outputs */ 2907 : "r" (0), 2908 "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); 2909 2910 if (!(spitfire_get_itlb_data(i) & _PAGE_L_4U)) { 2911 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 2912 "membar #Sync" 2913 : /* no outputs */ 2914 : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU)); 2915 spitfire_put_itlb_data(i, 0x0UL); 2916 } 2917 } 2918 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 2919 cheetah_flush_dtlb_all(); 2920 cheetah_flush_itlb_all(); 2921 } 2922 __asm__ __volatile__("wrpr %0, 0, %%pstate" 2923 : : "r" (pstate)); 2924 } 2925 2926 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, 2927 unsigned long address) 2928 { 2929 struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 2930 pte_t *pte = NULL; 2931 2932 if (page) 2933 pte = (pte_t *) page_address(page); 2934 2935 return pte; 2936 } 2937 2938 pgtable_t pte_alloc_one(struct mm_struct *mm, 2939 unsigned long address) 2940 { 2941 struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 2942 if (!page) 2943 return NULL; 2944 if (!pgtable_page_ctor(page)) { 2945 free_hot_cold_page(page, 0); 2946 return NULL; 2947 } 2948 return (pte_t *) page_address(page); 2949 } 2950 2951 void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 2952 { 2953 free_page((unsigned long)pte); 2954 } 2955 2956 static void __pte_free(pgtable_t pte) 2957 { 2958 struct page *page = virt_to_page(pte); 2959 2960 pgtable_page_dtor(page); 2961 __free_page(page); 2962 } 2963 2964 void pte_free(struct mm_struct *mm, pgtable_t pte) 2965 { 2966 __pte_free(pte); 2967 } 2968 2969 void pgtable_free(void *table, bool is_page) 2970 { 2971 if (is_page) 2972 __pte_free(table); 2973 else 2974 kmem_cache_free(pgtable_cache, table); 2975 } 2976 2977 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2978 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 2979 pmd_t *pmd) 2980 { 2981 unsigned long pte, flags; 2982 struct mm_struct *mm; 2983 pmd_t entry = *pmd; 2984 2985 if (!pmd_large(entry) || !pmd_young(entry)) 2986 return; 2987 2988 pte = pmd_val(entry); 2989 2990 /* Don't insert a non-valid PMD into the TSB, we'll deadlock. */ 2991 if (!(pte & _PAGE_VALID)) 2992 return; 2993 2994 /* We are fabricating 8MB pages using 4MB real hw pages. 
	 */
	pte |= (addr & (1UL << REAL_HPAGE_SHIFT));

	mm = vma->vm_mm;

	spin_lock_irqsave(&mm->context.lock, flags);

	if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL)
		__update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
					addr, pte);

	spin_unlock_irqrestore(&mm->context.lock, flags);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
static void context_reload(void *__data)
{
	struct mm_struct *mm = __data;

	if (mm == current->mm)
		load_secondary_context(mm);
}

void hugetlb_setup(struct pt_regs *regs)
{
	struct mm_struct *mm = current->mm;
	struct tsb_config *tp;

	if (faulthandler_disabled() || !mm) {
		const struct exception_table_entry *entry;

		entry = search_exception_tables(regs->tpc);
		if (entry) {
			regs->tpc = entry->fixup;
			regs->tnpc = regs->tpc + 4;
			return;
		}
		pr_alert("Unexpected HugeTLB setup in atomic context.\n");
		die_if_kernel("HugeTSB in atomic", regs);
	}

	tp = &mm->context.tsb_block[MM_TSB_HUGE];
	if (likely(tp->tsb == NULL))
		tsb_grow(mm, MM_TSB_HUGE, 0);

	tsb_context_switch(mm);
	smp_tsb_sync(mm);

	/* On UltraSPARC-III+ and later, configure the second half of
	 * the Data-TLB for huge pages.
	 */
	if (tlb_type == cheetah_plus) {
		bool need_context_reload = false;
		unsigned long ctx;

		spin_lock_irq(&ctx_alloc_lock);
		ctx = mm->context.sparc64_ctx_val;
		ctx &= ~CTX_PGSZ_MASK;
		ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
		ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT;

		if (ctx != mm->context.sparc64_ctx_val) {
			/* When changing the page size fields, we
			 * must perform a context flush so that no
			 * stale entries match.  This flush must
			 * occur with the original context register
			 * settings.
			 */
			do_flush_tlb_mm(mm);

			/* Reload the context register of all processors
			 * also executing in this address space.
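			 * This is done below via on_each_cpu(context_reload, mm, 0)
			 * once ctx_alloc_lock has been dropped.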
3067 */ 3068 mm->context.sparc64_ctx_val = ctx; 3069 need_context_reload = true; 3070 } 3071 spin_unlock_irq(&ctx_alloc_lock); 3072 3073 if (need_context_reload) 3074 on_each_cpu(context_reload, mm, 0); 3075 } 3076 } 3077 #endif 3078 3079 static struct resource code_resource = { 3080 .name = "Kernel code", 3081 .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM 3082 }; 3083 3084 static struct resource data_resource = { 3085 .name = "Kernel data", 3086 .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM 3087 }; 3088 3089 static struct resource bss_resource = { 3090 .name = "Kernel bss", 3091 .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM 3092 }; 3093 3094 static inline resource_size_t compute_kern_paddr(void *addr) 3095 { 3096 return (resource_size_t) (addr - KERNBASE + kern_base); 3097 } 3098 3099 static void __init kernel_lds_init(void) 3100 { 3101 code_resource.start = compute_kern_paddr(_text); 3102 code_resource.end = compute_kern_paddr(_etext - 1); 3103 data_resource.start = compute_kern_paddr(_etext); 3104 data_resource.end = compute_kern_paddr(_edata - 1); 3105 bss_resource.start = compute_kern_paddr(__bss_start); 3106 bss_resource.end = compute_kern_paddr(_end - 1); 3107 } 3108 3109 static int __init report_memory(void) 3110 { 3111 int i; 3112 struct resource *res; 3113 3114 kernel_lds_init(); 3115 3116 for (i = 0; i < pavail_ents; i++) { 3117 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 3118 3119 if (!res) { 3120 pr_warn("Failed to allocate source.\n"); 3121 break; 3122 } 3123 3124 res->name = "System RAM"; 3125 res->start = pavail[i].phys_addr; 3126 res->end = pavail[i].phys_addr + pavail[i].reg_size - 1; 3127 res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; 3128 3129 if (insert_resource(&iomem_resource, res) < 0) { 3130 pr_warn("Resource insertion failed.\n"); 3131 break; 3132 } 3133 3134 insert_resource(res, &code_resource); 3135 insert_resource(res, &data_resource); 3136 insert_resource(res, &bss_resource); 3137 } 3138 3139 return 0; 3140 } 3141 arch_initcall(report_memory); 3142 3143 #ifdef CONFIG_SMP 3144 #define do_flush_tlb_kernel_range smp_flush_tlb_kernel_range 3145 #else 3146 #define do_flush_tlb_kernel_range __flush_tlb_kernel_range 3147 #endif 3148 3149 void flush_tlb_kernel_range(unsigned long start, unsigned long end) 3150 { 3151 if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) { 3152 if (start < LOW_OBP_ADDRESS) { 3153 flush_tsb_kernel_range(start, LOW_OBP_ADDRESS); 3154 do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS); 3155 } 3156 if (end > HI_OBP_ADDRESS) { 3157 flush_tsb_kernel_range(HI_OBP_ADDRESS, end); 3158 do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end); 3159 } 3160 } else { 3161 flush_tsb_kernel_range(start, end); 3162 do_flush_tlb_kernel_range(start, end); 3163 } 3164 } 3165