1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * arch/sparc64/mm/init.c 4 * 5 * Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu) 6 * Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 7 */ 8 9 #include <linux/extable.h> 10 #include <linux/kernel.h> 11 #include <linux/sched.h> 12 #include <linux/string.h> 13 #include <linux/init.h> 14 #include <linux/memblock.h> 15 #include <linux/mm.h> 16 #include <linux/hugetlb.h> 17 #include <linux/initrd.h> 18 #include <linux/swap.h> 19 #include <linux/pagemap.h> 20 #include <linux/poison.h> 21 #include <linux/fs.h> 22 #include <linux/seq_file.h> 23 #include <linux/kprobes.h> 24 #include <linux/cache.h> 25 #include <linux/sort.h> 26 #include <linux/ioport.h> 27 #include <linux/percpu.h> 28 #include <linux/mmzone.h> 29 #include <linux/gfp.h> 30 31 #include <asm/head.h> 32 #include <asm/page.h> 33 #include <asm/pgalloc.h> 34 #include <asm/pgtable.h> 35 #include <asm/oplib.h> 36 #include <asm/iommu.h> 37 #include <asm/io.h> 38 #include <linux/uaccess.h> 39 #include <asm/mmu_context.h> 40 #include <asm/tlbflush.h> 41 #include <asm/dma.h> 42 #include <asm/starfire.h> 43 #include <asm/tlb.h> 44 #include <asm/spitfire.h> 45 #include <asm/sections.h> 46 #include <asm/tsb.h> 47 #include <asm/hypervisor.h> 48 #include <asm/prom.h> 49 #include <asm/mdesc.h> 50 #include <asm/cpudata.h> 51 #include <asm/setup.h> 52 #include <asm/irq.h> 53 54 #include "init_64.h" 55 56 unsigned long kern_linear_pte_xor[4] __read_mostly; 57 static unsigned long page_cache4v_flag; 58 59 /* A bitmap, two bits for every 256MB of physical memory. These two 60 * bits determine what page size we use for kernel linear 61 * translations. They form an index into kern_linear_pte_xor[]. The 62 * value in the indexed slot is XOR'd with the TLB miss virtual 63 * address to form the resulting TTE. The mapping is: 64 * 65 * 0 ==> 4MB 66 * 1 ==> 256MB 67 * 2 ==> 2GB 68 * 3 ==> 16GB 69 * 70 * All sun4v chips support 256MB pages. Only SPARC-T4 and later 71 * support 2GB pages, and hopefully future cpus will support the 16GB 72 * pages as well. For slots 2 and 3, we encode a 256MB TTE xor there 73 * if these larger page sizes are not supported by the cpu. 74 * 75 * It would be nice to determine this from the machine description 76 * 'cpu' properties, but we need to have this table setup before the 77 * MDESC is initialized. 78 */ 79 80 #ifndef CONFIG_DEBUG_PAGEALLOC 81 /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings. 
 * Space is allocated for this right after the trap table in
 * arch/sparc64/kernel/head.S
 */
extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
#endif
extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];

static unsigned long cpu_pgsz_mask;

#define MAX_BANKS	1024

static struct linux_prom64_registers pavail[MAX_BANKS];
static int pavail_ents;

u64 numa_latency[MAX_NUMNODES][MAX_NUMNODES];

static int cmp_p64(const void *a, const void *b)
{
	const struct linux_prom64_registers *x = a, *y = b;

	if (x->phys_addr > y->phys_addr)
		return 1;
	if (x->phys_addr < y->phys_addr)
		return -1;
	return 0;
}

static void __init read_obp_memory(const char *property,
				   struct linux_prom64_registers *regs,
				   int *num_ents)
{
	phandle node = prom_finddevice("/memory");
	int prop_size = prom_getproplen(node, property);
	int ents, ret, i;

	ents = prop_size / sizeof(struct linux_prom64_registers);
	if (ents > MAX_BANKS) {
		prom_printf("The machine has more %s property entries than "
			    "this kernel can support (%d).\n",
			    property, MAX_BANKS);
		prom_halt();
	}

	ret = prom_getproperty(node, property, (char *) regs, prop_size);
	if (ret == -1) {
		prom_printf("Couldn't get %s property from /memory.\n",
			    property);
		prom_halt();
	}

	/* Sanitize what we got from the firmware, by page aligning
	 * everything.
	 */
	for (i = 0; i < ents; i++) {
		unsigned long base, size;

		base = regs[i].phys_addr;
		size = regs[i].reg_size;

		size &= PAGE_MASK;
		if (base & ~PAGE_MASK) {
			unsigned long new_base = PAGE_ALIGN(base);

			size -= new_base - base;
			if ((long) size < 0L)
				size = 0UL;
			base = new_base;
		}
		if (size == 0UL) {
			/* If it is empty, simply get rid of it.
			 * This simplifies the logic of the other
			 * functions that process these arrays.
			 */
			memmove(&regs[i], &regs[i + 1],
				(ents - i - 1) * sizeof(regs[0]));
			i--;
			ents--;
			continue;
		}
		regs[i].phys_addr = base;
		regs[i].reg_size = size;
	}

	*num_ents = ents;

	sort(regs, ents, sizeof(struct linux_prom64_registers),
	     cmp_p64, NULL);
}

/* Kernel physical address base and size in bytes.
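 * kern_base is the 4MB-aligned physical address the kernel image was
 * loaded at, and kern_size the size of the image; both are computed in
 * paging_init() below.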
*/ 172 unsigned long kern_base __read_mostly; 173 unsigned long kern_size __read_mostly; 174 175 /* Initial ramdisk setup */ 176 extern unsigned long sparc_ramdisk_image64; 177 extern unsigned int sparc_ramdisk_image; 178 extern unsigned int sparc_ramdisk_size; 179 180 struct page *mem_map_zero __read_mostly; 181 EXPORT_SYMBOL(mem_map_zero); 182 183 unsigned int sparc64_highest_unlocked_tlb_ent __read_mostly; 184 185 unsigned long sparc64_kern_pri_context __read_mostly; 186 unsigned long sparc64_kern_pri_nuc_bits __read_mostly; 187 unsigned long sparc64_kern_sec_context __read_mostly; 188 189 int num_kernel_image_mappings; 190 191 #ifdef CONFIG_DEBUG_DCFLUSH 192 atomic_t dcpage_flushes = ATOMIC_INIT(0); 193 #ifdef CONFIG_SMP 194 atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0); 195 #endif 196 #endif 197 198 inline void flush_dcache_page_impl(struct page *page) 199 { 200 BUG_ON(tlb_type == hypervisor); 201 #ifdef CONFIG_DEBUG_DCFLUSH 202 atomic_inc(&dcpage_flushes); 203 #endif 204 205 #ifdef DCACHE_ALIASING_POSSIBLE 206 __flush_dcache_page(page_address(page), 207 ((tlb_type == spitfire) && 208 page_mapping_file(page) != NULL)); 209 #else 210 if (page_mapping_file(page) != NULL && 211 tlb_type == spitfire) 212 __flush_icache_page(__pa(page_address(page))); 213 #endif 214 } 215 216 #define PG_dcache_dirty PG_arch_1 217 #define PG_dcache_cpu_shift 32UL 218 #define PG_dcache_cpu_mask \ 219 ((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL) 220 221 #define dcache_dirty_cpu(page) \ 222 (((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) 223 224 static inline void set_dcache_dirty(struct page *page, int this_cpu) 225 { 226 unsigned long mask = this_cpu; 227 unsigned long non_cpu_bits; 228 229 non_cpu_bits = ~(PG_dcache_cpu_mask << PG_dcache_cpu_shift); 230 mask = (mask << PG_dcache_cpu_shift) | (1UL << PG_dcache_dirty); 231 232 __asm__ __volatile__("1:\n\t" 233 "ldx [%2], %%g7\n\t" 234 "and %%g7, %1, %%g1\n\t" 235 "or %%g1, %0, %%g1\n\t" 236 "casx [%2], %%g7, %%g1\n\t" 237 "cmp %%g7, %%g1\n\t" 238 "bne,pn %%xcc, 1b\n\t" 239 " nop" 240 : /* no outputs */ 241 : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) 242 : "g1", "g7"); 243 } 244 245 static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) 246 { 247 unsigned long mask = (1UL << PG_dcache_dirty); 248 249 __asm__ __volatile__("! 
test_and_clear_dcache_dirty\n" 250 "1:\n\t" 251 "ldx [%2], %%g7\n\t" 252 "srlx %%g7, %4, %%g1\n\t" 253 "and %%g1, %3, %%g1\n\t" 254 "cmp %%g1, %0\n\t" 255 "bne,pn %%icc, 2f\n\t" 256 " andn %%g7, %1, %%g1\n\t" 257 "casx [%2], %%g7, %%g1\n\t" 258 "cmp %%g7, %%g1\n\t" 259 "bne,pn %%xcc, 1b\n\t" 260 " nop\n" 261 "2:" 262 : /* no outputs */ 263 : "r" (cpu), "r" (mask), "r" (&page->flags), 264 "i" (PG_dcache_cpu_mask), 265 "i" (PG_dcache_cpu_shift) 266 : "g1", "g7"); 267 } 268 269 static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long pte) 270 { 271 unsigned long tsb_addr = (unsigned long) ent; 272 273 if (tlb_type == cheetah_plus || tlb_type == hypervisor) 274 tsb_addr = __pa(tsb_addr); 275 276 __tsb_insert(tsb_addr, tag, pte); 277 } 278 279 unsigned long _PAGE_ALL_SZ_BITS __read_mostly; 280 281 static void flush_dcache(unsigned long pfn) 282 { 283 struct page *page; 284 285 page = pfn_to_page(pfn); 286 if (page) { 287 unsigned long pg_flags; 288 289 pg_flags = page->flags; 290 if (pg_flags & (1UL << PG_dcache_dirty)) { 291 int cpu = ((pg_flags >> PG_dcache_cpu_shift) & 292 PG_dcache_cpu_mask); 293 int this_cpu = get_cpu(); 294 295 /* This is just to optimize away some function calls 296 * in the SMP case. 297 */ 298 if (cpu == this_cpu) 299 flush_dcache_page_impl(page); 300 else 301 smp_flush_dcache_page_impl(page, cpu); 302 303 clear_dcache_dirty_cpu(page, cpu); 304 305 put_cpu(); 306 } 307 } 308 } 309 310 /* mm->context.lock must be held */ 311 static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index, 312 unsigned long tsb_hash_shift, unsigned long address, 313 unsigned long tte) 314 { 315 struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb; 316 unsigned long tag; 317 318 if (unlikely(!tsb)) 319 return; 320 321 tsb += ((address >> tsb_hash_shift) & 322 (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); 323 tag = (address >> 22UL); 324 tsb_insert(tsb, tag, tte); 325 } 326 327 #ifdef CONFIG_HUGETLB_PAGE 328 static void __init add_huge_page_size(unsigned long size) 329 { 330 unsigned int order; 331 332 if (size_to_hstate(size)) 333 return; 334 335 order = ilog2(size) - PAGE_SHIFT; 336 hugetlb_add_hstate(order); 337 } 338 339 static int __init hugetlbpage_init(void) 340 { 341 add_huge_page_size(1UL << HPAGE_64K_SHIFT); 342 add_huge_page_size(1UL << HPAGE_SHIFT); 343 add_huge_page_size(1UL << HPAGE_256MB_SHIFT); 344 add_huge_page_size(1UL << HPAGE_2GB_SHIFT); 345 346 return 0; 347 } 348 349 arch_initcall(hugetlbpage_init); 350 351 static void __init pud_huge_patch(void) 352 { 353 struct pud_huge_patch_entry *p; 354 unsigned long addr; 355 356 p = &__pud_huge_patch; 357 addr = p->addr; 358 *(unsigned int *)addr = p->insn; 359 360 __asm__ __volatile__("flush %0" : : "r" (addr)); 361 } 362 363 static int __init setup_hugepagesz(char *string) 364 { 365 unsigned long long hugepage_size; 366 unsigned int hugepage_shift; 367 unsigned short hv_pgsz_idx; 368 unsigned int hv_pgsz_mask; 369 int rc = 0; 370 371 hugepage_size = memparse(string, &string); 372 hugepage_shift = ilog2(hugepage_size); 373 374 switch (hugepage_shift) { 375 case HPAGE_16GB_SHIFT: 376 hv_pgsz_mask = HV_PGSZ_MASK_16GB; 377 hv_pgsz_idx = HV_PGSZ_IDX_16GB; 378 pud_huge_patch(); 379 break; 380 case HPAGE_2GB_SHIFT: 381 hv_pgsz_mask = HV_PGSZ_MASK_2GB; 382 hv_pgsz_idx = HV_PGSZ_IDX_2GB; 383 break; 384 case HPAGE_256MB_SHIFT: 385 hv_pgsz_mask = HV_PGSZ_MASK_256MB; 386 hv_pgsz_idx = HV_PGSZ_IDX_256MB; 387 break; 388 case HPAGE_SHIFT: 389 hv_pgsz_mask = HV_PGSZ_MASK_4MB; 390 
hv_pgsz_idx = HV_PGSZ_IDX_4MB; 391 break; 392 case HPAGE_64K_SHIFT: 393 hv_pgsz_mask = HV_PGSZ_MASK_64K; 394 hv_pgsz_idx = HV_PGSZ_IDX_64K; 395 break; 396 default: 397 hv_pgsz_mask = 0; 398 } 399 400 if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) { 401 hugetlb_bad_size(); 402 pr_err("hugepagesz=%llu not supported by MMU.\n", 403 hugepage_size); 404 goto out; 405 } 406 407 add_huge_page_size(hugepage_size); 408 rc = 1; 409 410 out: 411 return rc; 412 } 413 __setup("hugepagesz=", setup_hugepagesz); 414 #endif /* CONFIG_HUGETLB_PAGE */ 415 416 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) 417 { 418 struct mm_struct *mm; 419 unsigned long flags; 420 bool is_huge_tsb; 421 pte_t pte = *ptep; 422 423 if (tlb_type != hypervisor) { 424 unsigned long pfn = pte_pfn(pte); 425 426 if (pfn_valid(pfn)) 427 flush_dcache(pfn); 428 } 429 430 mm = vma->vm_mm; 431 432 /* Don't insert a non-valid PTE into the TSB, we'll deadlock. */ 433 if (!pte_accessible(mm, pte)) 434 return; 435 436 spin_lock_irqsave(&mm->context.lock, flags); 437 438 is_huge_tsb = false; 439 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 440 if (mm->context.hugetlb_pte_count || mm->context.thp_pte_count) { 441 unsigned long hugepage_size = PAGE_SIZE; 442 443 if (is_vm_hugetlb_page(vma)) 444 hugepage_size = huge_page_size(hstate_vma(vma)); 445 446 if (hugepage_size >= PUD_SIZE) { 447 unsigned long mask = 0x1ffc00000UL; 448 449 /* Transfer bits [32:22] from address to resolve 450 * at 4M granularity. 451 */ 452 pte_val(pte) &= ~mask; 453 pte_val(pte) |= (address & mask); 454 } else if (hugepage_size >= PMD_SIZE) { 455 /* We are fabricating 8MB pages using 4MB 456 * real hw pages. 457 */ 458 pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT)); 459 } 460 461 if (hugepage_size >= PMD_SIZE) { 462 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, 463 REAL_HPAGE_SHIFT, address, pte_val(pte)); 464 is_huge_tsb = true; 465 } 466 } 467 #endif 468 if (!is_huge_tsb) 469 __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, 470 address, pte_val(pte)); 471 472 spin_unlock_irqrestore(&mm->context.lock, flags); 473 } 474 475 void flush_dcache_page(struct page *page) 476 { 477 struct address_space *mapping; 478 int this_cpu; 479 480 if (tlb_type == hypervisor) 481 return; 482 483 /* Do not bother with the expensive D-cache flush if it 484 * is merely the zero page. The 'bigcore' testcase in GDB 485 * causes this case to run millions of times. 486 */ 487 if (page == ZERO_PAGE(0)) 488 return; 489 490 this_cpu = get_cpu(); 491 492 mapping = page_mapping_file(page); 493 if (mapping && !mapping_mapped(mapping)) { 494 int dirty = test_bit(PG_dcache_dirty, &page->flags); 495 if (dirty) { 496 int dirty_cpu = dcache_dirty_cpu(page); 497 498 if (dirty_cpu == this_cpu) 499 goto out; 500 smp_flush_dcache_page_impl(page, dirty_cpu); 501 } 502 set_dcache_dirty(page, this_cpu); 503 } else { 504 /* We could delay the flush for the !page_mapping 505 * case too. But that case is for exec env/arg 506 * pages and those are %99 certainly going to get 507 * faulted into the tlb (and thus flushed) anyways. 508 */ 509 flush_dcache_page_impl(page); 510 } 511 512 out: 513 put_cpu(); 514 } 515 EXPORT_SYMBOL(flush_dcache_page); 516 517 void __kprobes flush_icache_range(unsigned long start, unsigned long end) 518 { 519 /* Cheetah and Hypervisor platform cpus have coherent I-cache. 
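 * Only older Spitfire cpus need the explicit flush done below.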
*/ 520 if (tlb_type == spitfire) { 521 unsigned long kaddr; 522 523 /* This code only runs on Spitfire cpus so this is 524 * why we can assume _PAGE_PADDR_4U. 525 */ 526 for (kaddr = start; kaddr < end; kaddr += PAGE_SIZE) { 527 unsigned long paddr, mask = _PAGE_PADDR_4U; 528 529 if (kaddr >= PAGE_OFFSET) 530 paddr = kaddr & mask; 531 else { 532 pgd_t *pgdp = pgd_offset_k(kaddr); 533 p4d_t *p4dp = p4d_offset(pgdp, kaddr); 534 pud_t *pudp = pud_offset(p4dp, kaddr); 535 pmd_t *pmdp = pmd_offset(pudp, kaddr); 536 pte_t *ptep = pte_offset_kernel(pmdp, kaddr); 537 538 paddr = pte_val(*ptep) & mask; 539 } 540 __flush_icache_page(paddr); 541 } 542 } 543 } 544 EXPORT_SYMBOL(flush_icache_range); 545 546 void mmu_info(struct seq_file *m) 547 { 548 static const char *pgsz_strings[] = { 549 "8K", "64K", "512K", "4MB", "32MB", 550 "256MB", "2GB", "16GB", 551 }; 552 int i, printed; 553 554 if (tlb_type == cheetah) 555 seq_printf(m, "MMU Type\t: Cheetah\n"); 556 else if (tlb_type == cheetah_plus) 557 seq_printf(m, "MMU Type\t: Cheetah+\n"); 558 else if (tlb_type == spitfire) 559 seq_printf(m, "MMU Type\t: Spitfire\n"); 560 else if (tlb_type == hypervisor) 561 seq_printf(m, "MMU Type\t: Hypervisor (sun4v)\n"); 562 else 563 seq_printf(m, "MMU Type\t: ???\n"); 564 565 seq_printf(m, "MMU PGSZs\t: "); 566 printed = 0; 567 for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) { 568 if (cpu_pgsz_mask & (1UL << i)) { 569 seq_printf(m, "%s%s", 570 printed ? "," : "", pgsz_strings[i]); 571 printed++; 572 } 573 } 574 seq_putc(m, '\n'); 575 576 #ifdef CONFIG_DEBUG_DCFLUSH 577 seq_printf(m, "DCPageFlushes\t: %d\n", 578 atomic_read(&dcpage_flushes)); 579 #ifdef CONFIG_SMP 580 seq_printf(m, "DCPageFlushesXC\t: %d\n", 581 atomic_read(&dcpage_flushes_xcall)); 582 #endif /* CONFIG_SMP */ 583 #endif /* CONFIG_DEBUG_DCFLUSH */ 584 } 585 586 struct linux_prom_translation prom_trans[512] __read_mostly; 587 unsigned int prom_trans_ents __read_mostly; 588 589 unsigned long kern_locked_tte_data; 590 591 /* The obp translations are saved based on 8k pagesize, since obp can 592 * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS -> 593 * HI_OBP_ADDRESS range are handled in ktlb.S. 594 */ 595 static inline int in_obp_range(unsigned long vaddr) 596 { 597 return (vaddr >= LOW_OBP_ADDRESS && 598 vaddr < HI_OBP_ADDRESS); 599 } 600 601 static int cmp_ptrans(const void *a, const void *b) 602 { 603 const struct linux_prom_translation *x = a, *y = b; 604 605 if (x->virt > y->virt) 606 return 1; 607 if (x->virt < y->virt) 608 return -1; 609 return 0; 610 } 611 612 /* Read OBP translations property into 'prom_trans[]'. */ 613 static void __init read_obp_translations(void) 614 { 615 int n, node, ents, first, last, i; 616 617 node = prom_finddevice("/virtual-memory"); 618 n = prom_getproplen(node, "translations"); 619 if (unlikely(n == 0 || n == -1)) { 620 prom_printf("prom_mappings: Couldn't get size.\n"); 621 prom_halt(); 622 } 623 if (unlikely(n > sizeof(prom_trans))) { 624 prom_printf("prom_mappings: Size %d is too big.\n", n); 625 prom_halt(); 626 } 627 628 if ((n = prom_getproperty(node, "translations", 629 (char *)&prom_trans[0], 630 sizeof(prom_trans))) == -1) { 631 prom_printf("prom_mappings: Couldn't get property.\n"); 632 prom_halt(); 633 } 634 635 n = n / sizeof(struct linux_prom_translation); 636 637 ents = n; 638 639 sort(prom_trans, ents, sizeof(struct linux_prom_translation), 640 cmp_ptrans, NULL); 641 642 /* Now kick out all the non-OBP entries. 
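 * (entries whose virtual address lies outside the LOW_OBP_ADDRESS..
 * HI_OBP_ADDRESS window tested by in_obp_range() above).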
*/ 643 for (i = 0; i < ents; i++) { 644 if (in_obp_range(prom_trans[i].virt)) 645 break; 646 } 647 first = i; 648 for (; i < ents; i++) { 649 if (!in_obp_range(prom_trans[i].virt)) 650 break; 651 } 652 last = i; 653 654 for (i = 0; i < (last - first); i++) { 655 struct linux_prom_translation *src = &prom_trans[i + first]; 656 struct linux_prom_translation *dest = &prom_trans[i]; 657 658 *dest = *src; 659 } 660 for (; i < ents; i++) { 661 struct linux_prom_translation *dest = &prom_trans[i]; 662 dest->virt = dest->size = dest->data = 0x0UL; 663 } 664 665 prom_trans_ents = last - first; 666 667 if (tlb_type == spitfire) { 668 /* Clear diag TTE bits. */ 669 for (i = 0; i < prom_trans_ents; i++) 670 prom_trans[i].data &= ~0x0003fe0000000000UL; 671 } 672 673 /* Force execute bit on. */ 674 for (i = 0; i < prom_trans_ents; i++) 675 prom_trans[i].data |= (tlb_type == hypervisor ? 676 _PAGE_EXEC_4V : _PAGE_EXEC_4U); 677 } 678 679 static void __init hypervisor_tlb_lock(unsigned long vaddr, 680 unsigned long pte, 681 unsigned long mmu) 682 { 683 unsigned long ret = sun4v_mmu_map_perm_addr(vaddr, 0, pte, mmu); 684 685 if (ret != 0) { 686 prom_printf("hypervisor_tlb_lock[%lx:%x:%lx:%lx]: " 687 "errors with %lx\n", vaddr, 0, pte, mmu, ret); 688 prom_halt(); 689 } 690 } 691 692 static unsigned long kern_large_tte(unsigned long paddr); 693 694 static void __init remap_kernel(void) 695 { 696 unsigned long phys_page, tte_vaddr, tte_data; 697 int i, tlb_ent = sparc64_highest_locked_tlbent(); 698 699 tte_vaddr = (unsigned long) KERNBASE; 700 phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB; 701 tte_data = kern_large_tte(phys_page); 702 703 kern_locked_tte_data = tte_data; 704 705 /* Now lock us into the TLBs via Hypervisor or OBP. */ 706 if (tlb_type == hypervisor) { 707 for (i = 0; i < num_kernel_image_mappings; i++) { 708 hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU); 709 hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU); 710 tte_vaddr += 0x400000; 711 tte_data += 0x400000; 712 } 713 } else { 714 for (i = 0; i < num_kernel_image_mappings; i++) { 715 prom_dtlb_load(tlb_ent - i, tte_data, tte_vaddr); 716 prom_itlb_load(tlb_ent - i, tte_data, tte_vaddr); 717 tte_vaddr += 0x400000; 718 tte_data += 0x400000; 719 } 720 sparc64_highest_unlocked_tlb_ent = tlb_ent - i; 721 } 722 if (tlb_type == cheetah_plus) { 723 sparc64_kern_pri_context = (CTX_CHEETAH_PLUS_CTX0 | 724 CTX_CHEETAH_PLUS_NUC); 725 sparc64_kern_pri_nuc_bits = CTX_CHEETAH_PLUS_NUC; 726 sparc64_kern_sec_context = CTX_CHEETAH_PLUS_CTX0; 727 } 728 } 729 730 731 static void __init inherit_prom_mappings(void) 732 { 733 /* Now fixup OBP's idea about where we really are mapped. */ 734 printk("Remapping the kernel... 
"); 735 remap_kernel(); 736 printk("done.\n"); 737 } 738 739 void prom_world(int enter) 740 { 741 if (!enter) 742 set_fs(get_fs()); 743 744 __asm__ __volatile__("flushw"); 745 } 746 747 void __flush_dcache_range(unsigned long start, unsigned long end) 748 { 749 unsigned long va; 750 751 if (tlb_type == spitfire) { 752 int n = 0; 753 754 for (va = start; va < end; va += 32) { 755 spitfire_put_dcache_tag(va & 0x3fe0, 0x0); 756 if (++n >= 512) 757 break; 758 } 759 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 760 start = __pa(start); 761 end = __pa(end); 762 for (va = start; va < end; va += 32) 763 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 764 "membar #Sync" 765 : /* no outputs */ 766 : "r" (va), 767 "i" (ASI_DCACHE_INVALIDATE)); 768 } 769 } 770 EXPORT_SYMBOL(__flush_dcache_range); 771 772 /* get_new_mmu_context() uses "cache + 1". */ 773 DEFINE_SPINLOCK(ctx_alloc_lock); 774 unsigned long tlb_context_cache = CTX_FIRST_VERSION; 775 #define MAX_CTX_NR (1UL << CTX_NR_BITS) 776 #define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR) 777 DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); 778 DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0}; 779 780 static void mmu_context_wrap(void) 781 { 782 unsigned long old_ver = tlb_context_cache & CTX_VERSION_MASK; 783 unsigned long new_ver, new_ctx, old_ctx; 784 struct mm_struct *mm; 785 int cpu; 786 787 bitmap_zero(mmu_context_bmap, 1 << CTX_NR_BITS); 788 789 /* Reserve kernel context */ 790 set_bit(0, mmu_context_bmap); 791 792 new_ver = (tlb_context_cache & CTX_VERSION_MASK) + CTX_FIRST_VERSION; 793 if (unlikely(new_ver == 0)) 794 new_ver = CTX_FIRST_VERSION; 795 tlb_context_cache = new_ver; 796 797 /* 798 * Make sure that any new mm that are added into per_cpu_secondary_mm, 799 * are going to go through get_new_mmu_context() path. 800 */ 801 mb(); 802 803 /* 804 * Updated versions to current on those CPUs that had valid secondary 805 * contexts 806 */ 807 for_each_online_cpu(cpu) { 808 /* 809 * If a new mm is stored after we took this mm from the array, 810 * it will go into get_new_mmu_context() path, because we 811 * already bumped the version in tlb_context_cache. 812 */ 813 mm = per_cpu(per_cpu_secondary_mm, cpu); 814 815 if (unlikely(!mm || mm == &init_mm)) 816 continue; 817 818 old_ctx = mm->context.sparc64_ctx_val; 819 if (likely((old_ctx & CTX_VERSION_MASK) == old_ver)) { 820 new_ctx = (old_ctx & ~CTX_VERSION_MASK) | new_ver; 821 set_bit(new_ctx & CTX_NR_MASK, mmu_context_bmap); 822 mm->context.sparc64_ctx_val = new_ctx; 823 } 824 } 825 } 826 827 /* Caller does TLB context flushing on local CPU if necessary. 828 * The caller also ensures that CTX_VALID(mm->context) is false. 829 * 830 * We must be careful about boundary cases so that we never 831 * let the user have CTX 0 (nucleus) or we ever use a CTX 832 * version of zero (and thus NO_CONTEXT would not be caught 833 * by version mis-match tests in mmu_context.h). 834 * 835 * Always invoked with interrupts disabled. 
836 */ 837 void get_new_mmu_context(struct mm_struct *mm) 838 { 839 unsigned long ctx, new_ctx; 840 unsigned long orig_pgsz_bits; 841 842 spin_lock(&ctx_alloc_lock); 843 retry: 844 /* wrap might have happened, test again if our context became valid */ 845 if (unlikely(CTX_VALID(mm->context))) 846 goto out; 847 orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); 848 ctx = (tlb_context_cache + 1) & CTX_NR_MASK; 849 new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); 850 if (new_ctx >= (1 << CTX_NR_BITS)) { 851 new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); 852 if (new_ctx >= ctx) { 853 mmu_context_wrap(); 854 goto retry; 855 } 856 } 857 if (mm->context.sparc64_ctx_val) 858 cpumask_clear(mm_cpumask(mm)); 859 mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63)); 860 new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); 861 tlb_context_cache = new_ctx; 862 mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; 863 out: 864 spin_unlock(&ctx_alloc_lock); 865 } 866 867 static int numa_enabled = 1; 868 static int numa_debug; 869 870 static int __init early_numa(char *p) 871 { 872 if (!p) 873 return 0; 874 875 if (strstr(p, "off")) 876 numa_enabled = 0; 877 878 if (strstr(p, "debug")) 879 numa_debug = 1; 880 881 return 0; 882 } 883 early_param("numa", early_numa); 884 885 #define numadbg(f, a...) \ 886 do { if (numa_debug) \ 887 printk(KERN_INFO f, ## a); \ 888 } while (0) 889 890 static void __init find_ramdisk(unsigned long phys_base) 891 { 892 #ifdef CONFIG_BLK_DEV_INITRD 893 if (sparc_ramdisk_image || sparc_ramdisk_image64) { 894 unsigned long ramdisk_image; 895 896 /* Older versions of the bootloader only supported a 897 * 32-bit physical address for the ramdisk image 898 * location, stored at sparc_ramdisk_image. Newer 899 * SILO versions set sparc_ramdisk_image to zero and 900 * provide a full 64-bit physical address at 901 * sparc_ramdisk_image64. 902 */ 903 ramdisk_image = sparc_ramdisk_image; 904 if (!ramdisk_image) 905 ramdisk_image = sparc_ramdisk_image64; 906 907 /* Another bootloader quirk. The bootloader normalizes 908 * the physical address to KERNBASE, so we have to 909 * factor that back out and add in the lowest valid 910 * physical page address to get the true physical address. 
911 */ 912 ramdisk_image -= KERNBASE; 913 ramdisk_image += phys_base; 914 915 numadbg("Found ramdisk at physical address 0x%lx, size %u\n", 916 ramdisk_image, sparc_ramdisk_size); 917 918 initrd_start = ramdisk_image; 919 initrd_end = ramdisk_image + sparc_ramdisk_size; 920 921 memblock_reserve(initrd_start, sparc_ramdisk_size); 922 923 initrd_start += PAGE_OFFSET; 924 initrd_end += PAGE_OFFSET; 925 } 926 #endif 927 } 928 929 struct node_mem_mask { 930 unsigned long mask; 931 unsigned long match; 932 }; 933 static struct node_mem_mask node_masks[MAX_NUMNODES]; 934 static int num_node_masks; 935 936 #ifdef CONFIG_NEED_MULTIPLE_NODES 937 938 struct mdesc_mlgroup { 939 u64 node; 940 u64 latency; 941 u64 match; 942 u64 mask; 943 }; 944 945 static struct mdesc_mlgroup *mlgroups; 946 static int num_mlgroups; 947 948 int numa_cpu_lookup_table[NR_CPUS]; 949 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; 950 951 struct mdesc_mblock { 952 u64 base; 953 u64 size; 954 u64 offset; /* RA-to-PA */ 955 }; 956 static struct mdesc_mblock *mblocks; 957 static int num_mblocks; 958 959 static struct mdesc_mblock * __init addr_to_mblock(unsigned long addr) 960 { 961 struct mdesc_mblock *m = NULL; 962 int i; 963 964 for (i = 0; i < num_mblocks; i++) { 965 m = &mblocks[i]; 966 967 if (addr >= m->base && 968 addr < (m->base + m->size)) { 969 break; 970 } 971 } 972 973 return m; 974 } 975 976 static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid) 977 { 978 int prev_nid, new_nid; 979 980 prev_nid = NUMA_NO_NODE; 981 for ( ; start < end; start += PAGE_SIZE) { 982 for (new_nid = 0; new_nid < num_node_masks; new_nid++) { 983 struct node_mem_mask *p = &node_masks[new_nid]; 984 985 if ((start & p->mask) == p->match) { 986 if (prev_nid == NUMA_NO_NODE) 987 prev_nid = new_nid; 988 break; 989 } 990 } 991 992 if (new_nid == num_node_masks) { 993 prev_nid = 0; 994 WARN_ONCE(1, "addr[%Lx] doesn't match a NUMA node rule. Some memory will be owned by node 0.", 995 start); 996 break; 997 } 998 999 if (prev_nid != new_nid) 1000 break; 1001 } 1002 *nid = prev_nid; 1003 1004 return start > end ? 
end : start; 1005 } 1006 1007 static u64 __init memblock_nid_range(u64 start, u64 end, int *nid) 1008 { 1009 u64 ret_end, pa_start, m_mask, m_match, m_end; 1010 struct mdesc_mblock *mblock; 1011 int _nid, i; 1012 1013 if (tlb_type != hypervisor) 1014 return memblock_nid_range_sun4u(start, end, nid); 1015 1016 mblock = addr_to_mblock(start); 1017 if (!mblock) { 1018 WARN_ONCE(1, "memblock_nid_range: Can't find mblock addr[%Lx]", 1019 start); 1020 1021 _nid = 0; 1022 ret_end = end; 1023 goto done; 1024 } 1025 1026 pa_start = start + mblock->offset; 1027 m_match = 0; 1028 m_mask = 0; 1029 1030 for (_nid = 0; _nid < num_node_masks; _nid++) { 1031 struct node_mem_mask *const m = &node_masks[_nid]; 1032 1033 if ((pa_start & m->mask) == m->match) { 1034 m_match = m->match; 1035 m_mask = m->mask; 1036 break; 1037 } 1038 } 1039 1040 if (num_node_masks == _nid) { 1041 /* We could not find NUMA group, so default to 0, but lets 1042 * search for latency group, so we could calculate the correct 1043 * end address that we return 1044 */ 1045 _nid = 0; 1046 1047 for (i = 0; i < num_mlgroups; i++) { 1048 struct mdesc_mlgroup *const m = &mlgroups[i]; 1049 1050 if ((pa_start & m->mask) == m->match) { 1051 m_match = m->match; 1052 m_mask = m->mask; 1053 break; 1054 } 1055 } 1056 1057 if (i == num_mlgroups) { 1058 WARN_ONCE(1, "memblock_nid_range: Can't find latency group addr[%Lx]", 1059 start); 1060 1061 ret_end = end; 1062 goto done; 1063 } 1064 } 1065 1066 /* 1067 * Each latency group has match and mask, and each memory block has an 1068 * offset. An address belongs to a latency group if its address matches 1069 * the following formula: ((addr + offset) & mask) == match 1070 * It is, however, slow to check every single page if it matches a 1071 * particular latency group. As optimization we calculate end value by 1072 * using bit arithmetics. 1073 */ 1074 m_end = m_match + (1ul << __ffs(m_mask)) - mblock->offset; 1075 m_end += pa_start & ~((1ul << fls64(m_mask)) - 1); 1076 ret_end = m_end > end ? end : m_end; 1077 1078 done: 1079 *nid = _nid; 1080 return ret_end; 1081 } 1082 #endif 1083 1084 /* This must be invoked after performing all of the necessary 1085 * memblock_set_node() calls for 'nid'. We need to be able to get 1086 * correct data from get_pfn_range_for_nid(). 
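 * (add_node_ranges() therefore runs before allocate_node_data() in both
 * numa_parse_mdesc() and numa_parse_jbus() below.)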
1087 */ 1088 static void __init allocate_node_data(int nid) 1089 { 1090 struct pglist_data *p; 1091 unsigned long start_pfn, end_pfn; 1092 #ifdef CONFIG_NEED_MULTIPLE_NODES 1093 1094 NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data), 1095 SMP_CACHE_BYTES, nid); 1096 if (!NODE_DATA(nid)) { 1097 prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid); 1098 prom_halt(); 1099 } 1100 1101 NODE_DATA(nid)->node_id = nid; 1102 #endif 1103 1104 p = NODE_DATA(nid); 1105 1106 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 1107 p->node_start_pfn = start_pfn; 1108 p->node_spanned_pages = end_pfn - start_pfn; 1109 } 1110 1111 static void init_node_masks_nonnuma(void) 1112 { 1113 #ifdef CONFIG_NEED_MULTIPLE_NODES 1114 int i; 1115 #endif 1116 1117 numadbg("Initializing tables for non-numa.\n"); 1118 1119 node_masks[0].mask = 0; 1120 node_masks[0].match = 0; 1121 num_node_masks = 1; 1122 1123 #ifdef CONFIG_NEED_MULTIPLE_NODES 1124 for (i = 0; i < NR_CPUS; i++) 1125 numa_cpu_lookup_table[i] = 0; 1126 1127 cpumask_setall(&numa_cpumask_lookup_table[0]); 1128 #endif 1129 } 1130 1131 #ifdef CONFIG_NEED_MULTIPLE_NODES 1132 struct pglist_data *node_data[MAX_NUMNODES]; 1133 1134 EXPORT_SYMBOL(numa_cpu_lookup_table); 1135 EXPORT_SYMBOL(numa_cpumask_lookup_table); 1136 EXPORT_SYMBOL(node_data); 1137 1138 static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio, 1139 u32 cfg_handle) 1140 { 1141 u64 arc; 1142 1143 mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) { 1144 u64 target = mdesc_arc_target(md, arc); 1145 const u64 *val; 1146 1147 val = mdesc_get_property(md, target, 1148 "cfg-handle", NULL); 1149 if (val && *val == cfg_handle) 1150 return 0; 1151 } 1152 return -ENODEV; 1153 } 1154 1155 static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp, 1156 u32 cfg_handle) 1157 { 1158 u64 arc, candidate, best_latency = ~(u64)0; 1159 1160 candidate = MDESC_NODE_NULL; 1161 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1162 u64 target = mdesc_arc_target(md, arc); 1163 const char *name = mdesc_node_name(md, target); 1164 const u64 *val; 1165 1166 if (strcmp(name, "pio-latency-group")) 1167 continue; 1168 1169 val = mdesc_get_property(md, target, "latency", NULL); 1170 if (!val) 1171 continue; 1172 1173 if (*val < best_latency) { 1174 candidate = target; 1175 best_latency = *val; 1176 } 1177 } 1178 1179 if (candidate == MDESC_NODE_NULL) 1180 return -ENODEV; 1181 1182 return scan_pio_for_cfg_handle(md, candidate, cfg_handle); 1183 } 1184 1185 int of_node_to_nid(struct device_node *dp) 1186 { 1187 const struct linux_prom64_registers *regs; 1188 struct mdesc_handle *md; 1189 u32 cfg_handle; 1190 int count, nid; 1191 u64 grp; 1192 1193 /* This is the right thing to do on currently supported 1194 * SUN4U NUMA platforms as well, as the PCI controller does 1195 * not sit behind any particular memory controller. 
1196 */ 1197 if (!mlgroups) 1198 return -1; 1199 1200 regs = of_get_property(dp, "reg", NULL); 1201 if (!regs) 1202 return -1; 1203 1204 cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff; 1205 1206 md = mdesc_grab(); 1207 1208 count = 0; 1209 nid = NUMA_NO_NODE; 1210 mdesc_for_each_node_by_name(md, grp, "group") { 1211 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { 1212 nid = count; 1213 break; 1214 } 1215 count++; 1216 } 1217 1218 mdesc_release(md); 1219 1220 return nid; 1221 } 1222 1223 static void __init add_node_ranges(void) 1224 { 1225 struct memblock_region *reg; 1226 unsigned long prev_max; 1227 1228 memblock_resized: 1229 prev_max = memblock.memory.max; 1230 1231 for_each_memblock(memory, reg) { 1232 unsigned long size = reg->size; 1233 unsigned long start, end; 1234 1235 start = reg->base; 1236 end = start + size; 1237 while (start < end) { 1238 unsigned long this_end; 1239 int nid; 1240 1241 this_end = memblock_nid_range(start, end, &nid); 1242 1243 numadbg("Setting memblock NUMA node nid[%d] " 1244 "start[%lx] end[%lx]\n", 1245 nid, start, this_end); 1246 1247 memblock_set_node(start, this_end - start, 1248 &memblock.memory, nid); 1249 if (memblock.memory.max != prev_max) 1250 goto memblock_resized; 1251 start = this_end; 1252 } 1253 } 1254 } 1255 1256 static int __init grab_mlgroups(struct mdesc_handle *md) 1257 { 1258 unsigned long paddr; 1259 int count = 0; 1260 u64 node; 1261 1262 mdesc_for_each_node_by_name(md, node, "memory-latency-group") 1263 count++; 1264 if (!count) 1265 return -ENOENT; 1266 1267 paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mlgroup), 1268 SMP_CACHE_BYTES); 1269 if (!paddr) 1270 return -ENOMEM; 1271 1272 mlgroups = __va(paddr); 1273 num_mlgroups = count; 1274 1275 count = 0; 1276 mdesc_for_each_node_by_name(md, node, "memory-latency-group") { 1277 struct mdesc_mlgroup *m = &mlgroups[count++]; 1278 const u64 *val; 1279 1280 m->node = node; 1281 1282 val = mdesc_get_property(md, node, "latency", NULL); 1283 m->latency = *val; 1284 val = mdesc_get_property(md, node, "address-match", NULL); 1285 m->match = *val; 1286 val = mdesc_get_property(md, node, "address-mask", NULL); 1287 m->mask = *val; 1288 1289 numadbg("MLGROUP[%d]: node[%llx] latency[%llx] " 1290 "match[%llx] mask[%llx]\n", 1291 count - 1, m->node, m->latency, m->match, m->mask); 1292 } 1293 1294 return 0; 1295 } 1296 1297 static int __init grab_mblocks(struct mdesc_handle *md) 1298 { 1299 unsigned long paddr; 1300 int count = 0; 1301 u64 node; 1302 1303 mdesc_for_each_node_by_name(md, node, "mblock") 1304 count++; 1305 if (!count) 1306 return -ENOENT; 1307 1308 paddr = memblock_phys_alloc(count * sizeof(struct mdesc_mblock), 1309 SMP_CACHE_BYTES); 1310 if (!paddr) 1311 return -ENOMEM; 1312 1313 mblocks = __va(paddr); 1314 num_mblocks = count; 1315 1316 count = 0; 1317 mdesc_for_each_node_by_name(md, node, "mblock") { 1318 struct mdesc_mblock *m = &mblocks[count++]; 1319 const u64 *val; 1320 1321 val = mdesc_get_property(md, node, "base", NULL); 1322 m->base = *val; 1323 val = mdesc_get_property(md, node, "size", NULL); 1324 m->size = *val; 1325 val = mdesc_get_property(md, node, 1326 "address-congruence-offset", NULL); 1327 1328 /* The address-congruence-offset property is optional. 1329 * Explicity zero it be identifty this. 
1330 */ 1331 if (val) 1332 m->offset = *val; 1333 else 1334 m->offset = 0UL; 1335 1336 numadbg("MBLOCK[%d]: base[%llx] size[%llx] offset[%llx]\n", 1337 count - 1, m->base, m->size, m->offset); 1338 } 1339 1340 return 0; 1341 } 1342 1343 static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md, 1344 u64 grp, cpumask_t *mask) 1345 { 1346 u64 arc; 1347 1348 cpumask_clear(mask); 1349 1350 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) { 1351 u64 target = mdesc_arc_target(md, arc); 1352 const char *name = mdesc_node_name(md, target); 1353 const u64 *id; 1354 1355 if (strcmp(name, "cpu")) 1356 continue; 1357 id = mdesc_get_property(md, target, "id", NULL); 1358 if (*id < nr_cpu_ids) 1359 cpumask_set_cpu(*id, mask); 1360 } 1361 } 1362 1363 static struct mdesc_mlgroup * __init find_mlgroup(u64 node) 1364 { 1365 int i; 1366 1367 for (i = 0; i < num_mlgroups; i++) { 1368 struct mdesc_mlgroup *m = &mlgroups[i]; 1369 if (m->node == node) 1370 return m; 1371 } 1372 return NULL; 1373 } 1374 1375 int __node_distance(int from, int to) 1376 { 1377 if ((from >= MAX_NUMNODES) || (to >= MAX_NUMNODES)) { 1378 pr_warn("Returning default NUMA distance value for %d->%d\n", 1379 from, to); 1380 return (from == to) ? LOCAL_DISTANCE : REMOTE_DISTANCE; 1381 } 1382 return numa_latency[from][to]; 1383 } 1384 EXPORT_SYMBOL(__node_distance); 1385 1386 static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp) 1387 { 1388 int i; 1389 1390 for (i = 0; i < MAX_NUMNODES; i++) { 1391 struct node_mem_mask *n = &node_masks[i]; 1392 1393 if ((grp->mask == n->mask) && (grp->match == n->match)) 1394 break; 1395 } 1396 return i; 1397 } 1398 1399 static void __init find_numa_latencies_for_group(struct mdesc_handle *md, 1400 u64 grp, int index) 1401 { 1402 u64 arc; 1403 1404 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1405 int tnode; 1406 u64 target = mdesc_arc_target(md, arc); 1407 struct mdesc_mlgroup *m = find_mlgroup(target); 1408 1409 if (!m) 1410 continue; 1411 tnode = find_best_numa_node_for_mlgroup(m); 1412 if (tnode == MAX_NUMNODES) 1413 continue; 1414 numa_latency[index][tnode] = m->latency; 1415 } 1416 } 1417 1418 static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp, 1419 int index) 1420 { 1421 struct mdesc_mlgroup *candidate = NULL; 1422 u64 arc, best_latency = ~(u64)0; 1423 struct node_mem_mask *n; 1424 1425 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) { 1426 u64 target = mdesc_arc_target(md, arc); 1427 struct mdesc_mlgroup *m = find_mlgroup(target); 1428 if (!m) 1429 continue; 1430 if (m->latency < best_latency) { 1431 candidate = m; 1432 best_latency = m->latency; 1433 } 1434 } 1435 if (!candidate) 1436 return -ENOENT; 1437 1438 if (num_node_masks != index) { 1439 printk(KERN_ERR "Inconsistent NUMA state, " 1440 "index[%d] != num_node_masks[%d]\n", 1441 index, num_node_masks); 1442 return -EINVAL; 1443 } 1444 1445 n = &node_masks[num_node_masks++]; 1446 1447 n->mask = candidate->mask; 1448 n->match = candidate->match; 1449 1450 numadbg("NUMA NODE[%d]: mask[%lx] match[%lx] (latency[%llx])\n", 1451 index, n->mask, n->match, candidate->latency); 1452 1453 return 0; 1454 } 1455 1456 static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp, 1457 int index) 1458 { 1459 cpumask_t mask; 1460 int cpu; 1461 1462 numa_parse_mdesc_group_cpus(md, grp, &mask); 1463 1464 for_each_cpu(cpu, &mask) 1465 numa_cpu_lookup_table[cpu] = index; 1466 cpumask_copy(&numa_cpumask_lookup_table[index], &mask); 1467 1468 if (numa_debug) { 1469 
printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index); 1470 for_each_cpu(cpu, &mask) 1471 printk("%d ", cpu); 1472 printk("]\n"); 1473 } 1474 1475 return numa_attach_mlgroup(md, grp, index); 1476 } 1477 1478 static int __init numa_parse_mdesc(void) 1479 { 1480 struct mdesc_handle *md = mdesc_grab(); 1481 int i, j, err, count; 1482 u64 node; 1483 1484 node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups"); 1485 if (node == MDESC_NODE_NULL) { 1486 mdesc_release(md); 1487 return -ENOENT; 1488 } 1489 1490 err = grab_mblocks(md); 1491 if (err < 0) 1492 goto out; 1493 1494 err = grab_mlgroups(md); 1495 if (err < 0) 1496 goto out; 1497 1498 count = 0; 1499 mdesc_for_each_node_by_name(md, node, "group") { 1500 err = numa_parse_mdesc_group(md, node, count); 1501 if (err < 0) 1502 break; 1503 count++; 1504 } 1505 1506 count = 0; 1507 mdesc_for_each_node_by_name(md, node, "group") { 1508 find_numa_latencies_for_group(md, node, count); 1509 count++; 1510 } 1511 1512 /* Normalize numa latency matrix according to ACPI SLIT spec. */ 1513 for (i = 0; i < MAX_NUMNODES; i++) { 1514 u64 self_latency = numa_latency[i][i]; 1515 1516 for (j = 0; j < MAX_NUMNODES; j++) { 1517 numa_latency[i][j] = 1518 (numa_latency[i][j] * LOCAL_DISTANCE) / 1519 self_latency; 1520 } 1521 } 1522 1523 add_node_ranges(); 1524 1525 for (i = 0; i < num_node_masks; i++) { 1526 allocate_node_data(i); 1527 node_set_online(i); 1528 } 1529 1530 err = 0; 1531 out: 1532 mdesc_release(md); 1533 return err; 1534 } 1535 1536 static int __init numa_parse_jbus(void) 1537 { 1538 unsigned long cpu, index; 1539 1540 /* NUMA node id is encoded in bits 36 and higher, and there is 1541 * a 1-to-1 mapping from CPU ID to NUMA node ID. 1542 */ 1543 index = 0; 1544 for_each_present_cpu(cpu) { 1545 numa_cpu_lookup_table[cpu] = index; 1546 cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu)); 1547 node_masks[index].mask = ~((1UL << 36UL) - 1UL); 1548 node_masks[index].match = cpu << 36UL; 1549 1550 index++; 1551 } 1552 num_node_masks = index; 1553 1554 add_node_ranges(); 1555 1556 for (index = 0; index < num_node_masks; index++) { 1557 allocate_node_data(index); 1558 node_set_online(index); 1559 } 1560 1561 return 0; 1562 } 1563 1564 static int __init numa_parse_sun4u(void) 1565 { 1566 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 1567 unsigned long ver; 1568 1569 __asm__ ("rdpr %%ver, %0" : "=r" (ver)); 1570 if ((ver >> 32UL) == __JALAPENO_ID || 1571 (ver >> 32UL) == __SERRANO_ID) 1572 return numa_parse_jbus(); 1573 } 1574 return -1; 1575 } 1576 1577 static int __init bootmem_init_numa(void) 1578 { 1579 int i, j; 1580 int err = -1; 1581 1582 numadbg("bootmem_init_numa()\n"); 1583 1584 /* Some sane defaults for numa latency values */ 1585 for (i = 0; i < MAX_NUMNODES; i++) { 1586 for (j = 0; j < MAX_NUMNODES; j++) 1587 numa_latency[i][j] = (i == j) ? 
1588 LOCAL_DISTANCE : REMOTE_DISTANCE; 1589 } 1590 1591 if (numa_enabled) { 1592 if (tlb_type == hypervisor) 1593 err = numa_parse_mdesc(); 1594 else 1595 err = numa_parse_sun4u(); 1596 } 1597 return err; 1598 } 1599 1600 #else 1601 1602 static int bootmem_init_numa(void) 1603 { 1604 return -1; 1605 } 1606 1607 #endif 1608 1609 static void __init bootmem_init_nonnuma(void) 1610 { 1611 unsigned long top_of_ram = memblock_end_of_DRAM(); 1612 unsigned long total_ram = memblock_phys_mem_size(); 1613 1614 numadbg("bootmem_init_nonnuma()\n"); 1615 1616 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", 1617 top_of_ram, total_ram); 1618 printk(KERN_INFO "Memory hole size: %ldMB\n", 1619 (top_of_ram - total_ram) >> 20); 1620 1621 init_node_masks_nonnuma(); 1622 memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); 1623 allocate_node_data(0); 1624 node_set_online(0); 1625 } 1626 1627 static unsigned long __init bootmem_init(unsigned long phys_base) 1628 { 1629 unsigned long end_pfn; 1630 1631 end_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1632 max_pfn = max_low_pfn = end_pfn; 1633 min_low_pfn = (phys_base >> PAGE_SHIFT); 1634 1635 if (bootmem_init_numa() < 0) 1636 bootmem_init_nonnuma(); 1637 1638 /* Dump memblock with node info. */ 1639 memblock_dump_all(); 1640 1641 /* XXX cpu notifier XXX */ 1642 1643 sparse_memory_present_with_active_regions(MAX_NUMNODES); 1644 sparse_init(); 1645 1646 return end_pfn; 1647 } 1648 1649 static struct linux_prom64_registers pall[MAX_BANKS] __initdata; 1650 static int pall_ents __initdata; 1651 1652 static unsigned long max_phys_bits = 40; 1653 1654 bool kern_addr_valid(unsigned long addr) 1655 { 1656 pgd_t *pgd; 1657 p4d_t *p4d; 1658 pud_t *pud; 1659 pmd_t *pmd; 1660 pte_t *pte; 1661 1662 if ((long)addr < 0L) { 1663 unsigned long pa = __pa(addr); 1664 1665 if ((pa >> max_phys_bits) != 0UL) 1666 return false; 1667 1668 return pfn_valid(pa >> PAGE_SHIFT); 1669 } 1670 1671 if (addr >= (unsigned long) KERNBASE && 1672 addr < (unsigned long)&_end) 1673 return true; 1674 1675 pgd = pgd_offset_k(addr); 1676 if (pgd_none(*pgd)) 1677 return 0; 1678 1679 p4d = p4d_offset(pgd, addr); 1680 if (p4d_none(*p4d)) 1681 return 0; 1682 1683 pud = pud_offset(p4d, addr); 1684 if (pud_none(*pud)) 1685 return 0; 1686 1687 if (pud_large(*pud)) 1688 return pfn_valid(pud_pfn(*pud)); 1689 1690 pmd = pmd_offset(pud, addr); 1691 if (pmd_none(*pmd)) 1692 return 0; 1693 1694 if (pmd_large(*pmd)) 1695 return pfn_valid(pmd_pfn(*pmd)); 1696 1697 pte = pte_offset_kernel(pmd, addr); 1698 if (pte_none(*pte)) 1699 return 0; 1700 1701 return pfn_valid(pte_pfn(*pte)); 1702 } 1703 EXPORT_SYMBOL(kern_addr_valid); 1704 1705 static unsigned long __ref kernel_map_hugepud(unsigned long vstart, 1706 unsigned long vend, 1707 pud_t *pud) 1708 { 1709 const unsigned long mask16gb = (1UL << 34) - 1UL; 1710 u64 pte_val = vstart; 1711 1712 /* Each PUD is 8GB */ 1713 if ((vstart & mask16gb) || 1714 (vend - vstart <= mask16gb)) { 1715 pte_val ^= kern_linear_pte_xor[2]; 1716 pud_val(*pud) = pte_val | _PAGE_PUD_HUGE; 1717 1718 return vstart + PUD_SIZE; 1719 } 1720 1721 pte_val ^= kern_linear_pte_xor[3]; 1722 pte_val |= _PAGE_PUD_HUGE; 1723 1724 vend = vstart + mask16gb + 1UL; 1725 while (vstart < vend) { 1726 pud_val(*pud) = pte_val; 1727 1728 pte_val += PUD_SIZE; 1729 vstart += PUD_SIZE; 1730 pud++; 1731 } 1732 return vstart; 1733 } 1734 1735 static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend, 1736 bool guard) 1737 { 1738 if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= 
PUD_SIZE) 1739 return true; 1740 1741 return false; 1742 } 1743 1744 static unsigned long __ref kernel_map_hugepmd(unsigned long vstart, 1745 unsigned long vend, 1746 pmd_t *pmd) 1747 { 1748 const unsigned long mask256mb = (1UL << 28) - 1UL; 1749 const unsigned long mask2gb = (1UL << 31) - 1UL; 1750 u64 pte_val = vstart; 1751 1752 /* Each PMD is 8MB */ 1753 if ((vstart & mask256mb) || 1754 (vend - vstart <= mask256mb)) { 1755 pte_val ^= kern_linear_pte_xor[0]; 1756 pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE; 1757 1758 return vstart + PMD_SIZE; 1759 } 1760 1761 if ((vstart & mask2gb) || 1762 (vend - vstart <= mask2gb)) { 1763 pte_val ^= kern_linear_pte_xor[1]; 1764 pte_val |= _PAGE_PMD_HUGE; 1765 vend = vstart + mask256mb + 1UL; 1766 } else { 1767 pte_val ^= kern_linear_pte_xor[2]; 1768 pte_val |= _PAGE_PMD_HUGE; 1769 vend = vstart + mask2gb + 1UL; 1770 } 1771 1772 while (vstart < vend) { 1773 pmd_val(*pmd) = pte_val; 1774 1775 pte_val += PMD_SIZE; 1776 vstart += PMD_SIZE; 1777 pmd++; 1778 } 1779 1780 return vstart; 1781 } 1782 1783 static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend, 1784 bool guard) 1785 { 1786 if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE) 1787 return true; 1788 1789 return false; 1790 } 1791 1792 static unsigned long __ref kernel_map_range(unsigned long pstart, 1793 unsigned long pend, pgprot_t prot, 1794 bool use_huge) 1795 { 1796 unsigned long vstart = PAGE_OFFSET + pstart; 1797 unsigned long vend = PAGE_OFFSET + pend; 1798 unsigned long alloc_bytes = 0UL; 1799 1800 if ((vstart & ~PAGE_MASK) || (vend & ~PAGE_MASK)) { 1801 prom_printf("kernel_map: Unaligned physmem[%lx:%lx]\n", 1802 vstart, vend); 1803 prom_halt(); 1804 } 1805 1806 while (vstart < vend) { 1807 unsigned long this_end, paddr = __pa(vstart); 1808 pgd_t *pgd = pgd_offset_k(vstart); 1809 p4d_t *p4d; 1810 pud_t *pud; 1811 pmd_t *pmd; 1812 pte_t *pte; 1813 1814 if (pgd_none(*pgd)) { 1815 pud_t *new; 1816 1817 new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, 1818 PAGE_SIZE); 1819 if (!new) 1820 goto err_alloc; 1821 alloc_bytes += PAGE_SIZE; 1822 pgd_populate(&init_mm, pgd, new); 1823 } 1824 1825 p4d = p4d_offset(pgd, vstart); 1826 if (p4d_none(*p4d)) { 1827 pud_t *new; 1828 1829 new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, 1830 PAGE_SIZE); 1831 if (!new) 1832 goto err_alloc; 1833 alloc_bytes += PAGE_SIZE; 1834 p4d_populate(&init_mm, p4d, new); 1835 } 1836 1837 pud = pud_offset(p4d, vstart); 1838 if (pud_none(*pud)) { 1839 pmd_t *new; 1840 1841 if (kernel_can_map_hugepud(vstart, vend, use_huge)) { 1842 vstart = kernel_map_hugepud(vstart, vend, pud); 1843 continue; 1844 } 1845 new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, 1846 PAGE_SIZE); 1847 if (!new) 1848 goto err_alloc; 1849 alloc_bytes += PAGE_SIZE; 1850 pud_populate(&init_mm, pud, new); 1851 } 1852 1853 pmd = pmd_offset(pud, vstart); 1854 if (pmd_none(*pmd)) { 1855 pte_t *new; 1856 1857 if (kernel_can_map_hugepmd(vstart, vend, use_huge)) { 1858 vstart = kernel_map_hugepmd(vstart, vend, pmd); 1859 continue; 1860 } 1861 new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, 1862 PAGE_SIZE); 1863 if (!new) 1864 goto err_alloc; 1865 alloc_bytes += PAGE_SIZE; 1866 pmd_populate_kernel(&init_mm, pmd, new); 1867 } 1868 1869 pte = pte_offset_kernel(pmd, vstart); 1870 this_end = (vstart + PMD_SIZE) & PMD_MASK; 1871 if (this_end > vend) 1872 this_end = vend; 1873 1874 while (vstart < this_end) { 1875 pte_val(*pte) = (paddr | pgprot_val(prot)); 1876 1877 vstart += PAGE_SIZE; 1878 paddr += PAGE_SIZE; 1879 pte++; 1880 } 1881 } 1882 
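	/* Hand back how many bytes of new page-table pages had to be
	 * allocated while mapping this physical range.
	 */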
1883 return alloc_bytes; 1884 1885 err_alloc: 1886 panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", 1887 __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); 1888 return -ENOMEM; 1889 } 1890 1891 static void __init flush_all_kernel_tsbs(void) 1892 { 1893 int i; 1894 1895 for (i = 0; i < KERNEL_TSB_NENTRIES; i++) { 1896 struct tsb *ent = &swapper_tsb[i]; 1897 1898 ent->tag = (1UL << TSB_TAG_INVALID_BIT); 1899 } 1900 #ifndef CONFIG_DEBUG_PAGEALLOC 1901 for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) { 1902 struct tsb *ent = &swapper_4m_tsb[i]; 1903 1904 ent->tag = (1UL << TSB_TAG_INVALID_BIT); 1905 } 1906 #endif 1907 } 1908 1909 extern unsigned int kvmap_linear_patch[1]; 1910 1911 static void __init kernel_physical_mapping_init(void) 1912 { 1913 unsigned long i, mem_alloced = 0UL; 1914 bool use_huge = true; 1915 1916 #ifdef CONFIG_DEBUG_PAGEALLOC 1917 use_huge = false; 1918 #endif 1919 for (i = 0; i < pall_ents; i++) { 1920 unsigned long phys_start, phys_end; 1921 1922 phys_start = pall[i].phys_addr; 1923 phys_end = phys_start + pall[i].reg_size; 1924 1925 mem_alloced += kernel_map_range(phys_start, phys_end, 1926 PAGE_KERNEL, use_huge); 1927 } 1928 1929 printk("Allocated %ld bytes for kernel page tables.\n", 1930 mem_alloced); 1931 1932 kvmap_linear_patch[0] = 0x01000000; /* nop */ 1933 flushi(&kvmap_linear_patch[0]); 1934 1935 flush_all_kernel_tsbs(); 1936 1937 __flush_tlb_all(); 1938 } 1939 1940 #ifdef CONFIG_DEBUG_PAGEALLOC 1941 void __kernel_map_pages(struct page *page, int numpages, int enable) 1942 { 1943 unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT; 1944 unsigned long phys_end = phys_start + (numpages * PAGE_SIZE); 1945 1946 kernel_map_range(phys_start, phys_end, 1947 (enable ? PAGE_KERNEL : __pgprot(0)), false); 1948 1949 flush_tsb_kernel_range(PAGE_OFFSET + phys_start, 1950 PAGE_OFFSET + phys_end); 1951 1952 /* we should perform an IPI and flush all tlbs, 1953 * but that can deadlock->flush only current cpu. 1954 */ 1955 __flush_tlb_kernel_range(PAGE_OFFSET + phys_start, 1956 PAGE_OFFSET + phys_end); 1957 } 1958 #endif 1959 1960 unsigned long __init find_ecache_flush_span(unsigned long size) 1961 { 1962 int i; 1963 1964 for (i = 0; i < pavail_ents; i++) { 1965 if (pavail[i].reg_size >= size) 1966 return pavail[i].phys_addr; 1967 } 1968 1969 return ~0UL; 1970 } 1971 1972 unsigned long PAGE_OFFSET; 1973 EXPORT_SYMBOL(PAGE_OFFSET); 1974 1975 unsigned long VMALLOC_END = 0x0000010000000000UL; 1976 EXPORT_SYMBOL(VMALLOC_END); 1977 1978 unsigned long sparc64_va_hole_top = 0xfffff80000000000UL; 1979 unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL; 1980 1981 static void __init setup_page_offset(void) 1982 { 1983 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 1984 /* Cheetah/Panther support a full 64-bit virtual 1985 * address, so we can use all that our page tables 1986 * support. 1987 */ 1988 sparc64_va_hole_top = 0xfff0000000000000UL; 1989 sparc64_va_hole_bottom = 0x0010000000000000UL; 1990 1991 max_phys_bits = 42; 1992 } else if (tlb_type == hypervisor) { 1993 switch (sun4v_chip_type) { 1994 case SUN4V_CHIP_NIAGARA1: 1995 case SUN4V_CHIP_NIAGARA2: 1996 /* T1 and T2 support 48-bit virtual addresses. */ 1997 sparc64_va_hole_top = 0xffff800000000000UL; 1998 sparc64_va_hole_bottom = 0x0000800000000000UL; 1999 2000 max_phys_bits = 39; 2001 break; 2002 case SUN4V_CHIP_NIAGARA3: 2003 /* T3 supports 48-bit virtual addresses. 
*/ 2004 sparc64_va_hole_top = 0xffff800000000000UL; 2005 sparc64_va_hole_bottom = 0x0000800000000000UL; 2006 2007 max_phys_bits = 43; 2008 break; 2009 case SUN4V_CHIP_NIAGARA4: 2010 case SUN4V_CHIP_NIAGARA5: 2011 case SUN4V_CHIP_SPARC64X: 2012 case SUN4V_CHIP_SPARC_M6: 2013 /* T4 and later support 52-bit virtual addresses. */ 2014 sparc64_va_hole_top = 0xfff8000000000000UL; 2015 sparc64_va_hole_bottom = 0x0008000000000000UL; 2016 max_phys_bits = 47; 2017 break; 2018 case SUN4V_CHIP_SPARC_M7: 2019 case SUN4V_CHIP_SPARC_SN: 2020 /* M7 and later support 52-bit virtual addresses. */ 2021 sparc64_va_hole_top = 0xfff8000000000000UL; 2022 sparc64_va_hole_bottom = 0x0008000000000000UL; 2023 max_phys_bits = 49; 2024 break; 2025 case SUN4V_CHIP_SPARC_M8: 2026 default: 2027 /* M8 and later support 54-bit virtual addresses. 2028 * However, restricting M8 and above VA bits to 53 2029 * as 4-level page table cannot support more than 2030 * 53 VA bits. 2031 */ 2032 sparc64_va_hole_top = 0xfff0000000000000UL; 2033 sparc64_va_hole_bottom = 0x0010000000000000UL; 2034 max_phys_bits = 51; 2035 break; 2036 } 2037 } 2038 2039 if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) { 2040 prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n", 2041 max_phys_bits); 2042 prom_halt(); 2043 } 2044 2045 PAGE_OFFSET = sparc64_va_hole_top; 2046 VMALLOC_END = ((sparc64_va_hole_bottom >> 1) + 2047 (sparc64_va_hole_bottom >> 2)); 2048 2049 pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n", 2050 PAGE_OFFSET, max_phys_bits); 2051 pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n", 2052 VMALLOC_START, VMALLOC_END); 2053 pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n", 2054 VMEMMAP_BASE, VMEMMAP_BASE << 1); 2055 } 2056 2057 static void __init tsb_phys_patch(void) 2058 { 2059 struct tsb_ldquad_phys_patch_entry *pquad; 2060 struct tsb_phys_patch_entry *p; 2061 2062 pquad = &__tsb_ldquad_phys_patch; 2063 while (pquad < &__tsb_ldquad_phys_patch_end) { 2064 unsigned long addr = pquad->addr; 2065 2066 if (tlb_type == hypervisor) 2067 *(unsigned int *) addr = pquad->sun4v_insn; 2068 else 2069 *(unsigned int *) addr = pquad->sun4u_insn; 2070 wmb(); 2071 __asm__ __volatile__("flush %0" 2072 : /* no outputs */ 2073 : "r" (addr)); 2074 2075 pquad++; 2076 } 2077 2078 p = &__tsb_phys_patch; 2079 while (p < &__tsb_phys_patch_end) { 2080 unsigned long addr = p->addr; 2081 2082 *(unsigned int *) addr = p->insn; 2083 wmb(); 2084 __asm__ __volatile__("flush %0" 2085 : /* no outputs */ 2086 : "r" (addr)); 2087 2088 p++; 2089 } 2090 } 2091 2092 /* Don't mark as init, we give this to the Hypervisor. */ 2093 #ifndef CONFIG_DEBUG_PAGEALLOC 2094 #define NUM_KTSB_DESCR 2 2095 #else 2096 #define NUM_KTSB_DESCR 1 2097 #endif 2098 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR]; 2099 2100 /* The swapper TSBs are loaded with a base sequence of: 2101 * 2102 * sethi %uhi(SYMBOL), REG1 2103 * sethi %hi(SYMBOL), REG2 2104 * or REG1, %ulo(SYMBOL), REG1 2105 * or REG2, %lo(SYMBOL), REG2 2106 * sllx REG1, 32, REG1 2107 * or REG1, REG2, REG1 2108 * 2109 * When we use physical addressing for the TSB accesses, we patch the 2110 * first four instructions in the above sequence. 
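 *
 * patch_one_ktsb_phys() below performs that rewrite: the TSB's physical
 * address is split into 32-bit halves, each sethi immediate receives the
 * upper 22 bits of its half (half >> 10), and each 'or' immediate the
 * low 10 bits (half & 0x3ff).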
2111 */ 2112 2113 static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa) 2114 { 2115 unsigned long high_bits, low_bits; 2116 2117 high_bits = (pa >> 32) & 0xffffffff; 2118 low_bits = (pa >> 0) & 0xffffffff; 2119 2120 while (start < end) { 2121 unsigned int *ia = (unsigned int *)(unsigned long)*start; 2122 2123 ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10); 2124 __asm__ __volatile__("flush %0" : : "r" (ia)); 2125 2126 ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10); 2127 __asm__ __volatile__("flush %0" : : "r" (ia + 1)); 2128 2129 ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff); 2130 __asm__ __volatile__("flush %0" : : "r" (ia + 2)); 2131 2132 ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff); 2133 __asm__ __volatile__("flush %0" : : "r" (ia + 3)); 2134 2135 start++; 2136 } 2137 } 2138 2139 static void ktsb_phys_patch(void) 2140 { 2141 extern unsigned int __swapper_tsb_phys_patch; 2142 extern unsigned int __swapper_tsb_phys_patch_end; 2143 unsigned long ktsb_pa; 2144 2145 ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE); 2146 patch_one_ktsb_phys(&__swapper_tsb_phys_patch, 2147 &__swapper_tsb_phys_patch_end, ktsb_pa); 2148 #ifndef CONFIG_DEBUG_PAGEALLOC 2149 { 2150 extern unsigned int __swapper_4m_tsb_phys_patch; 2151 extern unsigned int __swapper_4m_tsb_phys_patch_end; 2152 ktsb_pa = (kern_base + 2153 ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); 2154 patch_one_ktsb_phys(&__swapper_4m_tsb_phys_patch, 2155 &__swapper_4m_tsb_phys_patch_end, ktsb_pa); 2156 } 2157 #endif 2158 } 2159 2160 static void __init sun4v_ktsb_init(void) 2161 { 2162 unsigned long ktsb_pa; 2163 2164 /* First KTSB for PAGE_SIZE mappings. */ 2165 ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE); 2166 2167 switch (PAGE_SIZE) { 2168 case 8 * 1024: 2169 default: 2170 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_8K; 2171 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_8K; 2172 break; 2173 2174 case 64 * 1024: 2175 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_64K; 2176 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_64K; 2177 break; 2178 2179 case 512 * 1024: 2180 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_512K; 2181 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_512K; 2182 break; 2183 2184 case 4 * 1024 * 1024: 2185 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_4MB; 2186 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_4MB; 2187 break; 2188 } 2189 2190 ktsb_descr[0].assoc = 1; 2191 ktsb_descr[0].num_ttes = KERNEL_TSB_NENTRIES; 2192 ktsb_descr[0].ctx_idx = 0; 2193 ktsb_descr[0].tsb_base = ktsb_pa; 2194 ktsb_descr[0].resv = 0; 2195 2196 #ifndef CONFIG_DEBUG_PAGEALLOC 2197 /* Second KTSB for 4MB/256MB/2GB/16GB mappings. 
*/ 2198 ktsb_pa = (kern_base + 2199 ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); 2200 2201 ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB; 2202 ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB | 2203 HV_PGSZ_MASK_256MB | 2204 HV_PGSZ_MASK_2GB | 2205 HV_PGSZ_MASK_16GB) & 2206 cpu_pgsz_mask); 2207 ktsb_descr[1].assoc = 1; 2208 ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES; 2209 ktsb_descr[1].ctx_idx = 0; 2210 ktsb_descr[1].tsb_base = ktsb_pa; 2211 ktsb_descr[1].resv = 0; 2212 #endif 2213 } 2214 2215 void sun4v_ktsb_register(void) 2216 { 2217 unsigned long pa, ret; 2218 2219 pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE); 2220 2221 ret = sun4v_mmu_tsb_ctx0(NUM_KTSB_DESCR, pa); 2222 if (ret != 0) { 2223 prom_printf("hypervisor_mmu_tsb_ctx0[%lx]: " 2224 "errors with %lx\n", pa, ret); 2225 prom_halt(); 2226 } 2227 } 2228 2229 static void __init sun4u_linear_pte_xor_finalize(void) 2230 { 2231 #ifndef CONFIG_DEBUG_PAGEALLOC 2232 /* This is where we would add Panther support for 2233 * 32MB and 256MB pages. 2234 */ 2235 #endif 2236 } 2237 2238 static void __init sun4v_linear_pte_xor_finalize(void) 2239 { 2240 unsigned long pagecv_flag; 2241 2242 /* Bit 9 of TTE is no longer CV bit on M7 processor and it instead 2243 * enables MCD error. Do not set bit 9 on M7 processor. 2244 */ 2245 switch (sun4v_chip_type) { 2246 case SUN4V_CHIP_SPARC_M7: 2247 case SUN4V_CHIP_SPARC_M8: 2248 case SUN4V_CHIP_SPARC_SN: 2249 pagecv_flag = 0x00; 2250 break; 2251 default: 2252 pagecv_flag = _PAGE_CV_4V; 2253 break; 2254 } 2255 #ifndef CONFIG_DEBUG_PAGEALLOC 2256 if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) { 2257 kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ 2258 PAGE_OFFSET; 2259 kern_linear_pte_xor[1] |= (_PAGE_CP_4V | pagecv_flag | 2260 _PAGE_P_4V | _PAGE_W_4V); 2261 } else { 2262 kern_linear_pte_xor[1] = kern_linear_pte_xor[0]; 2263 } 2264 2265 if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) { 2266 kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^ 2267 PAGE_OFFSET; 2268 kern_linear_pte_xor[2] |= (_PAGE_CP_4V | pagecv_flag | 2269 _PAGE_P_4V | _PAGE_W_4V); 2270 } else { 2271 kern_linear_pte_xor[2] = kern_linear_pte_xor[1]; 2272 } 2273 2274 if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) { 2275 kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^ 2276 PAGE_OFFSET; 2277 kern_linear_pte_xor[3] |= (_PAGE_CP_4V | pagecv_flag | 2278 _PAGE_P_4V | _PAGE_W_4V); 2279 } else { 2280 kern_linear_pte_xor[3] = kern_linear_pte_xor[2]; 2281 } 2282 #endif 2283 } 2284 2285 /* paging_init() sets up the page tables */ 2286 2287 static unsigned long last_valid_pfn; 2288 2289 static void sun4u_pgprot_init(void); 2290 static void sun4v_pgprot_init(void); 2291 2292 #define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U) 2293 #define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V) 2294 #define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U) 2295 #define __DIRTY_BITS_4V (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V) 2296 #define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R) 2297 #define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R) 2298 2299 /* We need to exclude reserved regions. This exclusion will include 2300 * vmlinux and initrd. To be more precise the initrd size could be used to 2301 * compute a new lower limit because it is freed later during initialization. 
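 *
 * As a rough, purely illustrative example: with a 1GB limit from the
 * command line and ~16MB already sitting in reserved regions (kernel
 * image plus initrd), reduce_memory() below would pass roughly 1040MB
 * to memblock_enforce_memory_limit(), so close to a full 1GB of
 * ordinary memory survives once the reserved blocks are accounted for.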
2302 */ 2303 static void __init reduce_memory(phys_addr_t limit_ram) 2304 { 2305 limit_ram += memblock_reserved_size(); 2306 memblock_enforce_memory_limit(limit_ram); 2307 } 2308 2309 void __init paging_init(void) 2310 { 2311 unsigned long end_pfn, shift, phys_base; 2312 unsigned long real_end, i; 2313 2314 setup_page_offset(); 2315 2316 /* These build time checkes make sure that the dcache_dirty_cpu() 2317 * page->flags usage will work. 2318 * 2319 * When a page gets marked as dcache-dirty, we store the 2320 * cpu number starting at bit 32 in the page->flags. Also, 2321 * functions like clear_dcache_dirty_cpu use the cpu mask 2322 * in 13-bit signed-immediate instruction fields. 2323 */ 2324 2325 /* 2326 * Page flags must not reach into upper 32 bits that are used 2327 * for the cpu number 2328 */ 2329 BUILD_BUG_ON(NR_PAGEFLAGS > 32); 2330 2331 /* 2332 * The bit fields placed in the high range must not reach below 2333 * the 32 bit boundary. Otherwise we cannot place the cpu field 2334 * at the 32 bit boundary. 2335 */ 2336 BUILD_BUG_ON(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + 2337 ilog2(roundup_pow_of_two(NR_CPUS)) > 32); 2338 2339 BUILD_BUG_ON(NR_CPUS > 4096); 2340 2341 kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB; 2342 kern_size = (unsigned long)&_end - (unsigned long)KERNBASE; 2343 2344 /* Invalidate both kernel TSBs. */ 2345 memset(swapper_tsb, 0x40, sizeof(swapper_tsb)); 2346 #ifndef CONFIG_DEBUG_PAGEALLOC 2347 memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); 2348 #endif 2349 2350 /* TTE.cv bit on sparc v9 occupies the same position as TTE.mcde 2351 * bit on M7 processor. This is a conflicting usage of the same 2352 * bit. Enabling TTE.cv on M7 would turn on Memory Corruption 2353 * Detection error on all pages and this will lead to problems 2354 * later. Kernel does not run with MCD enabled and hence rest 2355 * of the required steps to fully configure memory corruption 2356 * detection are not taken. We need to ensure TTE.mcde is not 2357 * set on M7 processor. Compute the value of cacheability 2358 * flag for use later taking this into consideration. 2359 */ 2360 switch (sun4v_chip_type) { 2361 case SUN4V_CHIP_SPARC_M7: 2362 case SUN4V_CHIP_SPARC_M8: 2363 case SUN4V_CHIP_SPARC_SN: 2364 page_cache4v_flag = _PAGE_CP_4V; 2365 break; 2366 default: 2367 page_cache4v_flag = _PAGE_CACHE_4V; 2368 break; 2369 } 2370 2371 if (tlb_type == hypervisor) 2372 sun4v_pgprot_init(); 2373 else 2374 sun4u_pgprot_init(); 2375 2376 if (tlb_type == cheetah_plus || 2377 tlb_type == hypervisor) { 2378 tsb_phys_patch(); 2379 ktsb_phys_patch(); 2380 } 2381 2382 if (tlb_type == hypervisor) 2383 sun4v_patch_tlb_handlers(); 2384 2385 /* Find available physical memory... 2386 * 2387 * Read it twice in order to work around a bug in openfirmware. 2388 * The call to grab this table itself can cause openfirmware to 2389 * allocate memory, which in turn can take away some space from 2390 * the list of available memory. Reading it twice makes sure 2391 * we really do get the final value. 
2392 */ 2393 read_obp_translations(); 2394 read_obp_memory("reg", &pall[0], &pall_ents); 2395 read_obp_memory("available", &pavail[0], &pavail_ents); 2396 read_obp_memory("available", &pavail[0], &pavail_ents); 2397 2398 phys_base = 0xffffffffffffffffUL; 2399 for (i = 0; i < pavail_ents; i++) { 2400 phys_base = min(phys_base, pavail[i].phys_addr); 2401 memblock_add(pavail[i].phys_addr, pavail[i].reg_size); 2402 } 2403 2404 memblock_reserve(kern_base, kern_size); 2405 2406 find_ramdisk(phys_base); 2407 2408 if (cmdline_memory_size) 2409 reduce_memory(cmdline_memory_size); 2410 2411 memblock_allow_resize(); 2412 memblock_dump_all(); 2413 2414 set_bit(0, mmu_context_bmap); 2415 2416 shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE); 2417 2418 real_end = (unsigned long)_end; 2419 num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB); 2420 printk("Kernel: Using %d locked TLB entries for main kernel image.\n", 2421 num_kernel_image_mappings); 2422 2423 /* Set kernel pgd to upper alias so physical page computations 2424 * work. 2425 */ 2426 init_mm.pgd += ((shift) / (sizeof(pgd_t))); 2427 2428 memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); 2429 2430 inherit_prom_mappings(); 2431 2432 /* Ok, we can use our TLB miss and window trap handlers safely. */ 2433 setup_tba(); 2434 2435 __flush_tlb_all(); 2436 2437 prom_build_devicetree(); 2438 of_populate_present_mask(); 2439 #ifndef CONFIG_SMP 2440 of_fill_in_cpu_data(); 2441 #endif 2442 2443 if (tlb_type == hypervisor) { 2444 sun4v_mdesc_init(); 2445 mdesc_populate_present_mask(cpu_all_mask); 2446 #ifndef CONFIG_SMP 2447 mdesc_fill_in_cpu_data(cpu_all_mask); 2448 #endif 2449 mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask); 2450 2451 sun4v_linear_pte_xor_finalize(); 2452 2453 sun4v_ktsb_init(); 2454 sun4v_ktsb_register(); 2455 } else { 2456 unsigned long impl, ver; 2457 2458 cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K | 2459 HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB); 2460 2461 __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver)); 2462 impl = ((ver >> 32) & 0xffff); 2463 if (impl == PANTHER_IMPL) 2464 cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB | 2465 HV_PGSZ_MASK_256MB); 2466 2467 sun4u_linear_pte_xor_finalize(); 2468 } 2469 2470 /* Flush the TLBs and the 4M TSB so that the updated linear 2471 * pte XOR settings are realized for all mappings. 2472 */ 2473 __flush_tlb_all(); 2474 #ifndef CONFIG_DEBUG_PAGEALLOC 2475 memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); 2476 #endif 2477 __flush_tlb_all(); 2478 2479 /* Setup bootmem... 
*/ 2480 last_valid_pfn = end_pfn = bootmem_init(phys_base); 2481 2482 kernel_physical_mapping_init(); 2483 2484 { 2485 unsigned long max_zone_pfns[MAX_NR_ZONES]; 2486 2487 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 2488 2489 max_zone_pfns[ZONE_NORMAL] = end_pfn; 2490 2491 free_area_init_nodes(max_zone_pfns); 2492 } 2493 2494 printk("Booting Linux...\n"); 2495 } 2496 2497 int page_in_phys_avail(unsigned long paddr) 2498 { 2499 int i; 2500 2501 paddr &= PAGE_MASK; 2502 2503 for (i = 0; i < pavail_ents; i++) { 2504 unsigned long start, end; 2505 2506 start = pavail[i].phys_addr; 2507 end = start + pavail[i].reg_size; 2508 2509 if (paddr >= start && paddr < end) 2510 return 1; 2511 } 2512 if (paddr >= kern_base && paddr < (kern_base + kern_size)) 2513 return 1; 2514 #ifdef CONFIG_BLK_DEV_INITRD 2515 if (paddr >= __pa(initrd_start) && 2516 paddr < __pa(PAGE_ALIGN(initrd_end))) 2517 return 1; 2518 #endif 2519 2520 return 0; 2521 } 2522 2523 static void __init register_page_bootmem_info(void) 2524 { 2525 #ifdef CONFIG_NEED_MULTIPLE_NODES 2526 int i; 2527 2528 for_each_online_node(i) 2529 if (NODE_DATA(i)->node_spanned_pages) 2530 register_page_bootmem_info_node(NODE_DATA(i)); 2531 #endif 2532 } 2533 void __init mem_init(void) 2534 { 2535 high_memory = __va(last_valid_pfn << PAGE_SHIFT); 2536 2537 memblock_free_all(); 2538 2539 /* 2540 * Must be done after boot memory is put on freelist, because here we 2541 * might set fields in deferred struct pages that have not yet been 2542 * initialized, and memblock_free_all() initializes all the reserved 2543 * deferred pages for us. 2544 */ 2545 register_page_bootmem_info(); 2546 2547 /* 2548 * Set up the zero page, mark it reserved, so that page count 2549 * is not manipulated when freeing the page from user ptes. 2550 */ 2551 mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); 2552 if (mem_map_zero == NULL) { 2553 prom_printf("paging_init: Cannot alloc zero page.\n"); 2554 prom_halt(); 2555 } 2556 mark_page_reserved(mem_map_zero); 2557 2558 mem_init_print_info(NULL); 2559 2560 if (tlb_type == cheetah || tlb_type == cheetah_plus) 2561 cheetah_ecache_flush_init(); 2562 } 2563 2564 void free_initmem(void) 2565 { 2566 unsigned long addr, initend; 2567 int do_free = 1; 2568 2569 /* If the physical memory maps were trimmed by kernel command 2570 * line options, don't even try freeing this initmem stuff up. 2571 * The kernel image could have been in the trimmed out region 2572 * and if so the freeing below will free invalid page structs. 2573 */ 2574 if (cmdline_memory_size) 2575 do_free = 0; 2576 2577 /* 2578 * The init section is aligned to 8k in vmlinux.lds. Page align for >8k pagesizes. 
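	 * For example, with the usual 8K pages PAGE_ALIGN() leaves
	 * __init_begin untouched; were a larger page size in use, the
	 * loop below would skip any leading partial page so that only
	 * whole pages get poisoned and handed back to the allocator.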
2579 */ 2580 addr = PAGE_ALIGN((unsigned long)(__init_begin)); 2581 initend = (unsigned long)(__init_end) & PAGE_MASK; 2582 for (; addr < initend; addr += PAGE_SIZE) { 2583 unsigned long page; 2584 2585 page = (addr + 2586 ((unsigned long) __va(kern_base)) - 2587 ((unsigned long) KERNBASE)); 2588 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); 2589 2590 if (do_free) 2591 free_reserved_page(virt_to_page(page)); 2592 } 2593 } 2594 2595 pgprot_t PAGE_KERNEL __read_mostly; 2596 EXPORT_SYMBOL(PAGE_KERNEL); 2597 2598 pgprot_t PAGE_KERNEL_LOCKED __read_mostly; 2599 pgprot_t PAGE_COPY __read_mostly; 2600 2601 pgprot_t PAGE_SHARED __read_mostly; 2602 EXPORT_SYMBOL(PAGE_SHARED); 2603 2604 unsigned long pg_iobits __read_mostly; 2605 2606 unsigned long _PAGE_IE __read_mostly; 2607 EXPORT_SYMBOL(_PAGE_IE); 2608 2609 unsigned long _PAGE_E __read_mostly; 2610 EXPORT_SYMBOL(_PAGE_E); 2611 2612 unsigned long _PAGE_CACHE __read_mostly; 2613 EXPORT_SYMBOL(_PAGE_CACHE); 2614 2615 #ifdef CONFIG_SPARSEMEM_VMEMMAP 2616 int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, 2617 int node, struct vmem_altmap *altmap) 2618 { 2619 unsigned long pte_base; 2620 2621 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U | 2622 _PAGE_CP_4U | _PAGE_CV_4U | 2623 _PAGE_P_4U | _PAGE_W_4U); 2624 if (tlb_type == hypervisor) 2625 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V | 2626 page_cache4v_flag | _PAGE_P_4V | _PAGE_W_4V); 2627 2628 pte_base |= _PAGE_PMD_HUGE; 2629 2630 vstart = vstart & PMD_MASK; 2631 vend = ALIGN(vend, PMD_SIZE); 2632 for (; vstart < vend; vstart += PMD_SIZE) { 2633 pgd_t *pgd = vmemmap_pgd_populate(vstart, node); 2634 unsigned long pte; 2635 p4d_t *p4d; 2636 pud_t *pud; 2637 pmd_t *pmd; 2638 2639 if (!pgd) 2640 return -ENOMEM; 2641 2642 p4d = vmemmap_p4d_populate(pgd, vstart, node); 2643 if (!p4d) 2644 return -ENOMEM; 2645 2646 pud = vmemmap_pud_populate(p4d, vstart, node); 2647 if (!pud) 2648 return -ENOMEM; 2649 2650 pmd = pmd_offset(pud, vstart); 2651 pte = pmd_val(*pmd); 2652 if (!(pte & _PAGE_VALID)) { 2653 void *block = vmemmap_alloc_block(PMD_SIZE, node); 2654 2655 if (!block) 2656 return -ENOMEM; 2657 2658 pmd_val(*pmd) = pte_base | __pa(block); 2659 } 2660 } 2661 2662 return 0; 2663 } 2664 2665 void vmemmap_free(unsigned long start, unsigned long end, 2666 struct vmem_altmap *altmap) 2667 { 2668 } 2669 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 2670 2671 static void prot_init_common(unsigned long page_none, 2672 unsigned long page_shared, 2673 unsigned long page_copy, 2674 unsigned long page_readonly, 2675 unsigned long page_exec_bit) 2676 { 2677 PAGE_COPY = __pgprot(page_copy); 2678 PAGE_SHARED = __pgprot(page_shared); 2679 2680 protection_map[0x0] = __pgprot(page_none); 2681 protection_map[0x1] = __pgprot(page_readonly & ~page_exec_bit); 2682 protection_map[0x2] = __pgprot(page_copy & ~page_exec_bit); 2683 protection_map[0x3] = __pgprot(page_copy & ~page_exec_bit); 2684 protection_map[0x4] = __pgprot(page_readonly); 2685 protection_map[0x5] = __pgprot(page_readonly); 2686 protection_map[0x6] = __pgprot(page_copy); 2687 protection_map[0x7] = __pgprot(page_copy); 2688 protection_map[0x8] = __pgprot(page_none); 2689 protection_map[0x9] = __pgprot(page_readonly & ~page_exec_bit); 2690 protection_map[0xa] = __pgprot(page_shared & ~page_exec_bit); 2691 protection_map[0xb] = __pgprot(page_shared & ~page_exec_bit); 2692 protection_map[0xc] = __pgprot(page_readonly); 2693 protection_map[0xd] = __pgprot(page_readonly); 2694 protection_map[0xe] = __pgprot(page_shared); 2695 protection_map[0xf] = 
__pgprot(page_shared); 2696 } 2697 2698 static void __init sun4u_pgprot_init(void) 2699 { 2700 unsigned long page_none, page_shared, page_copy, page_readonly; 2701 unsigned long page_exec_bit; 2702 int i; 2703 2704 PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID | 2705 _PAGE_CACHE_4U | _PAGE_P_4U | 2706 __ACCESS_BITS_4U | __DIRTY_BITS_4U | 2707 _PAGE_EXEC_4U); 2708 PAGE_KERNEL_LOCKED = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID | 2709 _PAGE_CACHE_4U | _PAGE_P_4U | 2710 __ACCESS_BITS_4U | __DIRTY_BITS_4U | 2711 _PAGE_EXEC_4U | _PAGE_L_4U); 2712 2713 _PAGE_IE = _PAGE_IE_4U; 2714 _PAGE_E = _PAGE_E_4U; 2715 _PAGE_CACHE = _PAGE_CACHE_4U; 2716 2717 pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U | 2718 __ACCESS_BITS_4U | _PAGE_E_4U); 2719 2720 #ifdef CONFIG_DEBUG_PAGEALLOC 2721 kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; 2722 #else 2723 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ 2724 PAGE_OFFSET; 2725 #endif 2726 kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U | 2727 _PAGE_P_4U | _PAGE_W_4U); 2728 2729 for (i = 1; i < 4; i++) 2730 kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; 2731 2732 _PAGE_ALL_SZ_BITS = (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U | 2733 _PAGE_SZ64K_4U | _PAGE_SZ8K_4U | 2734 _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U); 2735 2736 2737 page_none = _PAGE_PRESENT_4U | _PAGE_ACCESSED_4U | _PAGE_CACHE_4U; 2738 page_shared = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | 2739 __ACCESS_BITS_4U | _PAGE_WRITE_4U | _PAGE_EXEC_4U); 2740 page_copy = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | 2741 __ACCESS_BITS_4U | _PAGE_EXEC_4U); 2742 page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U | 2743 __ACCESS_BITS_4U | _PAGE_EXEC_4U); 2744 2745 page_exec_bit = _PAGE_EXEC_4U; 2746 2747 prot_init_common(page_none, page_shared, page_copy, page_readonly, 2748 page_exec_bit); 2749 } 2750 2751 static void __init sun4v_pgprot_init(void) 2752 { 2753 unsigned long page_none, page_shared, page_copy, page_readonly; 2754 unsigned long page_exec_bit; 2755 int i; 2756 2757 PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID | 2758 page_cache4v_flag | _PAGE_P_4V | 2759 __ACCESS_BITS_4V | __DIRTY_BITS_4V | 2760 _PAGE_EXEC_4V); 2761 PAGE_KERNEL_LOCKED = PAGE_KERNEL; 2762 2763 _PAGE_IE = _PAGE_IE_4V; 2764 _PAGE_E = _PAGE_E_4V; 2765 _PAGE_CACHE = page_cache4v_flag; 2766 2767 #ifdef CONFIG_DEBUG_PAGEALLOC 2768 kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET; 2769 #else 2770 kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ 2771 PAGE_OFFSET; 2772 #endif 2773 kern_linear_pte_xor[0] |= (page_cache4v_flag | _PAGE_P_4V | 2774 _PAGE_W_4V); 2775 2776 for (i = 1; i < 4; i++) 2777 kern_linear_pte_xor[i] = kern_linear_pte_xor[0]; 2778 2779 pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V | 2780 __ACCESS_BITS_4V | _PAGE_E_4V); 2781 2782 _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V | 2783 _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V | 2784 _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V | 2785 _PAGE_SZ64K_4V | _PAGE_SZ8K_4V); 2786 2787 page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | page_cache4v_flag; 2788 page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | 2789 __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V); 2790 page_copy = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | 2791 __ACCESS_BITS_4V | _PAGE_EXEC_4V); 2792 page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag | 2793 __ACCESS_BITS_4V | _PAGE_EXEC_4V); 2794 2795 page_exec_bit = _PAGE_EXEC_4V; 2796 2797 prot_init_common(page_none, page_shared, page_copy, 
page_readonly, 2798 page_exec_bit); 2799 } 2800 2801 unsigned long pte_sz_bits(unsigned long sz) 2802 { 2803 if (tlb_type == hypervisor) { 2804 switch (sz) { 2805 case 8 * 1024: 2806 default: 2807 return _PAGE_SZ8K_4V; 2808 case 64 * 1024: 2809 return _PAGE_SZ64K_4V; 2810 case 512 * 1024: 2811 return _PAGE_SZ512K_4V; 2812 case 4 * 1024 * 1024: 2813 return _PAGE_SZ4MB_4V; 2814 } 2815 } else { 2816 switch (sz) { 2817 case 8 * 1024: 2818 default: 2819 return _PAGE_SZ8K_4U; 2820 case 64 * 1024: 2821 return _PAGE_SZ64K_4U; 2822 case 512 * 1024: 2823 return _PAGE_SZ512K_4U; 2824 case 4 * 1024 * 1024: 2825 return _PAGE_SZ4MB_4U; 2826 } 2827 } 2828 } 2829 2830 pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space, unsigned long page_size) 2831 { 2832 pte_t pte; 2833 2834 pte_val(pte) = page | pgprot_val(pgprot_noncached(prot)); 2835 pte_val(pte) |= (((unsigned long)space) << 32); 2836 pte_val(pte) |= pte_sz_bits(page_size); 2837 2838 return pte; 2839 } 2840 2841 static unsigned long kern_large_tte(unsigned long paddr) 2842 { 2843 unsigned long val; 2844 2845 val = (_PAGE_VALID | _PAGE_SZ4MB_4U | 2846 _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U | 2847 _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U); 2848 if (tlb_type == hypervisor) 2849 val = (_PAGE_VALID | _PAGE_SZ4MB_4V | 2850 page_cache4v_flag | _PAGE_P_4V | 2851 _PAGE_EXEC_4V | _PAGE_W_4V); 2852 2853 return val | paddr; 2854 } 2855 2856 /* If not locked, zap it. */ 2857 void __flush_tlb_all(void) 2858 { 2859 unsigned long pstate; 2860 int i; 2861 2862 __asm__ __volatile__("flushw\n\t" 2863 "rdpr %%pstate, %0\n\t" 2864 "wrpr %0, %1, %%pstate" 2865 : "=r" (pstate) 2866 : "i" (PSTATE_IE)); 2867 if (tlb_type == hypervisor) { 2868 sun4v_mmu_demap_all(); 2869 } else if (tlb_type == spitfire) { 2870 for (i = 0; i < 64; i++) { 2871 /* Spitfire Errata #32 workaround */ 2872 /* NOTE: Always runs on spitfire, so no 2873 * cheetah+ page size encodings. 2874 */ 2875 __asm__ __volatile__("stxa %0, [%1] %2\n\t" 2876 "flush %%g6" 2877 : /* No outputs */ 2878 : "r" (0), 2879 "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); 2880 2881 if (!(spitfire_get_dtlb_data(i) & _PAGE_L_4U)) { 2882 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 2883 "membar #Sync" 2884 : /* no outputs */ 2885 : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU)); 2886 spitfire_put_dtlb_data(i, 0x0UL); 2887 } 2888 2889 /* Spitfire Errata #32 workaround */ 2890 /* NOTE: Always runs on spitfire, so no 2891 * cheetah+ page size encodings. 
2892 */ 2893 __asm__ __volatile__("stxa %0, [%1] %2\n\t" 2894 "flush %%g6" 2895 : /* No outputs */ 2896 : "r" (0), 2897 "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU)); 2898 2899 if (!(spitfire_get_itlb_data(i) & _PAGE_L_4U)) { 2900 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t" 2901 "membar #Sync" 2902 : /* no outputs */ 2903 : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU)); 2904 spitfire_put_itlb_data(i, 0x0UL); 2905 } 2906 } 2907 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 2908 cheetah_flush_dtlb_all(); 2909 cheetah_flush_itlb_all(); 2910 } 2911 __asm__ __volatile__("wrpr %0, 0, %%pstate" 2912 : : "r" (pstate)); 2913 } 2914 2915 pte_t *pte_alloc_one_kernel(struct mm_struct *mm) 2916 { 2917 struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2918 pte_t *pte = NULL; 2919 2920 if (page) 2921 pte = (pte_t *) page_address(page); 2922 2923 return pte; 2924 } 2925 2926 pgtable_t pte_alloc_one(struct mm_struct *mm) 2927 { 2928 struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2929 if (!page) 2930 return NULL; 2931 if (!pgtable_pte_page_ctor(page)) { 2932 free_unref_page(page); 2933 return NULL; 2934 } 2935 return (pte_t *) page_address(page); 2936 } 2937 2938 void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 2939 { 2940 free_page((unsigned long)pte); 2941 } 2942 2943 static void __pte_free(pgtable_t pte) 2944 { 2945 struct page *page = virt_to_page(pte); 2946 2947 pgtable_pte_page_dtor(page); 2948 __free_page(page); 2949 } 2950 2951 void pte_free(struct mm_struct *mm, pgtable_t pte) 2952 { 2953 __pte_free(pte); 2954 } 2955 2956 void pgtable_free(void *table, bool is_page) 2957 { 2958 if (is_page) 2959 __pte_free(table); 2960 else 2961 kmem_cache_free(pgtable_cache, table); 2962 } 2963 2964 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2965 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 2966 pmd_t *pmd) 2967 { 2968 unsigned long pte, flags; 2969 struct mm_struct *mm; 2970 pmd_t entry = *pmd; 2971 2972 if (!pmd_large(entry) || !pmd_young(entry)) 2973 return; 2974 2975 pte = pmd_val(entry); 2976 2977 /* Don't insert a non-valid PMD into the TSB, we'll deadlock. */ 2978 if (!(pte & _PAGE_VALID)) 2979 return; 2980 2981 /* We are fabricating 8MB pages using 4MB real hw pages. 
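	 * REAL_HPAGE_SHIFT corresponds to the 4MB hardware page size,
	 * so the OR below folds exactly one bit of the faulting address
	 * into the TTE: faults in the upper 4MB half of the 8MB huge
	 * page use a physical address 4MB beyond the one stored in the
	 * PMD, faults in the lower half use it unchanged.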
*/ 2982 pte |= (addr & (1UL << REAL_HPAGE_SHIFT)); 2983 2984 mm = vma->vm_mm; 2985 2986 spin_lock_irqsave(&mm->context.lock, flags); 2987 2988 if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) 2989 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT, 2990 addr, pte); 2991 2992 spin_unlock_irqrestore(&mm->context.lock, flags); 2993 } 2994 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2995 2996 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) 2997 static void context_reload(void *__data) 2998 { 2999 struct mm_struct *mm = __data; 3000 3001 if (mm == current->mm) 3002 load_secondary_context(mm); 3003 } 3004 3005 void hugetlb_setup(struct pt_regs *regs) 3006 { 3007 struct mm_struct *mm = current->mm; 3008 struct tsb_config *tp; 3009 3010 if (faulthandler_disabled() || !mm) { 3011 const struct exception_table_entry *entry; 3012 3013 entry = search_exception_tables(regs->tpc); 3014 if (entry) { 3015 regs->tpc = entry->fixup; 3016 regs->tnpc = regs->tpc + 4; 3017 return; 3018 } 3019 pr_alert("Unexpected HugeTLB setup in atomic context.\n"); 3020 die_if_kernel("HugeTSB in atomic", regs); 3021 } 3022 3023 tp = &mm->context.tsb_block[MM_TSB_HUGE]; 3024 if (likely(tp->tsb == NULL)) 3025 tsb_grow(mm, MM_TSB_HUGE, 0); 3026 3027 tsb_context_switch(mm); 3028 smp_tsb_sync(mm); 3029 3030 /* On UltraSPARC-III+ and later, configure the second half of 3031 * the Data-TLB for huge pages. 3032 */ 3033 if (tlb_type == cheetah_plus) { 3034 bool need_context_reload = false; 3035 unsigned long ctx; 3036 3037 spin_lock_irq(&ctx_alloc_lock); 3038 ctx = mm->context.sparc64_ctx_val; 3039 ctx &= ~CTX_PGSZ_MASK; 3040 ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT; 3041 ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT; 3042 3043 if (ctx != mm->context.sparc64_ctx_val) { 3044 /* When changing the page size fields, we 3045 * must perform a context flush so that no 3046 * stale entries match. This flush must 3047 * occur with the original context register 3048 * settings. 3049 */ 3050 do_flush_tlb_mm(mm); 3051 3052 /* Reload the context register of all processors 3053 * also executing in this address space. 
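			 * (context_reload() is run everywhere via
			 * on_each_cpu() below, but only cpus currently
			 * executing this mm rewrite their secondary
			 * context register; the others pick up the new
			 * value at their next context switch.)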
 */
			mm->context.sparc64_ctx_val = ctx;
			need_context_reload = true;
		}
		spin_unlock_irq(&ctx_alloc_lock);

		if (need_context_reload)
			on_each_cpu(context_reload, mm, 0);
	}
}
#endif

static struct resource code_resource = {
	.name	= "Kernel code",
	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};

static struct resource data_resource = {
	.name	= "Kernel data",
	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};

static struct resource bss_resource = {
	.name	= "Kernel bss",
	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};

static inline resource_size_t compute_kern_paddr(void *addr)
{
	return (resource_size_t) (addr - KERNBASE + kern_base);
}

static void __init kernel_lds_init(void)
{
	code_resource.start = compute_kern_paddr(_text);
	code_resource.end   = compute_kern_paddr(_etext - 1);
	data_resource.start = compute_kern_paddr(_etext);
	data_resource.end   = compute_kern_paddr(_edata - 1);
	bss_resource.start  = compute_kern_paddr(__bss_start);
	bss_resource.end    = compute_kern_paddr(_end - 1);
}

static int __init report_memory(void)
{
	int i;
	struct resource *res;

	kernel_lds_init();

	for (i = 0; i < pavail_ents; i++) {
		res = kzalloc(sizeof(struct resource), GFP_KERNEL);

		if (!res) {
			pr_warn("Failed to allocate resource.\n");
			break;
		}

		res->name = "System RAM";
		res->start = pavail[i].phys_addr;
		res->end = pavail[i].phys_addr + pavail[i].reg_size - 1;
		res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;

		if (insert_resource(&iomem_resource, res) < 0) {
			pr_warn("Resource insertion failed.\n");
			break;
		}

		insert_resource(res, &code_resource);
		insert_resource(res, &data_resource);
		insert_resource(res, &bss_resource);
	}

	return 0;
}
arch_initcall(report_memory);

#ifdef CONFIG_SMP
#define do_flush_tlb_kernel_range	smp_flush_tlb_kernel_range
#else
#define do_flush_tlb_kernel_range	__flush_tlb_kernel_range
#endif

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) {
		if (start < LOW_OBP_ADDRESS) {
			flush_tsb_kernel_range(start, LOW_OBP_ADDRESS);
			do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS);
		}
		if (end > HI_OBP_ADDRESS) {
			flush_tsb_kernel_range(HI_OBP_ADDRESS, end);
			do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end);
		}
	} else {
		flush_tsb_kernel_range(start, end);
		do_flush_tlb_kernel_range(start, end);
	}
}

void copy_user_highpage(struct page *to, struct page *from,
			unsigned long vaddr, struct vm_area_struct *vma)
{
	char *vfrom, *vto;

	vfrom = kmap_atomic(from);
	vto = kmap_atomic(to);
	copy_user_page(vto, vfrom, vaddr, to);
	kunmap_atomic(vto);
	kunmap_atomic(vfrom);

	/* If this page has ADI enabled, copy over any ADI tags
	 * as well
	 */
	if (vma->vm_flags & VM_SPARC_ADI) {
		unsigned long pfrom, pto, i, adi_tag;

		pfrom = page_to_phys(from);
		pto = page_to_phys(to);

		for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) {
			asm volatile("ldxa [%1] %2, %0\n\t"
				     : "=r" (adi_tag)
				     : "r" (i), "i" (ASI_MCD_REAL));
			asm volatile("stxa %0, [%1] %2\n\t"
				     :
				     : "r"
(adi_tag), "r" (pto), 3180 "i" (ASI_MCD_REAL)); 3181 pto += adi_blksize(); 3182 } 3183 asm volatile("membar #Sync\n\t"); 3184 } 3185 } 3186 EXPORT_SYMBOL(copy_user_highpage); 3187 3188 void copy_highpage(struct page *to, struct page *from) 3189 { 3190 char *vfrom, *vto; 3191 3192 vfrom = kmap_atomic(from); 3193 vto = kmap_atomic(to); 3194 copy_page(vto, vfrom); 3195 kunmap_atomic(vto); 3196 kunmap_atomic(vfrom); 3197 3198 /* If this platform is ADI enabled, copy any ADI tags 3199 * as well 3200 */ 3201 if (adi_capable()) { 3202 unsigned long pfrom, pto, i, adi_tag; 3203 3204 pfrom = page_to_phys(from); 3205 pto = page_to_phys(to); 3206 3207 for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) { 3208 asm volatile("ldxa [%1] %2, %0\n\t" 3209 : "=r" (adi_tag) 3210 : "r" (i), "i" (ASI_MCD_REAL)); 3211 asm volatile("stxa %0, [%1] %2\n\t" 3212 : 3213 : "r" (adi_tag), "r" (pto), 3214 "i" (ASI_MCD_REAL)); 3215 pto += adi_blksize(); 3216 } 3217 asm volatile("membar #Sync\n\t"); 3218 } 3219 } 3220 EXPORT_SYMBOL(copy_highpage); 3221
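
/*
 * Purely illustrative sketch, kept under #if 0 and never compiled: the
 * ADI tag copy that is open-coded in copy_user_highpage() and
 * copy_highpage() above could be shared by a helper along these lines
 * (the name adi_copy_page_tags is hypothetical).  It walks the source
 * page in adi_blksize() steps and moves one ADI tag per block through
 * the MCD "real" ASI, exactly as the loops above do.
 */
#if 0
static void adi_copy_page_tags(struct page *to, struct page *from)
{
	unsigned long pfrom = page_to_phys(from);
	unsigned long pto = page_to_phys(to);
	unsigned long i, adi_tag;

	for (i = pfrom; i < (pfrom + PAGE_SIZE); i += adi_blksize()) {
		/* Load the tag covering this block of the source page... */
		asm volatile("ldxa [%1] %2, %0\n\t"
			     : "=r" (adi_tag)
			     : "r" (i), "i" (ASI_MCD_REAL));
		/* ...and store it at the matching offset in the destination. */
		asm volatile("stxa %0, [%1] %2\n\t"
			     : /* no outputs */
			     : "r" (adi_tag), "r" (pto), "i" (ASI_MCD_REAL));
		pto += adi_blksize();
	}
	asm volatile("membar #Sync\n\t");
}
#endif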