#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>

#include <asm/cacheflush.h>
#include <asm/e820.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

unsigned long __initdata e820_table_start;
unsigned long __meminitdata e820_table_end;
unsigned long __meminitdata e820_table_top;

int after_bootmem;

int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
                                = 1
#endif
;

int nx_enabled;

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static int disable_nx __cpuinitdata;

/*
 * noexec = on|off
 *
 * Control non-executable mappings for processes.
 *
 * on      Enable
 * off     Disable
 */
static int __init noexec_setup(char *str)
{
        if (!str)
                return -EINVAL;
        if (!strncmp(str, "on", 2)) {
                __supported_pte_mask |= _PAGE_NX;
                disable_nx = 0;
        } else if (!strncmp(str, "off", 3)) {
                disable_nx = 1;
                __supported_pte_mask &= ~_PAGE_NX;
        }
        return 0;
}
early_param("noexec", noexec_setup);
#endif

#ifdef CONFIG_X86_PAE
static void __init set_nx(void)
{
        unsigned int v[4], l, h;

        if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
                cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);

                if ((v[3] & (1 << 20)) && !disable_nx) {
                        rdmsr(MSR_EFER, l, h);
                        l |= EFER_NX;
                        wrmsr(MSR_EFER, l, h);
                        nx_enabled = 1;
                        __supported_pte_mask |= _PAGE_NX;
                }
        }
}
#else
static inline void set_nx(void)
{
}
#endif

#ifdef CONFIG_X86_64
void __cpuinit check_efer(void)
{
        unsigned long efer;

        rdmsrl(MSR_EFER, efer);
        if (!(efer & EFER_NX) || disable_nx)
                __supported_pte_mask &= ~_PAGE_NX;
}
#endif
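
/*
 * Descriptive note (not in the original source): set_nx() above and
 * check_efer() (its 64-bit counterpart) only reconcile _PAGE_NX in
 * __supported_pte_mask with what the CPU and the "noexec=" option allow;
 * nothing is mapped here. init_memory_mapping() below calls set_nx()
 * before it builds the direct mapping, so the mask is settled by the
 * time kernel page table entries are created.
 */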

static void __init find_early_table_space(unsigned long end, int use_pse,
                                          int use_gbpages)
{
        unsigned long puds, pmds, ptes, tables, start;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);

        if (use_gbpages) {
                unsigned long extra;

                extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
                pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
        } else
                pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;

        tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);

        if (use_pse) {
                unsigned long extra;

                extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
#ifdef CONFIG_X86_32
                extra += PMD_SIZE;
#endif
                ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
        } else
                ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;

        tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);

#ifdef CONFIG_X86_32
        /* for fixmap */
        tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
#endif

        /*
         * RED-PEN putting page tables only on node 0 could
         * cause a hotspot and fill up ZONE_DMA. The page tables
         * need roughly 0.5KB per GB.
         */
#ifdef CONFIG_X86_32
        start = 0x7000;
#else
        start = 0x8000;
#endif
        e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
                                          tables, PAGE_SIZE);
        if (e820_table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        e820_table_start >>= PAGE_SHIFT;
        e820_table_end = e820_table_start;
        e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);

        printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, e820_table_start << PAGE_SHIFT,
                e820_table_top << PAGE_SHIFT);
}

struct map_range {
        unsigned long start;
        unsigned long end;
        unsigned page_size_mask;
};

#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif

static int __meminit save_mr(struct map_range *mr, int nr_range,
                             unsigned long start_pfn, unsigned long end_pfn,
                             unsigned long page_size_mask)
{
        if (start_pfn < end_pfn) {
                if (nr_range >= NR_RANGE_MR)
                        panic("run out of range for init_memory_mapping\n");
                mr[nr_range].start = start_pfn<<PAGE_SHIFT;
                mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
                mr[nr_range].page_size_mask = page_size_mask;
                nr_range++;
        }

        return nr_range;
}

#ifdef CONFIG_X86_64
static void __init init_gbpages(void)
{
        if (direct_gbpages && cpu_has_gbpages)
                printk(KERN_INFO "Using GB pages for direct mapping\n");
        else
                direct_gbpages = 0;
}
#else
static inline void init_gbpages(void)
{
}
#endif
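
/*
 * Illustrative note (not part of the original file): each map_range entry
 * produced by save_mr() is later handed to kernel_physical_mapping_init()
 * together with its page_size_mask, after adjacent ranges with the same
 * mask have been merged. A typical caller of init_memory_mapping() below
 * is setup_arch(), roughly:
 *
 *      max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 *
 * (the exact call site and variables depend on the kernel version).
 */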

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
                                               unsigned long end)
{
        unsigned long page_size_mask = 0;
        unsigned long start_pfn, end_pfn;
        unsigned long ret = 0;
        unsigned long pos;

        struct map_range mr[NR_RANGE_MR];
        int nr_range, i;
        int use_pse, use_gbpages;

        printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);

        if (!after_bootmem)
                init_gbpages();

#ifdef CONFIG_DEBUG_PAGEALLOC
        /*
         * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
         * This will simplify cpa(), which otherwise needs to support splitting
         * large pages into small in interrupt context, etc.
         */
        use_pse = use_gbpages = 0;
#else
        use_pse = cpu_has_pse;
        use_gbpages = direct_gbpages;
#endif

        set_nx();
        if (nx_enabled)
                printk(KERN_INFO "NX (Execute Disable) protection: active\n");

        /* Enable PSE if available */
        if (cpu_has_pse)
                set_in_cr4(X86_CR4_PSE);

        /* Enable PGE if available */
        if (cpu_has_pge) {
                set_in_cr4(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        }

        if (use_gbpages)
                page_size_mask |= 1 << PG_LEVEL_1G;
        if (use_pse)
                page_size_mask |= 1 << PG_LEVEL_2M;

        memset(mr, 0, sizeof(mr));
        nr_range = 0;

        /* head if not big page alignment ? */
        start_pfn = start >> PAGE_SHIFT;
        pos = start_pfn << PAGE_SHIFT;
#ifdef CONFIG_X86_32
        /*
         * Don't use a large page for the first 2/4MB of memory
         * because there are often fixed size MTRRs in there
         * and overlapping MTRRs into large pages can cause
         * slowdowns.
         */
        if (pos == 0)
                end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
        else
                end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
                                 << (PMD_SHIFT - PAGE_SHIFT);
#else /* CONFIG_X86_64 */
        end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
                        << (PMD_SHIFT - PAGE_SHIFT);
#endif
        if (end_pfn > (end >> PAGE_SHIFT))
                end_pfn = end >> PAGE_SHIFT;
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
                pos = end_pfn << PAGE_SHIFT;
        }

        /* big page (2M) range */
        start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
                         << (PMD_SHIFT - PAGE_SHIFT);
#ifdef CONFIG_X86_32
        end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
#else /* CONFIG_X86_64 */
        end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
                         << (PUD_SHIFT - PAGE_SHIFT);
        if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
                end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
#endif

        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
                pos = end_pfn << PAGE_SHIFT;
        }

#ifdef CONFIG_X86_64
        /* big page (1G) range */
        start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
                         << (PUD_SHIFT - PAGE_SHIFT);
        end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask &
                                 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
                pos = end_pfn << PAGE_SHIFT;
        }

        /* tail is not big page (1G) alignment */
        start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
                         << (PMD_SHIFT - PAGE_SHIFT);
        end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
                pos = end_pfn << PAGE_SHIFT;
        }
#endif

        /* tail is not big page (2M) alignment */
        start_pfn = pos>>PAGE_SHIFT;
        end_pfn = end>>PAGE_SHIFT;
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

        /* try to merge same page size and continuous */
        for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
                unsigned long old_start;
                if (mr[i].end != mr[i+1].start ||
                    mr[i].page_size_mask != mr[i+1].page_size_mask)
                        continue;
                /* move it */
                old_start = mr[i].start;
                memmove(&mr[i], &mr[i+1],
                        (nr_range - 1 - i) * sizeof(struct map_range));
                mr[i--].start = old_start;
                nr_range--;
        }

        for (i = 0; i < nr_range; i++)
                printk(KERN_DEBUG " %010lx - %010lx page %s\n",
                                mr[i].start, mr[i].end,
                        (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
                         (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
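
        /*
         * Illustrative example (assumed configuration, not output captured
         * from this code): on a 32-bit PAE machine with 2GB of RAM and PSE
         * available, the loop above would typically print something like
         *
         *      0000000000 - 0000200000 page 4k
         *      0000200000 - 0080000000 page 2M
         *
         * i.e. the MTRR-protected head mapped with 4k pages and the rest
         * with large pages.
         */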

        /*
         * Find space for the kernel direct mapping tables.
         *
         * Later we should allocate these tables in the local node of the
         * memory mapped. Unfortunately this is done currently before the
         * nodes are discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end, use_pse, use_gbpages);

#ifdef CONFIG_X86_32
        for (i = 0; i < nr_range; i++)
                kernel_physical_mapping_init(mr[i].start, mr[i].end,
                                             mr[i].page_size_mask);
        ret = end;
#else /* CONFIG_X86_64 */
        for (i = 0; i < nr_range; i++)
                ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
                                                   mr[i].page_size_mask);
#endif

#ifdef CONFIG_X86_32
        early_ioremap_page_table_range_init();

        load_cr3(swapper_pg_dir);
#endif

#ifdef CONFIG_X86_64
        if (!after_bootmem && !start) {
                pud_t *pud;
                pmd_t *pmd;

                mmu_cr4_features = read_cr4();

                /*
                 * _brk_end cannot change anymore, but it and _end may be
                 * located on different 2M pages. cleanup_highmap(), however,
                 * can only consider _end when it runs, so destroy any
                 * mappings beyond _brk_end here.
                 */
                pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
                pmd = pmd_offset(pud, _brk_end - 1);
                while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
                        pmd_clear(pmd);
        }
#endif
        __flush_tlb_all();

        if (!after_bootmem && e820_table_end > e820_table_start)
                reserve_early(e820_table_start << PAGE_SHIFT,
                              e820_table_end << PAGE_SHIFT, "PGTABLE");

        if (!after_bootmem)
                early_memtest(start, end);

        return ret >> PAGE_SHIFT;
}


/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of ram because that
 * area contains bios code and data regions used by X and dosemu and similar
 * apps. Access has to be given to non-kernel-ram areas as well, these contain
 * the PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
        if (pagenr <= 256)
                return 1;
        if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
                return 0;
        if (!page_is_ram(pagenr))
                return 1;
        return 0;
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr = begin;

        if (addr >= end)
                return;

        /*
         * If debugging page accesses then do not free this memory but
         * mark them not present - any buggy init-section access will
         * create a kernel page fault:
         */
#ifdef CONFIG_DEBUG_PAGEALLOC
        printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
                begin, PAGE_ALIGN(end));
        set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
        /*
         * We just marked the kernel text read only above, now that
         * we are going to free part of that, we need to make that
         * writeable first.
         */
        set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);

        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

        for (; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                        POISON_FREE_INITMEM, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
#endif
}

void free_initmem(void)
{
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif
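
/*
 * Illustrative note (assumption about callers, not part of this file):
 * free_initmem() is typically invoked from the generic boot code in
 * init/main.c once initialization is complete and before the first
 * userspace process starts, and free_initrd_mem() from the initramfs/
 * initrd unpacking code; both rely on free_init_pages() above being
 * safe to call after bootmem has been torn down.
 */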