xref: /openbmc/linux/arch/arm/mm/mmu.c (revision e8e0929d)
1 /*
2  *  linux/arch/arm/mm/mmu.c
3  *
4  *  Copyright (C) 1995-2005 Russell King
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10 #include <linux/module.h>
11 #include <linux/kernel.h>
12 #include <linux/errno.h>
13 #include <linux/init.h>
14 #include <linux/bootmem.h>
15 #include <linux/mman.h>
16 #include <linux/nodemask.h>
17 
18 #include <asm/cputype.h>
19 #include <asm/mach-types.h>
20 #include <asm/sections.h>
21 #include <asm/cachetype.h>
22 #include <asm/setup.h>
23 #include <asm/sizes.h>
24 #include <asm/smp_plat.h>
25 #include <asm/tlb.h>
26 #include <asm/highmem.h>
27 
28 #include <asm/mach/arch.h>
29 #include <asm/mach/map.h>
30 
31 #include "mm.h"
32 
/*
 * Per-CPU mmu_gather instances, defined through DEFINE_PER_CPU.
 */
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * empty_zero_page is a special page that is used for
 * zero-initialized data and COW.  It is assigned in paging_init().
 */
struct page *empty_zero_page;
EXPORT_SYMBOL(empty_zero_page);

/*
 * The pmd table for the upper-most set of pages.  paging_init()
 * points it at the pmd entry covering 0xffff0000.
 */
pmd_t *top_pmd;
46 
/*
 * Cache policy identifiers.  The selected value is used directly as an
 * index into cache_policies[], so the numbering must match the order of
 * that table.
 */
#define CPOLICY_UNCACHED	0
#define CPOLICY_BUFFERED	1
#define CPOLICY_WRITETHROUGH	2
#define CPOLICY_WRITEBACK	3
#define CPOLICY_WRITEALLOC	4

/* Currently selected policy (index into cache_policies[]). */
static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK;
/* Extra descriptor bits applied when ECC is requested with "ecc=on". */
static unsigned int ecc_mask __initdata = 0;
/* Base protections for user/kernel pages; set by build_mem_type_table(). */
pgprot_t pgprot_user;
pgprot_t pgprot_kernel;

EXPORT_SYMBOL(pgprot_user);
EXPORT_SYMBOL(pgprot_kernel);
60 
/*
 * Description of one selectable cache policy.
 */
struct cachepolicy {
	const char	policy[16];	/* name matched against "cachepolicy=" */
	unsigned int	cr_mask;	/* control register bits cleared for this policy */
	unsigned int	pmd;		/* bits OR'ed into section (pmd) descriptors */
	unsigned int	pte;		/* bits OR'ed into page (pte) protections */
};
67 
/*
 * Table of supported cache policies, indexed by the CPOLICY_* values.
 * The entry order must therefore follow the CPOLICY_* numbering.
 */
static struct cachepolicy cache_policies[] __initdata = {
	{
		.policy		= "uncached",
		.cr_mask	= CR_W|CR_C,
		.pmd		= PMD_SECT_UNCACHED,
		.pte		= L_PTE_MT_UNCACHED,
	}, {
		.policy		= "buffered",
		.cr_mask	= CR_C,
		.pmd		= PMD_SECT_BUFFERED,
		.pte		= L_PTE_MT_BUFFERABLE,
	}, {
		.policy		= "writethrough",
		.cr_mask	= 0,
		.pmd		= PMD_SECT_WT,
		.pte		= L_PTE_MT_WRITETHROUGH,
	}, {
		.policy		= "writeback",
		.cr_mask	= 0,
		.pmd		= PMD_SECT_WB,
		.pte		= L_PTE_MT_WRITEBACK,
	}, {
		.policy		= "writealloc",
		.cr_mask	= 0,
		.pmd		= PMD_SECT_WBWA,
		.pte		= L_PTE_MT_WRITEALLOC,
	}
};
96 
97 /*
98  * These are useful for identifying cache coherency
99  * problems by allowing the cache or the cache and
100  * writebuffer to be turned off.  (Note: the write
101  * buffer should not be on and the cache off).
102  */
103 static void __init early_cachepolicy(char **p)
104 {
105 	int i;
106 
107 	for (i = 0; i < ARRAY_SIZE(cache_policies); i++) {
108 		int len = strlen(cache_policies[i].policy);
109 
110 		if (memcmp(*p, cache_policies[i].policy, len) == 0) {
111 			cachepolicy = i;
112 			cr_alignment &= ~cache_policies[i].cr_mask;
113 			cr_no_alignment &= ~cache_policies[i].cr_mask;
114 			*p += len;
115 			break;
116 		}
117 	}
118 	if (i == ARRAY_SIZE(cache_policies))
119 		printk(KERN_ERR "ERROR: unknown or unsupported cache policy\n");
120 	if (cpu_architecture() >= CPU_ARCH_ARMv6) {
121 		printk(KERN_WARNING "Only cachepolicy=writeback supported on ARMv6 and later\n");
122 		cachepolicy = CPOLICY_WRITEBACK;
123 	}
124 	flush_cache_all();
125 	set_cr(cr_alignment);
126 }
127 __early_param("cachepolicy=", early_cachepolicy);
128 
129 static void __init early_nocache(char **__unused)
130 {
131 	char *p = "buffered";
132 	printk(KERN_WARNING "nocache is deprecated; use cachepolicy=%s\n", p);
133 	early_cachepolicy(&p);
134 }
135 __early_param("nocache", early_nocache);
136 
137 static void __init early_nowrite(char **__unused)
138 {
139 	char *p = "uncached";
140 	printk(KERN_WARNING "nowb is deprecated; use cachepolicy=%s\n", p);
141 	early_cachepolicy(&p);
142 }
143 __early_param("nowb", early_nowrite);
144 
145 static void __init early_ecc(char **p)
146 {
147 	if (memcmp(*p, "on", 2) == 0) {
148 		ecc_mask = PMD_PROTECTION;
149 		*p += 2;
150 	} else if (memcmp(*p, "off", 3) == 0) {
151 		ecc_mask = 0;
152 		*p += 3;
153 	}
154 }
155 __early_param("ecc=", early_ecc);
156 
157 static int __init noalign_setup(char *__unused)
158 {
159 	cr_alignment &= ~CR_A;
160 	cr_no_alignment &= ~CR_A;
161 	set_cr(cr_alignment);
162 	return 1;
163 }
164 __setup("noalign", noalign_setup);
165 
166 #ifndef CONFIG_SMP
167 void adjust_cr(unsigned long mask, unsigned long set)
168 {
169 	unsigned long flags;
170 
171 	mask &= ~CR_A;
172 
173 	set &= mask;
174 
175 	local_irq_save(flags);
176 
177 	cr_no_alignment = (cr_no_alignment & ~mask) | set;
178 	cr_alignment = (cr_alignment & ~mask) | set;
179 
180 	set_cr((get_cr() & ~mask) | set);
181 
182 	local_irq_restore(flags);
183 }
184 #endif
185 
/*
 * Common protection bits for device mappings (page and section form).
 * The expansions are fully parenthesized so that they stay a single
 * operand when combined with '&', '~', '==' and similar operators.
 */
#define PROT_PTE_DEVICE		(L_PTE_PRESENT|L_PTE_YOUNG|L_PTE_DIRTY|L_PTE_WRITE)
#define PROT_SECT_DEVICE	(PMD_TYPE_SECT|PMD_SECT_AP_WRITE)
188 
/*
 * Per-memory-type mapping attributes, indexed by the MT_* constants.
 * Each entry gives the pte protection bits (prot_pte), the first-level
 * table descriptor bits (prot_l1), the section descriptor bits
 * (prot_sect) and the domain.  The values here are only the starting
 * point: build_mem_type_table() adjusts them at boot for the CPU in use.
 * Entries without prot_l1 are rejected by create_mapping() when the
 * request is not section aligned.
 */
static struct mem_type mem_types[] = {
	[MT_DEVICE] = {		  /* Strongly ordered / ARMv6 shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
				  L_PTE_SHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_S,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_CACHED] = {	  /* ioremap_cached */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_WB,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_WC] = {	/* ioremap_wc */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_UNCACHED] = {
		.prot_pte	= PROT_PTE_DEVICE,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PMD_TYPE_SECT | PMD_SECT_XN,
		.domain		= DOMAIN_IO,
	},
	[MT_CACHECLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MINICLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_LOW_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_EXEC,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_USER,
	},
	[MT_HIGH_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_USER | L_PTE_EXEC,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_USER,
	},
	[MT_MEMORY] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_ROM] = {
		.prot_sect = PMD_TYPE_SECT,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_NONCACHED] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
};
254 
255 const struct mem_type *get_mem_type(unsigned int type)
256 {
257 	return type < ARRAY_SIZE(mem_types) ? &mem_types[type] : NULL;
258 }
259 EXPORT_SYMBOL(get_mem_type);
260 
/*
 * Adjust the PMD section entries according to the CPU in use.
 *
 * Called once from paging_init().  Besides fixing up mem_types[] for
 * the detected architecture/CPU, this also settles the final cache
 * policy, ORs the policy's pte bits into protection_map[], computes
 * pgprot_user / pgprot_kernel, and finally merges the domain number
 * into every non-empty first-level and section descriptor template.
 */
static void __init build_mem_type_table(void)
{
	struct cachepolicy *cp;
	unsigned int cr = get_cr();
	unsigned int user_pgprot, kern_pgprot, vecs_pgprot;
	int cpu_arch = cpu_architecture();
	int i;

	/*
	 * Clamp the requested cache policy to what the kernel
	 * configuration and the CPU architecture allow.
	 */
	if (cpu_arch < CPU_ARCH_ARMv6) {
#if defined(CONFIG_CPU_DCACHE_DISABLE)
		if (cachepolicy > CPOLICY_BUFFERED)
			cachepolicy = CPOLICY_BUFFERED;
#elif defined(CONFIG_CPU_DCACHE_WRITETHROUGH)
		if (cachepolicy > CPOLICY_WRITETHROUGH)
			cachepolicy = CPOLICY_WRITETHROUGH;
#endif
	}
	if (cpu_arch < CPU_ARCH_ARMv5) {
		if (cachepolicy >= CPOLICY_WRITEALLOC)
			cachepolicy = CPOLICY_WRITEBACK;
		ecc_mask = 0;
	}
#ifdef CONFIG_SMP
	/* SMP builds always use write-allocate, overriding any request. */
	cachepolicy = CPOLICY_WRITEALLOC;
#endif

	/*
	 * Strip out features not present on earlier architectures.
	 * Pre-ARMv5 CPUs don't have TEX bits.  Pre-ARMv6 CPUs or those
	 * without extended page tables don't have the 'Shared' bit.
	 */
	if (cpu_arch < CPU_ARCH_ARMv5)
		for (i = 0; i < ARRAY_SIZE(mem_types); i++)
			mem_types[i].prot_sect &= ~PMD_SECT_TEX(7);
	if ((cpu_arch < CPU_ARCH_ARMv6 || !(cr & CR_XP)) && !cpu_is_xsc3())
		for (i = 0; i < ARRAY_SIZE(mem_types); i++)
			mem_types[i].prot_sect &= ~PMD_SECT_S;

	/*
	 * ARMv5 and lower, bit 4 must be set for page tables (was: cache
	 * "update-able on write" bit on ARM610).  However, Xscale and
	 * Xscale3 require this bit to be cleared.
	 */
	if (cpu_is_xscale() || cpu_is_xsc3()) {
		for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
			mem_types[i].prot_sect &= ~PMD_BIT4;
			mem_types[i].prot_l1 &= ~PMD_BIT4;
		}
	} else if (cpu_arch < CPU_ARCH_ARMv6) {
		/* Only touch templates that are actually in use (non-zero). */
		for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
			if (mem_types[i].prot_l1)
				mem_types[i].prot_l1 |= PMD_BIT4;
			if (mem_types[i].prot_sect)
				mem_types[i].prot_sect |= PMD_BIT4;
		}
	}

	/*
	 * Mark the device areas according to the CPU/architecture.
	 */
	if (cpu_is_xsc3() || (cpu_arch >= CPU_ARCH_ARMv6 && (cr & CR_XP))) {
		if (!cpu_is_xsc3()) {
			/*
			 * Mark device regions on ARMv6+ as execute-never
			 * to prevent speculative instruction fetches.
			 */
			mem_types[MT_DEVICE].prot_sect |= PMD_SECT_XN;
			mem_types[MT_DEVICE_NONSHARED].prot_sect |= PMD_SECT_XN;
			mem_types[MT_DEVICE_CACHED].prot_sect |= PMD_SECT_XN;
			mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_XN;
		}
		if (cpu_arch >= CPU_ARCH_ARMv7 && (cr & CR_TRE)) {
			/*
			 * For ARMv7 with TEX remapping,
			 * - shared device is SXCB=1100
			 * - nonshared device is SXCB=0100
			 * - write combine device mem is SXCB=0001
			 * (Uncached Normal memory)
			 */
			mem_types[MT_DEVICE].prot_sect |= PMD_SECT_TEX(1);
			mem_types[MT_DEVICE_NONSHARED].prot_sect |= PMD_SECT_TEX(1);
			mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_BUFFERABLE;
		} else if (cpu_is_xsc3()) {
			/*
			 * For Xscale3,
			 * - shared device is TEXCB=00101
			 * - nonshared device is TEXCB=01000
			 * - write combine device mem is TEXCB=00100
			 * (Inner/Outer Uncacheable in xsc3 parlance)
			 */
			mem_types[MT_DEVICE].prot_sect |= PMD_SECT_TEX(1) | PMD_SECT_BUFFERED;
			mem_types[MT_DEVICE_NONSHARED].prot_sect |= PMD_SECT_TEX(2);
			mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_TEX(1);
		} else {
			/*
			 * For ARMv6 and ARMv7 without TEX remapping,
			 * - shared device is TEXCB=00001
			 * - nonshared device is TEXCB=01000
			 * - write combine device mem is TEXCB=00100
			 * (Uncached Normal in ARMv6 parlance).
			 */
			mem_types[MT_DEVICE].prot_sect |= PMD_SECT_BUFFERED;
			mem_types[MT_DEVICE_NONSHARED].prot_sect |= PMD_SECT_TEX(2);
			mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_TEX(1);
		}
	} else {
		/*
		 * On others, write combining is "Uncached/Buffered"
		 */
		mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_BUFFERABLE;
	}

	/*
	 * Now deal with the memory-type mappings
	 */
	cp = &cache_policies[cachepolicy];
	vecs_pgprot = kern_pgprot = user_pgprot = cp->pte;

#ifndef CONFIG_SMP
	/*
	 * Only use write-through for non-SMP systems
	 */
	if (cpu_arch >= CPU_ARCH_ARMv5 && cachepolicy > CPOLICY_WRITETHROUGH)
		vecs_pgprot = cache_policies[CPOLICY_WRITETHROUGH].pte;
#endif

	/*
	 * Enable CPU-specific coherency if supported.
	 * (Only available on XSC3 at the moment.)
	 */
	if (arch_is_coherent() && cpu_is_xsc3())
		mem_types[MT_MEMORY].prot_sect |= PMD_SECT_S;

	/*
	 * ARMv6 and above have extended page tables.
	 */
	if (cpu_arch >= CPU_ARCH_ARMv6 && (cr & CR_XP)) {
		/*
		 * Mark cache clean areas and XIP ROM read only
		 * from SVC mode and no access from userspace.
		 */
		mem_types[MT_ROM].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
		mem_types[MT_MINICLEAN].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;
		mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE;

#ifdef CONFIG_SMP
		/*
		 * Mark memory with the "shared" attribute for SMP systems
		 */
		user_pgprot |= L_PTE_SHARED;
		kern_pgprot |= L_PTE_SHARED;
		vecs_pgprot |= L_PTE_SHARED;
		mem_types[MT_MEMORY].prot_sect |= PMD_SECT_S;
		mem_types[MT_MEMORY_NONCACHED].prot_sect |= PMD_SECT_S;
#endif
	}

	/*
	 * Non-cacheable Normal - intended for memory areas that must
	 * not cause dirty cache line writebacks when used
	 */
	if (cpu_arch >= CPU_ARCH_ARMv6) {
		if (cpu_arch >= CPU_ARCH_ARMv7 && (cr & CR_TRE)) {
			/* Non-cacheable Normal is XCB = 001 */
			mem_types[MT_MEMORY_NONCACHED].prot_sect |=
				PMD_SECT_BUFFERED;
		} else {
			/* For both ARMv6 and non-TEX-remapping ARMv7 */
			mem_types[MT_MEMORY_NONCACHED].prot_sect |=
				PMD_SECT_TEX(1);
		}
	} else {
		mem_types[MT_MEMORY_NONCACHED].prot_sect |= PMD_SECT_BUFFERABLE;
	}

	/*
	 * Add the user cache-policy bits to the first 16 protection_map[]
	 * entries.  NOTE(review): 16 is presumably the size of
	 * protection_map[] - confirm against its definition.
	 */
	for (i = 0; i < 16; i++) {
		unsigned long v = pgprot_val(protection_map[i]);
		protection_map[i] = __pgprot(v | user_pgprot);
	}

	mem_types[MT_LOW_VECTORS].prot_pte |= vecs_pgprot;
	mem_types[MT_HIGH_VECTORS].prot_pte |= vecs_pgprot;

	pgprot_user   = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot);
	pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG |
				 L_PTE_DIRTY | L_PTE_WRITE |
				 L_PTE_EXEC | kern_pgprot);

	mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask;
	mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask;
	mem_types[MT_MEMORY].prot_sect |= ecc_mask | cp->pmd;
	mem_types[MT_ROM].prot_sect |= cp->pmd;

	/*
	 * The cache-clean area follows the main policy, except that
	 * write-allocate is mapped as plain write-back.  Other policies
	 * leave MT_CACHECLEAN unchanged.
	 */
	switch (cp->pmd) {
	case PMD_SECT_WT:
		mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WT;
		break;
	case PMD_SECT_WB:
	case PMD_SECT_WBWA:
		mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WB;
		break;
	}
	printk("Memory policy: ECC %sabled, Data cache %s\n",
		ecc_mask ? "en" : "dis", cp->policy);

	/*
	 * Merge the domain into every descriptor template that is in use;
	 * zero templates are left at zero.
	 */
	for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
		struct mem_type *t = &mem_types[i];
		if (t->prot_l1)
			t->prot_l1 |= PMD_DOMAIN(t->domain);
		if (t->prot_sect)
			t->prot_sect |= PMD_DOMAIN(t->domain);
	}
}
477 
/*
 * Virtual address of the exception vectors: 0xffff0000 when
 * vectors_high() is true, otherwise 0.
 */
#define vectors_base()	(vectors_high() ? 0xffff0000 : 0)
479 
480 static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
481 				  unsigned long end, unsigned long pfn,
482 				  const struct mem_type *type)
483 {
484 	pte_t *pte;
485 
486 	if (pmd_none(*pmd)) {
487 		pte = alloc_bootmem_low_pages(2 * PTRS_PER_PTE * sizeof(pte_t));
488 		__pmd_populate(pmd, __pa(pte) | type->prot_l1);
489 	}
490 
491 	pte = pte_offset_kernel(pmd, addr);
492 	do {
493 		set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);
494 		pfn++;
495 	} while (pte++, addr += PAGE_SIZE, addr != end);
496 }
497 
498 static void __init alloc_init_section(pgd_t *pgd, unsigned long addr,
499 				      unsigned long end, unsigned long phys,
500 				      const struct mem_type *type)
501 {
502 	pmd_t *pmd = pmd_offset(pgd, addr);
503 
504 	/*
505 	 * Try a section mapping - end, addr and phys must all be aligned
506 	 * to a section boundary.  Note that PMDs refer to the individual
507 	 * L1 entries, whereas PGDs refer to a group of L1 entries making
508 	 * up one logical pointer to an L2 table.
509 	 */
510 	if (((addr | end | phys) & ~SECTION_MASK) == 0) {
511 		pmd_t *p = pmd;
512 
513 		if (addr & SECTION_SIZE)
514 			pmd++;
515 
516 		do {
517 			*pmd = __pmd(phys | type->prot_sect);
518 			phys += SECTION_SIZE;
519 		} while (pmd++, addr += SECTION_SIZE, addr != end);
520 
521 		flush_pmd_entry(p);
522 	} else {
523 		/*
524 		 * No need to loop; pte's aren't interested in the
525 		 * individual L1 entries.
526 		 */
527 		alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
528 	}
529 }
530 
531 static void __init create_36bit_mapping(struct map_desc *md,
532 					const struct mem_type *type)
533 {
534 	unsigned long phys, addr, length, end;
535 	pgd_t *pgd;
536 
537 	addr = md->virtual;
538 	phys = (unsigned long)__pfn_to_phys(md->pfn);
539 	length = PAGE_ALIGN(md->length);
540 
541 	if (!(cpu_architecture() >= CPU_ARCH_ARMv6 || cpu_is_xsc3())) {
542 		printk(KERN_ERR "MM: CPU does not support supersection "
543 		       "mapping for 0x%08llx at 0x%08lx\n",
544 		       __pfn_to_phys((u64)md->pfn), addr);
545 		return;
546 	}
547 
548 	/* N.B.	ARMv6 supersections are only defined to work with domain 0.
549 	 *	Since domain assignments can in fact be arbitrary, the
550 	 *	'domain == 0' check below is required to insure that ARMv6
551 	 *	supersections are only allocated for domain 0 regardless
552 	 *	of the actual domain assignments in use.
553 	 */
554 	if (type->domain) {
555 		printk(KERN_ERR "MM: invalid domain in supersection "
556 		       "mapping for 0x%08llx at 0x%08lx\n",
557 		       __pfn_to_phys((u64)md->pfn), addr);
558 		return;
559 	}
560 
561 	if ((addr | length | __pfn_to_phys(md->pfn)) & ~SUPERSECTION_MASK) {
562 		printk(KERN_ERR "MM: cannot create mapping for "
563 		       "0x%08llx at 0x%08lx invalid alignment\n",
564 		       __pfn_to_phys((u64)md->pfn), addr);
565 		return;
566 	}
567 
568 	/*
569 	 * Shift bits [35:32] of address into bits [23:20] of PMD
570 	 * (See ARMv6 spec).
571 	 */
572 	phys |= (((md->pfn >> (32 - PAGE_SHIFT)) & 0xF) << 20);
573 
574 	pgd = pgd_offset_k(addr);
575 	end = addr + length;
576 	do {
577 		pmd_t *pmd = pmd_offset(pgd, addr);
578 		int i;
579 
580 		for (i = 0; i < 16; i++)
581 			*pmd++ = __pmd(phys | type->prot_sect | PMD_SECT_SUPER);
582 
583 		addr += SUPERSECTION_SIZE;
584 		phys += SUPERSECTION_SIZE;
585 		pgd += SUPERSECTION_SIZE >> PGDIR_SHIFT;
586 	} while (addr != end);
587 }
588 
589 /*
590  * Create the page directory entries and any necessary
591  * page tables for the mapping specified by `md'.  We
592  * are able to cope here with varying sizes and address
593  * offsets, and we take full advantage of sections and
594  * supersections.
595  */
596 void __init create_mapping(struct map_desc *md)
597 {
598 	unsigned long phys, addr, length, end;
599 	const struct mem_type *type;
600 	pgd_t *pgd;
601 
602 	if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
603 		printk(KERN_WARNING "BUG: not creating mapping for "
604 		       "0x%08llx at 0x%08lx in user region\n",
605 		       __pfn_to_phys((u64)md->pfn), md->virtual);
606 		return;
607 	}
608 
609 	if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
610 	    md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) {
611 		printk(KERN_WARNING "BUG: mapping for 0x%08llx at 0x%08lx "
612 		       "overlaps vmalloc space\n",
613 		       __pfn_to_phys((u64)md->pfn), md->virtual);
614 	}
615 
616 	type = &mem_types[md->type];
617 
618 	/*
619 	 * Catch 36-bit addresses
620 	 */
621 	if (md->pfn >= 0x100000) {
622 		create_36bit_mapping(md, type);
623 		return;
624 	}
625 
626 	addr = md->virtual & PAGE_MASK;
627 	phys = (unsigned long)__pfn_to_phys(md->pfn);
628 	length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
629 
630 	if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
631 		printk(KERN_WARNING "BUG: map for 0x%08lx at 0x%08lx can not "
632 		       "be mapped using pages, ignoring.\n",
633 		       __pfn_to_phys(md->pfn), addr);
634 		return;
635 	}
636 
637 	pgd = pgd_offset_k(addr);
638 	end = addr + length;
639 	do {
640 		unsigned long next = pgd_addr_end(addr, end);
641 
642 		alloc_init_section(pgd, addr, next, phys, type);
643 
644 		phys += next - addr;
645 		addr = next;
646 	} while (pgd++, addr != end);
647 }
648 
649 /*
650  * Create the architecture specific mappings
651  */
652 void __init iotable_init(struct map_desc *io_desc, int nr)
653 {
654 	int i;
655 
656 	for (i = 0; i < nr; i++)
657 		create_mapping(io_desc + i);
658 }
659 
/* Size in bytes of the vmalloc area; overridden by "vmalloc=". */
static unsigned long __initdata vmalloc_reserve = SZ_128M;
661 
662 /*
663  * vmalloc=size forces the vmalloc area to be exactly 'size'
664  * bytes. This can be used to increase (or decrease) the vmalloc
665  * area - the default is 128m.
666  */
667 static void __init early_vmalloc(char **arg)
668 {
669 	vmalloc_reserve = memparse(*arg, arg);
670 
671 	if (vmalloc_reserve < SZ_16M) {
672 		vmalloc_reserve = SZ_16M;
673 		printk(KERN_WARNING
674 			"vmalloc area too small, limiting to %luMB\n",
675 			vmalloc_reserve >> 20);
676 	}
677 
678 	if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) {
679 		vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
680 		printk(KERN_WARNING
681 			"vmalloc area is too big, limiting to %luMB\n",
682 			vmalloc_reserve >> 20);
683 	}
684 }
685 __early_param("vmalloc=", early_vmalloc);
686 
/*
 * Lowest virtual address of the vmalloc area, as a void pointer.
 * Evaluates vmalloc_reserve at each use.
 */
#define VMALLOC_MIN	(void *)(VMALLOC_END - vmalloc_reserve)
688 
689 static void __init sanity_check_meminfo(void)
690 {
691 	int i, j, highmem = 0;
692 
693 	for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
694 		struct membank *bank = &meminfo.bank[j];
695 		*bank = meminfo.bank[i];
696 
697 #ifdef CONFIG_HIGHMEM
698 		if (__va(bank->start) > VMALLOC_MIN ||
699 		    __va(bank->start) < (void *)PAGE_OFFSET)
700 			highmem = 1;
701 
702 		bank->highmem = highmem;
703 
704 		/*
705 		 * Split those memory banks which are partially overlapping
706 		 * the vmalloc area greatly simplifying things later.
707 		 */
708 		if (__va(bank->start) < VMALLOC_MIN &&
709 		    bank->size > VMALLOC_MIN - __va(bank->start)) {
710 			if (meminfo.nr_banks >= NR_BANKS) {
711 				printk(KERN_CRIT "NR_BANKS too low, "
712 						 "ignoring high memory\n");
713 			} else {
714 				memmove(bank + 1, bank,
715 					(meminfo.nr_banks - i) * sizeof(*bank));
716 				meminfo.nr_banks++;
717 				i++;
718 				bank[1].size -= VMALLOC_MIN - __va(bank->start);
719 				bank[1].start = __pa(VMALLOC_MIN - 1) + 1;
720 				bank[1].highmem = highmem = 1;
721 				j++;
722 			}
723 			bank->size = VMALLOC_MIN - __va(bank->start);
724 		}
725 #else
726 		bank->highmem = highmem;
727 
728 		/*
729 		 * Check whether this memory bank would entirely overlap
730 		 * the vmalloc area.
731 		 */
732 		if (__va(bank->start) >= VMALLOC_MIN ||
733 		    __va(bank->start) < (void *)PAGE_OFFSET) {
734 			printk(KERN_NOTICE "Ignoring RAM at %.8lx-%.8lx "
735 			       "(vmalloc region overlap).\n",
736 			       bank->start, bank->start + bank->size - 1);
737 			continue;
738 		}
739 
740 		/*
741 		 * Check whether this memory bank would partially overlap
742 		 * the vmalloc area.
743 		 */
744 		if (__va(bank->start + bank->size) > VMALLOC_MIN ||
745 		    __va(bank->start + bank->size) < __va(bank->start)) {
746 			unsigned long newsize = VMALLOC_MIN - __va(bank->start);
747 			printk(KERN_NOTICE "Truncating RAM at %.8lx-%.8lx "
748 			       "to -%.8lx (vmalloc region overlap).\n",
749 			       bank->start, bank->start + bank->size - 1,
750 			       bank->start + newsize - 1);
751 			bank->size = newsize;
752 		}
753 #endif
754 		j++;
755 	}
756 #ifdef CONFIG_HIGHMEM
757 	if (highmem) {
758 		const char *reason = NULL;
759 
760 		if (cache_is_vipt_aliasing()) {
761 			/*
762 			 * Interactions between kmap and other mappings
763 			 * make highmem support with aliasing VIPT caches
764 			 * rather difficult.
765 			 */
766 			reason = "with VIPT aliasing cache";
767 #ifdef CONFIG_SMP
768 		} else if (tlb_ops_need_broadcast()) {
769 			/*
770 			 * kmap_high needs to occasionally flush TLB entries,
771 			 * however, if the TLB entries need to be broadcast
772 			 * we may deadlock:
773 			 *  kmap_high(irqs off)->flush_all_zero_pkmaps->
774 			 *  flush_tlb_kernel_range->smp_call_function_many
775 			 *   (must not be called with irqs off)
776 			 */
777 			reason = "without hardware TLB ops broadcasting";
778 #endif
779 		}
780 		if (reason) {
781 			printk(KERN_CRIT "HIGHMEM is not supported %s, ignoring high memory\n",
782 				reason);
783 			while (j > 0 && meminfo.bank[j - 1].highmem)
784 				j--;
785 		}
786 	}
787 #endif
788 	meminfo.nr_banks = j;
789 }
790 
791 static inline void prepare_page_table(void)
792 {
793 	unsigned long addr;
794 
795 	/*
796 	 * Clear out all the mappings below the kernel image.
797 	 */
798 	for (addr = 0; addr < MODULES_VADDR; addr += PGDIR_SIZE)
799 		pmd_clear(pmd_off_k(addr));
800 
801 #ifdef CONFIG_XIP_KERNEL
802 	/* The XIP kernel is mapped in the module area -- skip over it */
803 	addr = ((unsigned long)_etext + PGDIR_SIZE - 1) & PGDIR_MASK;
804 #endif
805 	for ( ; addr < PAGE_OFFSET; addr += PGDIR_SIZE)
806 		pmd_clear(pmd_off_k(addr));
807 
808 	/*
809 	 * Clear out all the kernel space mappings, except for the first
810 	 * memory bank, up to the end of the vmalloc region.
811 	 */
812 	for (addr = __phys_to_virt(bank_phys_end(&meminfo.bank[0]));
813 	     addr < VMALLOC_END; addr += PGDIR_SIZE)
814 		pmd_clear(pmd_off_k(addr));
815 }
816 
/*
 * Reserve the various regions of node 0
 *
 * Marks as reserved in bootmem: the kernel image (data only for XIP
 * kernels), the initial page directory, a set of machine-specific
 * regions, and optionally a block of res_size bytes at PHYS_OFFSET.
 * res_size is assigned by several independent checks below; when more
 * than one applies, the last assignment wins.
 */
void __init reserve_node_zero(pg_data_t *pgdat)
{
	unsigned long res_size = 0;

	/*
	 * Register the kernel text and data with bootmem.
	 * Note that this can only be in node 0.
	 */
#ifdef CONFIG_XIP_KERNEL
	reserve_bootmem_node(pgdat, __pa(_data), _end - _data,
			BOOTMEM_DEFAULT);
#else
	reserve_bootmem_node(pgdat, __pa(_stext), _end - _stext,
			BOOTMEM_DEFAULT);
#endif

	/*
	 * Reserve the page tables.  These are already in use,
	 * and can only be in node 0.
	 */
	reserve_bootmem_node(pgdat, __pa(swapper_pg_dir),
			     PTRS_PER_PGD * sizeof(pgd_t), BOOTMEM_DEFAULT);

	/*
	 * Hmm... This should go elsewhere, but we really really need to
	 * stop things allocating the low memory; ideally we need a better
	 * implementation of GFP_DMA which does not assume that DMA-able
	 * memory starts at zero.
	 */
	if (machine_is_integrator() || machine_is_cintegrator())
		res_size = __pa(swapper_pg_dir) - PHYS_OFFSET;

	/*
	 * These should likewise go elsewhere.  They pre-reserve the
	 * screen memory region at the start of main system memory.
	 */
	if (machine_is_edb7211())
		res_size = 0x00020000;
	if (machine_is_p720t())
		res_size = 0x00014000;

	/* H1940 and RX3715 need to reserve this for suspend */

	if (machine_is_h1940() || machine_is_rx3715()) {
		reserve_bootmem_node(pgdat, 0x30003000, 0x1000,
				BOOTMEM_DEFAULT);
		reserve_bootmem_node(pgdat, 0x30081000, 0x1000,
				BOOTMEM_DEFAULT);
	}

	/* Fixed one-page reservations for several Palm machines. */
	if (machine_is_palmld() || machine_is_palmtx()) {
		reserve_bootmem_node(pgdat, 0xa0000000, 0x1000,
				BOOTMEM_EXCLUSIVE);
		reserve_bootmem_node(pgdat, 0xa0200000, 0x1000,
				BOOTMEM_EXCLUSIVE);
	}

	if (machine_is_treo680()) {
		reserve_bootmem_node(pgdat, 0xa0000000, 0x1000,
				BOOTMEM_EXCLUSIVE);
		reserve_bootmem_node(pgdat, 0xa2000000, 0x1000,
				BOOTMEM_EXCLUSIVE);
	}

	if (machine_is_palmt5())
		reserve_bootmem_node(pgdat, 0xa0200000, 0x1000,
				BOOTMEM_EXCLUSIVE);

	/*
	 * U300 - This platform family can share physical memory
	 * between two ARM cpus, one running Linux and the other
	 * running another OS.
	 */
	if (machine_is_u300()) {
#ifdef CONFIG_MACH_U300_SINGLE_RAM
#if ((CONFIG_MACH_U300_ACCESS_MEM_SIZE & 1) == 1) &&	\
	CONFIG_MACH_U300_2MB_ALIGNMENT_FIX
		res_size = 0x00100000;
#endif
#endif
	}

#ifdef CONFIG_SA1111
	/*
	 * Because of the SA1111 DMA bug, we want to preserve our
	 * precious DMA-able memory...
	 * (This assignment is unconditional and overrides any res_size
	 * chosen above.)
	 */
	res_size = __pa(swapper_pg_dir) - PHYS_OFFSET;
#endif
	if (res_size)
		reserve_bootmem_node(pgdat, PHYS_OFFSET, res_size,
				BOOTMEM_DEFAULT);
}
913 
/*
 * Set up the device mappings.  Since we clear out the page tables for all
 * mappings above VMALLOC_END, we will remove any debug device mappings.
 * This means you have to be careful how you debug this function, or any
 * called function.  This means you can't use any function or debugging
 * method which may touch any device, otherwise the kernel _will_ crash.
 *
 * The sequence is: allocate the vectors page, clear every first-level
 * entry from VMALLOC_END upwards, map the XIP kernel and cache flushing
 * regions where configured, map the vectors page, call the machine's
 * map_io() hook if it has one, and finally flush the TLB and caches.
 */
static void __init devicemaps_init(struct machine_desc *mdesc)
{
	struct map_desc map;
	unsigned long addr;
	void *vectors;

	/*
	 * Allocate the vector page early.
	 */
	vectors = alloc_bootmem_low_pages(PAGE_SIZE);

	/* The loop ends when addr wraps around to 0. */
	for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE)
		pmd_clear(pmd_off_k(addr));

	/*
	 * Map the kernel if it is XIP.
	 * It is always first in the modulearea.
	 */
#ifdef CONFIG_XIP_KERNEL
	map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
	map.virtual = MODULES_VADDR;
	map.length = ((unsigned long)_etext - map.virtual + ~SECTION_MASK) & SECTION_MASK;
	map.type = MT_ROM;
	create_mapping(&map);
#endif

	/*
	 * Map the cache flushing regions.
	 */
#ifdef FLUSH_BASE
	map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);
	map.virtual = FLUSH_BASE;
	map.length = SZ_1M;
	map.type = MT_CACHECLEAN;
	create_mapping(&map);
#endif
#ifdef FLUSH_BASE_MINICACHE
	map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
	map.virtual = FLUSH_BASE_MINICACHE;
	map.length = SZ_1M;
	map.type = MT_MINICLEAN;
	create_mapping(&map);
#endif

	/*
	 * Create a mapping for the machine vectors at the high-vectors
	 * location (0xffff0000).  If we aren't using high-vectors, also
	 * create a mapping at the low-vectors virtual address.
	 */
	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
	map.virtual = 0xffff0000;
	map.length = PAGE_SIZE;
	map.type = MT_HIGH_VECTORS;
	create_mapping(&map);

	if (!vectors_high()) {
		/* Same page and length as above, mapped again at address 0. */
		map.virtual = 0;
		map.type = MT_LOW_VECTORS;
		create_mapping(&map);
	}

	/*
	 * Ask the machine support to map in the statically mapped devices.
	 */
	if (mdesc->map_io)
		mdesc->map_io();

	/*
	 * Finally flush the caches and tlb to ensure that we're in a
	 * consistent state wrt the writebuffer.  This also ensures that
	 * any write-allocated cache lines in the vector page are written
	 * back.  After this point, we can start to touch devices again.
	 */
	local_flush_tlb_all();
	flush_cache_all();
}
997 
998 static void __init kmap_init(void)
999 {
1000 #ifdef CONFIG_HIGHMEM
1001 	pmd_t *pmd = pmd_off_k(PKMAP_BASE);
1002 	pte_t *pte = alloc_bootmem_low_pages(2 * PTRS_PER_PTE * sizeof(pte_t));
1003 	BUG_ON(!pmd_none(*pmd) || !pte);
1004 	__pmd_populate(pmd, __pa(pte) | _PAGE_KERNEL_TABLE);
1005 	pkmap_page_table = pte + PTRS_PER_PTE;
1006 #endif
1007 }
1008 
/*
 * paging_init() sets up the page tables, initialises the zone memory
 * maps, and sets up the zero page, bad page and bad page tables.
 *
 * @mdesc: machine descriptor, passed on to devicemaps_init() so that
 *         the machine's map_io() hook can be called.
 */
void __init paging_init(struct machine_desc *mdesc)
{
	void *zero_page;

	/*
	 * The memory type table is built first because create_mapping()
	 * reads mem_types[].  The zero page below is taken from bootmem,
	 * so it has to come after bootmem_init().
	 */
	build_mem_type_table();
	sanity_check_meminfo();
	prepare_page_table();
	bootmem_init();
	devicemaps_init(mdesc);
	kmap_init();

	top_pmd = pmd_off_k(0xffff0000);

	/*
	 * allocate the zero page.  Note that this always succeeds and
	 * returns a zeroed result.
	 */
	zero_page = alloc_bootmem_low_pages(PAGE_SIZE);
	empty_zero_page = virt_to_page(zero_page);
	flush_dcache_page(empty_zero_page);
}
1034 
1035 /*
1036  * In order to soft-boot, we need to insert a 1:1 mapping in place of
1037  * the user-mode pages.  This will then ensure that we have predictable
1038  * results when turning the mmu off
1039  */
1040 void setup_mm_for_reboot(char mode)
1041 {
1042 	unsigned long base_pmdval;
1043 	pgd_t *pgd;
1044 	int i;
1045 
1046 	if (current->mm && current->mm->pgd)
1047 		pgd = current->mm->pgd;
1048 	else
1049 		pgd = init_mm.pgd;
1050 
1051 	base_pmdval = PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | PMD_TYPE_SECT;
1052 	if (cpu_architecture() <= CPU_ARCH_ARMv5TEJ && !cpu_is_xscale())
1053 		base_pmdval |= PMD_BIT4;
1054 
1055 	for (i = 0; i < FIRST_USER_PGD_NR + USER_PTRS_PER_PGD; i++, pgd++) {
1056 		unsigned long pmdval = (i << PGDIR_SHIFT) | base_pmdval;
1057 		pmd_t *pmd;
1058 
1059 		pmd = pmd_off(pgd, i << PGDIR_SHIFT);
1060 		pmd[0] = __pmd(pmdval);
1061 		pmd[1] = __pmd(pmdval + (1 << (PGDIR_SHIFT - 1)));
1062 		flush_pmd_entry(pmd);
1063 	}
1064 }
1065