xref: /openbmc/linux/arch/s390/mm/pgtable.c (revision 7fe2f639)
/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

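/*
 * A crst (region/segment) table occupies 2 pages (8K) on 31-bit and
 * 4 pages (16K) on 64-bit. Page tables are handed out as 1K (31-bit)
 * or 2K (64-bit) fragments of a 4K page; FRAG_MASK has one bit per
 * fragment of such a page.
 */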
#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
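/*
 * Grow the address space of @mm up to @limit by stacking one or two
 * additional region tables on top of the current top-level table.
 * The new table is allocated outside of page_table_lock; if another
 * CPU has upgraded the mm in the meantime, the spare table is freed
 * and the check is repeated until asce_limit covers @limit.
 */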
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

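/*
 * Shrink the address space of @mm down to @limit by removing region
 * tables from the top of the tree, one level at a time, and reloading
 * the address-space-control element afterwards.
 */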
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

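/*
 * Atomically xor @bits into *@v and return the new value. The page
 * table code keeps its per-fragment allocation bits in
 * page->_mapcount and updates them with this helper.
 */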
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
#ifdef CONFIG_PGSTE
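/*
 * With page table extensions (pgste, needed by KVM) a page table uses
 * a full 4K page: the lower 2K hold the 256 pte entries, the upper 2K
 * the corresponding pgste entries. _mapcount is set to 3 to mark both
 * 2K halves as used.
 */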
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	pgtable_page_ctor(page);
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}
#endif

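/*
 * Allocate a page table for @mm. Without pgstes only 1K/2K of a 4K
 * page is needed, so fragments are handed out: pages with unused
 * fragments are kept on mm->context.pgtable_list and the allocation
 * bits live in the lower nibble of page->_mapcount (the upper nibble
 * marks fragments with a pending RCU free).
 */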
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned int mask, bit;

#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
#endif
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

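/*
 * Free a page table (fragment) immediately, without RCU deferral.
 * The backing 4K page is returned to the page allocator once all of
 * its fragments have been released.
 */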
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm))
		return page_table_free_pgste(table);
#endif
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

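/*
 * Second half of the RCU-deferred free, called from __tlb_remove_table
 * once the table can no longer be reached. @bit identifies the
 * fragment; FRAG_MASK denotes a full pgste page.
 */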
static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

#ifdef CONFIG_PGSTE
	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
#endif
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

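/*
 * Queue a page table for freeing via the generic RCU table freeing
 * machinery. The fragment is marked as pending in the upper nibble of
 * page->_mapcount and the same bit is encoded into the low bits of the
 * pointer handed to tlb_remove_table, so the fragment is not reused
 * before __tlb_remove_table has actually freed it.
 */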
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
#ifdef CONFIG_PGSTE
	if (mm_has_pgste(mm)) {
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
#endif
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

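/*
 * Callback of the generic RCU table freeing code. The low bits of the
 * encoded pointer distinguish a page table fragment, which goes
 * through __page_table_free_rcu(), from a crst table, which is freed
 * directly.
 */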
void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long) _table & PAGE_MASK);
	unsigned type = (unsigned long) _table & ~PAGE_MASK;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

#endif

/*
 * Switch on pgstes for the userspace process of the current task
 * (needed to run KVM guests).
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have a switched amode? If not, we cannot do SIE */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* OK, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
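/*
 * Check whether @page is currently mapped in the kernel address space
 * (used by the hibernation code when CONFIG_DEBUG_PAGEALLOC is set).
 * The lra (load real address) instruction sets condition code 0 only
 * if the address can be translated.
 */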
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */