xref: /openbmc/linux/arch/s390/mm/pgtable.c (revision b34e08d5)
1 /*
2  *    Copyright IBM Corp. 2007, 2011
3  *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/kernel.h>
8 #include <linux/errno.h>
9 #include <linux/gfp.h>
10 #include <linux/mm.h>
11 #include <linux/swap.h>
12 #include <linux/smp.h>
13 #include <linux/highmem.h>
14 #include <linux/pagemap.h>
15 #include <linux/spinlock.h>
16 #include <linux/module.h>
17 #include <linux/quicklist.h>
18 #include <linux/rcupdate.h>
19 #include <linux/slab.h>
20 #include <linux/swapops.h>
21 
22 #include <asm/pgtable.h>
23 #include <asm/pgalloc.h>
24 #include <asm/tlb.h>
25 #include <asm/tlbflush.h>
26 #include <asm/mmu_context.h>
27 
28 #ifndef CONFIG_64BIT
29 #define ALLOC_ORDER	1
30 #define FRAG_MASK	0x0f
31 #else
32 #define ALLOC_ORDER	2
33 #define FRAG_MASK	0x03
34 #endif
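
/*
 * ALLOC_ORDER is the page allocation order used for the region/segment
 * (crst) tables handed out by crst_table_alloc() below; FRAG_MASK tracks
 * which 1K/2K fragments of a 4K page are in use as page tables in
 * page_table_alloc() and page_table_free().
 */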
35 
36 
37 unsigned long *crst_table_alloc(struct mm_struct *mm)
38 {
39 	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
40 
41 	if (!page)
42 		return NULL;
43 	return (unsigned long *) page_to_phys(page);
44 }
45 
46 void crst_table_free(struct mm_struct *mm, unsigned long *table)
47 {
48 	free_pages((unsigned long) table, ALLOC_ORDER);
49 }
50 
51 #ifdef CONFIG_64BIT
52 static void __crst_table_upgrade(void *arg)
53 {
54 	struct mm_struct *mm = arg;
55 
56 	if (current->active_mm == mm)
57 		update_user_asce(mm, 1);
58 	__tlb_flush_local();
59 }
60 
61 int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
62 {
63 	unsigned long *table, *pgd;
64 	unsigned long entry;
65 	int flush;
66 
67 	BUG_ON(limit > (1UL << 53));
68 	flush = 0;
69 repeat:
70 	table = crst_table_alloc(mm);
71 	if (!table)
72 		return -ENOMEM;
73 	spin_lock_bh(&mm->page_table_lock);
74 	if (mm->context.asce_limit < limit) {
75 		pgd = (unsigned long *) mm->pgd;
76 		if (mm->context.asce_limit <= (1UL << 31)) {
77 			entry = _REGION3_ENTRY_EMPTY;
78 			mm->context.asce_limit = 1UL << 42;
79 			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
80 						_ASCE_USER_BITS |
81 						_ASCE_TYPE_REGION3;
82 		} else {
83 			entry = _REGION2_ENTRY_EMPTY;
84 			mm->context.asce_limit = 1UL << 53;
85 			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
86 						_ASCE_USER_BITS |
87 						_ASCE_TYPE_REGION2;
88 		}
89 		crst_table_init(table, entry);
90 		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
91 		mm->pgd = (pgd_t *) table;
92 		mm->task_size = mm->context.asce_limit;
93 		table = NULL;
94 		flush = 1;
95 	}
96 	spin_unlock_bh(&mm->page_table_lock);
97 	if (table)
98 		crst_table_free(mm, table);
99 	if (mm->context.asce_limit < limit)
100 		goto repeat;
101 	if (flush)
102 		on_each_cpu(__crst_table_upgrade, mm, 0);
103 	return 0;
104 }
105 
106 void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
107 {
108 	pgd_t *pgd;
109 
110 	if (current->active_mm == mm) {
111 		clear_user_asce(mm, 1);
112 		__tlb_flush_mm(mm);
113 	}
114 	while (mm->context.asce_limit > limit) {
115 		pgd = mm->pgd;
116 		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
117 		case _REGION_ENTRY_TYPE_R2:
118 			mm->context.asce_limit = 1UL << 42;
119 			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
120 						_ASCE_USER_BITS |
121 						_ASCE_TYPE_REGION3;
122 			break;
123 		case _REGION_ENTRY_TYPE_R3:
124 			mm->context.asce_limit = 1UL << 31;
125 			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
126 						_ASCE_USER_BITS |
127 						_ASCE_TYPE_SEGMENT;
128 			break;
129 		default:
130 			BUG();
131 		}
132 		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
133 		mm->task_size = mm->context.asce_limit;
134 		crst_table_free(mm, (unsigned long *) pgd);
135 	}
136 	if (current->active_mm == mm)
137 		update_user_asce(mm, 1);
138 }
139 #endif
140 
141 #ifdef CONFIG_PGSTE
142 
143 /**
144  * gmap_alloc - allocate a guest address space
145  * @mm: pointer to the parent mm_struct
146  *
147  * Returns a guest address space structure or NULL if out of memory.
148  */
149 struct gmap *gmap_alloc(struct mm_struct *mm)
150 {
151 	struct gmap *gmap;
152 	struct page *page;
153 	unsigned long *table;
154 
155 	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
156 	if (!gmap)
157 		goto out;
158 	INIT_LIST_HEAD(&gmap->crst_list);
159 	gmap->mm = mm;
160 	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
161 	if (!page)
162 		goto out_free;
163 	list_add(&page->lru, &gmap->crst_list);
164 	table = (unsigned long *) page_to_phys(page);
165 	crst_table_init(table, _REGION1_ENTRY_EMPTY);
166 	gmap->table = table;
167 	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
168 		     _ASCE_USER_BITS | __pa(table);
169 	list_add(&gmap->list, &mm->context.gmap_list);
170 	return gmap;
171 
172 out_free:
173 	kfree(gmap);
174 out:
175 	return NULL;
176 }
177 EXPORT_SYMBOL_GPL(gmap_alloc);
178 
179 static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
180 {
181 	struct gmap_pgtable *mp;
182 	struct gmap_rmap *rmap;
183 	struct page *page;
184 
185 	if (*table & _SEGMENT_ENTRY_INVALID)
186 		return 0;
187 	page = pfn_to_page(*table >> PAGE_SHIFT);
188 	mp = (struct gmap_pgtable *) page->index;
189 	list_for_each_entry(rmap, &mp->mapper, list) {
190 		if (rmap->entry != table)
191 			continue;
192 		list_del(&rmap->list);
193 		kfree(rmap);
194 		break;
195 	}
196 	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
197 	return 1;
198 }
199 
200 static void gmap_flush_tlb(struct gmap *gmap)
201 {
202 	if (MACHINE_HAS_IDTE)
203 		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
204 				 _ASCE_TYPE_REGION1);
205 	else
206 		__tlb_flush_global();
207 }
208 
209 /**
210  * gmap_free - free a guest address space
211  * @gmap: pointer to the guest address space structure
212  */
213 void gmap_free(struct gmap *gmap)
214 {
215 	struct page *page, *next;
216 	unsigned long *table;
217 	int i;
218 
219 
220 	/* Flush tlb. */
221 	if (MACHINE_HAS_IDTE)
222 		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
223 				 _ASCE_TYPE_REGION1);
224 	else
225 		__tlb_flush_global();
226 
227 	/* Free all segment & region tables. */
228 	down_read(&gmap->mm->mmap_sem);
229 	spin_lock(&gmap->mm->page_table_lock);
230 	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
231 		table = (unsigned long *) page_to_phys(page);
232 		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
233 			/* Remove gmap rmap structures for segment table. */
234 			for (i = 0; i < PTRS_PER_PMD; i++, table++)
235 				gmap_unlink_segment(gmap, table);
236 		__free_pages(page, ALLOC_ORDER);
237 	}
238 	spin_unlock(&gmap->mm->page_table_lock);
239 	up_read(&gmap->mm->mmap_sem);
240 	list_del(&gmap->list);
241 	kfree(gmap);
242 }
243 EXPORT_SYMBOL_GPL(gmap_free);
244 
245 /**
246  * gmap_enable - switch primary space to the guest address space
247  * @gmap: pointer to the guest address space structure
248  */
249 void gmap_enable(struct gmap *gmap)
250 {
251 	S390_lowcore.gmap = (unsigned long) gmap;
252 }
253 EXPORT_SYMBOL_GPL(gmap_enable);
254 
255 /**
256  * gmap_disable - switch back to the standard primary address space
257  * @gmap: pointer to the guest address space structure
258  */
259 void gmap_disable(struct gmap *gmap)
260 {
261 	S390_lowcore.gmap = 0UL;
262 }
263 EXPORT_SYMBOL_GPL(gmap_disable);
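
/*
 * Illustrative sketch, not part of the original file: how a hypervisor
 * such as KVM might drive the gmap lifecycle with the helpers above.
 * The helper name kvm_like_gmap_example() is hypothetical.
 */
static int __maybe_unused kvm_like_gmap_example(struct mm_struct *mm)
{
	struct gmap *gmap;

	gmap = gmap_alloc(mm);		/* region-1 table plus meta data */
	if (!gmap)
		return -ENOMEM;
	gmap_enable(gmap);		/* switch to the guest address space */
	/* ... run the guest, resolving addresses via gmap_fault() ... */
	gmap_disable(gmap);		/* back to the standard address space */
	gmap_free(gmap);		/* flush and release all tables */
	return 0;
}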
264 
265 /*
266  * gmap_alloc_table is assumed to be called with mmap_sem held
267  */
268 static int gmap_alloc_table(struct gmap *gmap,
269 			    unsigned long *table, unsigned long init)
270 	__releases(&gmap->mm->page_table_lock)
271 	__acquires(&gmap->mm->page_table_lock)
272 {
273 	struct page *page;
274 	unsigned long *new;
275 
276 	/* Since we don't free the gmap table until gmap_free we can unlock */
277 	spin_unlock(&gmap->mm->page_table_lock);
278 	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
279 	spin_lock(&gmap->mm->page_table_lock);
280 	if (!page)
281 		return -ENOMEM;
282 	new = (unsigned long *) page_to_phys(page);
283 	crst_table_init(new, init);
284 	if (*table & _REGION_ENTRY_INVALID) {
285 		list_add(&page->lru, &gmap->crst_list);
286 		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
287 			(*table & _REGION_ENTRY_TYPE_MASK);
288 	} else
289 		__free_pages(page, ALLOC_ORDER);
290 	return 0;
291 }
292 
293 /**
294  * gmap_unmap_segment - unmap segment from the guest address space
295  * @gmap: pointer to the guest address space structure
296  * @to: address in the guest address space
297  * @len: length of the memory area to unmap
298  *
299  * Returns 0 if the unmap succeeded, -EINVAL if not.
300  */
301 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
302 {
303 	unsigned long *table;
304 	unsigned long off;
305 	int flush;
306 
307 	if ((to | len) & (PMD_SIZE - 1))
308 		return -EINVAL;
309 	if (len == 0 || to + len < to)
310 		return -EINVAL;
311 
312 	flush = 0;
313 	down_read(&gmap->mm->mmap_sem);
314 	spin_lock(&gmap->mm->page_table_lock);
315 	for (off = 0; off < len; off += PMD_SIZE) {
316 		/* Walk the guest addr space page table */
317 		table = gmap->table + (((to + off) >> 53) & 0x7ff);
318 		if (*table & _REGION_ENTRY_INVALID)
319 			goto out;
320 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
321 		table = table + (((to + off) >> 42) & 0x7ff);
322 		if (*table & _REGION_ENTRY_INVALID)
323 			goto out;
324 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
325 		table = table + (((to + off) >> 31) & 0x7ff);
326 		if (*table & _REGION_ENTRY_INVALID)
327 			goto out;
328 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
329 		table = table + (((to + off) >> 20) & 0x7ff);
330 
331 		/* Clear segment table entry in guest address space. */
332 		flush |= gmap_unlink_segment(gmap, table);
333 		*table = _SEGMENT_ENTRY_INVALID;
334 	}
335 out:
336 	spin_unlock(&gmap->mm->page_table_lock);
337 	up_read(&gmap->mm->mmap_sem);
338 	if (flush)
339 		gmap_flush_tlb(gmap);
340 	return 0;
341 }
342 EXPORT_SYMBOL_GPL(gmap_unmap_segment);
343 
344 /**
345  * gmap_map_segment - map a segment to the guest address space
346  * @gmap: pointer to the guest address space structure
347  * @from: source address in the parent address space
348  * @to: target address in the guest address space
 * @len: length of the memory area to map
349  *
350  * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
351  */
352 int gmap_map_segment(struct gmap *gmap, unsigned long from,
353 		     unsigned long to, unsigned long len)
354 {
355 	unsigned long *table;
356 	unsigned long off;
357 	int flush;
358 
359 	if ((from | to | len) & (PMD_SIZE - 1))
360 		return -EINVAL;
361 	if (len == 0 || from + len > TASK_MAX_SIZE ||
362 	    from + len < from || to + len < to)
363 		return -EINVAL;
364 
365 	flush = 0;
366 	down_read(&gmap->mm->mmap_sem);
367 	spin_lock(&gmap->mm->page_table_lock);
368 	for (off = 0; off < len; off += PMD_SIZE) {
369 		/* Walk the gmap address space page table */
370 		table = gmap->table + (((to + off) >> 53) & 0x7ff);
371 		if ((*table & _REGION_ENTRY_INVALID) &&
372 		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
373 			goto out_unmap;
374 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
375 		table = table + (((to + off) >> 42) & 0x7ff);
376 		if ((*table & _REGION_ENTRY_INVALID) &&
377 		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
378 			goto out_unmap;
379 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
380 		table = table + (((to + off) >> 31) & 0x7ff);
381 		if ((*table & _REGION_ENTRY_INVALID) &&
382 		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
383 			goto out_unmap;
384 		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
385 		table = table + (((to + off) >> 20) & 0x7ff);
386 
387 		/* Store 'from' address in an invalid segment table entry. */
388 		flush |= gmap_unlink_segment(gmap, table);
389 		*table =  (from + off) | (_SEGMENT_ENTRY_INVALID |
390 					  _SEGMENT_ENTRY_PROTECT);
391 	}
392 	spin_unlock(&gmap->mm->page_table_lock);
393 	up_read(&gmap->mm->mmap_sem);
394 	if (flush)
395 		gmap_flush_tlb(gmap);
396 	return 0;
397 
398 out_unmap:
399 	spin_unlock(&gmap->mm->page_table_lock);
400 	up_read(&gmap->mm->mmap_sem);
401 	gmap_unmap_segment(gmap, to, len);
402 	return -ENOMEM;
403 }
404 EXPORT_SYMBOL_GPL(gmap_map_segment);
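
/*
 * Illustrative sketch, not part of the original file: mapping a chunk of
 * the parent address space into the guest and tearing it down again.
 * map_guest_chunk() is a hypothetical helper name; from, to and len must
 * be segment (PMD_SIZE) aligned.
 */
static int __maybe_unused map_guest_chunk(struct gmap *gmap,
					  unsigned long from,
					  unsigned long to, unsigned long len)
{
	int rc;

	rc = gmap_map_segment(gmap, from, to, len);
	if (rc)
		return rc;		/* -EINVAL or -ENOMEM */
	/* ... use the mapping ... */
	return gmap_unmap_segment(gmap, to, len);
}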
405 
406 static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
407 {
408 	unsigned long *table;
409 
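	/* Each table index below is 11 bits wide (2048 entries per table). */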
410 	table = gmap->table + ((address >> 53) & 0x7ff);
411 	if (unlikely(*table & _REGION_ENTRY_INVALID))
412 		return ERR_PTR(-EFAULT);
413 	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
414 	table = table + ((address >> 42) & 0x7ff);
415 	if (unlikely(*table & _REGION_ENTRY_INVALID))
416 		return ERR_PTR(-EFAULT);
417 	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
418 	table = table + ((address >> 31) & 0x7ff);
419 	if (unlikely(*table & _REGION_ENTRY_INVALID))
420 		return ERR_PTR(-EFAULT);
421 	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
422 	table = table + ((address >> 20) & 0x7ff);
423 	return table;
424 }
425 
426 /**
427  * __gmap_translate - translate a guest address to a user space address
428  * @address: guest address
429  * @gmap: pointer to guest mapping meta data structure
430  *
431  * Returns user space address which corresponds to the guest address or
432  * -EFAULT if no such mapping exists.
433  * This function does not establish potentially missing page table entries.
434  * The mmap_sem of the mm that belongs to the address space must be held
435  * when this function gets called.
436  */
437 unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
438 {
439 	unsigned long *segment_ptr, vmaddr, segment;
440 	struct gmap_pgtable *mp;
441 	struct page *page;
442 
443 	current->thread.gmap_addr = address;
444 	segment_ptr = gmap_table_walk(address, gmap);
445 	if (IS_ERR(segment_ptr))
446 		return PTR_ERR(segment_ptr);
447 	/* Convert the gmap address to an mm address. */
448 	segment = *segment_ptr;
449 	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
450 		page = pfn_to_page(segment >> PAGE_SHIFT);
451 		mp = (struct gmap_pgtable *) page->index;
452 		return mp->vmaddr | (address & ~PMD_MASK);
453 	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
454 		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
455 		return vmaddr | (address & ~PMD_MASK);
456 	}
457 	return -EFAULT;
458 }
459 EXPORT_SYMBOL_GPL(__gmap_translate);
460 
461 /**
462  * gmap_translate - translate a guest address to a user space address
463  * @address: guest address
464  * @gmap: pointer to guest mapping meta data structure
465  *
466  * Returns user space address which corresponds to the guest address or
467  * -EFAULT if no such mapping exists.
468  * This function does not establish potentially missing page table entries.
469  */
470 unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
471 {
472 	unsigned long rc;
473 
474 	down_read(&gmap->mm->mmap_sem);
475 	rc = __gmap_translate(address, gmap);
476 	up_read(&gmap->mm->mmap_sem);
477 	return rc;
478 }
479 EXPORT_SYMBOL_GPL(gmap_translate);
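
/*
 * Illustrative sketch, not part of the original file: checking the result
 * of gmap_translate(). Errors are encoded in the returned value, so
 * IS_ERR_VALUE() distinguishes a user space address from -EFAULT.
 * guest_to_user() is a hypothetical helper name.
 */
static unsigned long __maybe_unused guest_to_user(struct gmap *gmap,
						  unsigned long gaddr)
{
	unsigned long uaddr;

	uaddr = gmap_translate(gaddr, gmap);
	if (IS_ERR_VALUE(uaddr))
		return 0;		/* no mapping exists for gaddr */
	return uaddr;
}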
480 
481 static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
482 				unsigned long *segment_ptr, struct gmap *gmap)
483 {
484 	unsigned long vmaddr;
485 	struct vm_area_struct *vma;
486 	struct gmap_pgtable *mp;
487 	struct gmap_rmap *rmap;
488 	struct mm_struct *mm;
489 	struct page *page;
490 	pgd_t *pgd;
491 	pud_t *pud;
492 	pmd_t *pmd;
493 
494 	mm = gmap->mm;
495 	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
496 	vma = find_vma(mm, vmaddr);
497 	if (!vma || vma->vm_start > vmaddr)
498 		return -EFAULT;
499 	/* Walk the parent mm page table */
500 	pgd = pgd_offset(mm, vmaddr);
501 	pud = pud_alloc(mm, pgd, vmaddr);
502 	if (!pud)
503 		return -ENOMEM;
504 	pmd = pmd_alloc(mm, pud, vmaddr);
505 	if (!pmd)
506 		return -ENOMEM;
507 	if (!pmd_present(*pmd) &&
508 	    __pte_alloc(mm, vma, pmd, vmaddr))
509 		return -ENOMEM;
510 	/* large pmds cannot yet be handled */
511 	if (pmd_large(*pmd))
512 		return -EFAULT;
513 	/* pmd now points to a valid segment table entry. */
514 	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
515 	if (!rmap)
516 		return -ENOMEM;
517 	/* Link gmap segment table entry location to page table. */
518 	page = pmd_page(*pmd);
519 	mp = (struct gmap_pgtable *) page->index;
520 	rmap->gmap = gmap;
521 	rmap->entry = segment_ptr;
522 	rmap->vmaddr = address & PMD_MASK;
523 	spin_lock(&mm->page_table_lock);
524 	if (*segment_ptr == segment) {
525 		list_add(&rmap->list, &mp->mapper);
526 		/* Set gmap segment table entry to page table. */
527 		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
528 		rmap = NULL;
529 	}
530 	spin_unlock(&mm->page_table_lock);
531 	kfree(rmap);
532 	return 0;
533 }
534 
535 static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
536 {
537 	struct gmap_rmap *rmap, *next;
538 	struct gmap_pgtable *mp;
539 	struct page *page;
540 	int flush;
541 
542 	flush = 0;
543 	spin_lock(&mm->page_table_lock);
544 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
545 	mp = (struct gmap_pgtable *) page->index;
546 	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
547 		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
548 					     _SEGMENT_ENTRY_PROTECT);
549 		list_del(&rmap->list);
550 		kfree(rmap);
551 		flush = 1;
552 	}
553 	spin_unlock(&mm->page_table_lock);
554 	if (flush)
555 		__tlb_flush_global();
556 }
557 
558 /*
559  * this function is assumed to be called with mmap_sem held
560  */
561 unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
562 {
563 	unsigned long *segment_ptr, segment;
564 	struct gmap_pgtable *mp;
565 	struct page *page;
566 	int rc;
567 
568 	current->thread.gmap_addr = address;
569 	segment_ptr = gmap_table_walk(address, gmap);
570 	if (IS_ERR(segment_ptr))
571 		return -EFAULT;
572 	/* Convert the gmap address to an mm address. */
573 	while (1) {
574 		segment = *segment_ptr;
575 		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
576 			/* Page table is present */
577 			page = pfn_to_page(segment >> PAGE_SHIFT);
578 			mp = (struct gmap_pgtable *) page->index;
579 			return mp->vmaddr | (address & ~PMD_MASK);
580 		}
581 		if (!(segment & _SEGMENT_ENTRY_PROTECT))
582 			/* Nothing mapped in the gmap address space. */
583 			break;
584 		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
585 		if (rc)
586 			return rc;
587 	}
588 	return -EFAULT;
589 }
590 
591 unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
592 {
593 	unsigned long rc;
594 
595 	down_read(&gmap->mm->mmap_sem);
596 	rc = __gmap_fault(address, gmap);
597 	up_read(&gmap->mm->mmap_sem);
598 
599 	return rc;
600 }
601 EXPORT_SYMBOL_GPL(gmap_fault);
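
/*
 * Illustrative sketch, not part of the original file: resolving a guest
 * address and faulting in the backing page, following the same pattern
 * gmap_ipte_notify() uses below. resolve_guest_page() is a hypothetical
 * helper name.
 */
static int __maybe_unused resolve_guest_page(struct gmap *gmap,
					     unsigned long gaddr)
{
	unsigned long uaddr;
	int rc = 0;

	down_read(&gmap->mm->mmap_sem);
	uaddr = __gmap_fault(gaddr, gmap);	/* connect gmap and mm tables */
	if (IS_ERR_VALUE(uaddr))
		rc = (int) uaddr;
	else if (fixup_user_fault(current, gmap->mm, uaddr, FAULT_FLAG_WRITE))
		rc = -EFAULT;			/* could not fault in the page */
	up_read(&gmap->mm->mmap_sem);
	return rc;
}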
602 
603 static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
604 {
605 	if (!non_swap_entry(entry))
606 		dec_mm_counter(mm, MM_SWAPENTS);
607 	else if (is_migration_entry(entry)) {
608 		struct page *page = migration_entry_to_page(entry);
609 
610 		if (PageAnon(page))
611 			dec_mm_counter(mm, MM_ANONPAGES);
612 		else
613 			dec_mm_counter(mm, MM_FILEPAGES);
614 	}
615 	free_swap_and_cache(entry);
616 }
617 
618 /*
619  * The mm->mmap_sem lock must be held.
620  */
621 static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
622 {
623 	unsigned long ptev, pgstev;
624 	spinlock_t *ptl;
625 	pgste_t pgste;
626 	pte_t *ptep, pte;
627 
628 	ptep = get_locked_pte(mm, address, &ptl);
629 	if (unlikely(!ptep))
630 		return;
631 	pte = *ptep;
632 	if (!pte_swap(pte))
633 		goto out_pte;
634 	/* Zap unused and logically-zero pages */
635 	pgste = pgste_get_lock(ptep);
636 	pgstev = pgste_val(pgste);
637 	ptev = pte_val(pte);
638 	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
639 	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
640 		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
641 		pte_clear(mm, address, ptep);
642 	}
643 	pgste_set_unlock(ptep, pgste);
644 out_pte:
645 	pte_unmap_unlock(*ptep, ptl);
646 }
647 
648 /*
649  * this function is assumed to be called with mmap_sem held
650  */
651 void __gmap_zap(unsigned long address, struct gmap *gmap)
652 {
653 	unsigned long *table, *segment_ptr;
654 	unsigned long segment, pgstev, ptev;
655 	struct gmap_pgtable *mp;
656 	struct page *page;
657 
658 	segment_ptr = gmap_table_walk(address, gmap);
659 	if (IS_ERR(segment_ptr))
660 		return;
661 	segment = *segment_ptr;
662 	if (segment & _SEGMENT_ENTRY_INVALID)
663 		return;
664 	page = pfn_to_page(segment >> PAGE_SHIFT);
665 	mp = (struct gmap_pgtable *) page->index;
666 	address = mp->vmaddr | (address & ~PMD_MASK);
667 	/* Page table is present */
668 	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
669 	table = table + ((address >> 12) & 0xff);
670 	pgstev = table[PTRS_PER_PTE];
671 	ptev = table[0];
672 	/* quick check, checked again with locks held */
673 	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
674 	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
675 		gmap_zap_unused(gmap->mm, address);
676 }
677 EXPORT_SYMBOL_GPL(__gmap_zap);
678 
679 void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
680 {
681 
682 	unsigned long *table, address, size;
683 	struct vm_area_struct *vma;
684 	struct gmap_pgtable *mp;
685 	struct page *page;
686 
687 	down_read(&gmap->mm->mmap_sem);
688 	address = from;
689 	while (address < to) {
690 		/* Walk the gmap address space page table */
691 		table = gmap->table + ((address >> 53) & 0x7ff);
692 		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
693 			address = (address + PMD_SIZE) & PMD_MASK;
694 			continue;
695 		}
696 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
697 		table = table + ((address >> 42) & 0x7ff);
698 		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
699 			address = (address + PMD_SIZE) & PMD_MASK;
700 			continue;
701 		}
702 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
703 		table = table + ((address >> 31) & 0x7ff);
704 		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
705 			address = (address + PMD_SIZE) & PMD_MASK;
706 			continue;
707 		}
708 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
709 		table = table + ((address >> 20) & 0x7ff);
710 		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
711 			address = (address + PMD_SIZE) & PMD_MASK;
712 			continue;
713 		}
714 		page = pfn_to_page(*table >> PAGE_SHIFT);
715 		mp = (struct gmap_pgtable *) page->index;
716 		vma = find_vma(gmap->mm, mp->vmaddr);
717 		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
718 		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
719 			       size, NULL);
720 		address = (address + PMD_SIZE) & PMD_MASK;
721 	}
722 	up_read(&gmap->mm->mmap_sem);
723 }
724 EXPORT_SYMBOL_GPL(gmap_discard);
725 
726 static LIST_HEAD(gmap_notifier_list);
727 static DEFINE_SPINLOCK(gmap_notifier_lock);
728 
729 /**
730  * gmap_register_ipte_notifier - register a pte invalidation callback
731  * @nb: pointer to the gmap notifier block
732  */
733 void gmap_register_ipte_notifier(struct gmap_notifier *nb)
734 {
735 	spin_lock(&gmap_notifier_lock);
736 	list_add(&nb->list, &gmap_notifier_list);
737 	spin_unlock(&gmap_notifier_lock);
738 }
739 EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
740 
741 /**
742  * gmap_unregister_ipte_notifier - remove a pte invalidation callback
743  * @nb: pointer to the gmap notifier block
744  */
745 void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
746 {
747 	spin_lock(&gmap_notifier_lock);
748 	list_del_init(&nb->list);
749 	spin_unlock(&gmap_notifier_lock);
750 }
751 EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
752 
753 /**
754  * gmap_ipte_notify - mark a range of ptes for invalidation notification
755  * @gmap: pointer to guest mapping meta data structure
756  * @start: virtual address in the guest address space
757  * @len: size of area
758  *
759  * Returns 0 if for each page in the given range a gmap mapping exists and
760  * the invalidation notification could be set. If the gmap mapping is missing
761  * for one or more pages -EFAULT is returned. If no memory could be allocated
762  * -ENOMEM is returned. This function establishes missing page table entries.
763  */
764 int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
765 {
766 	unsigned long addr;
767 	spinlock_t *ptl;
768 	pte_t *ptep, entry;
769 	pgste_t pgste;
770 	int rc = 0;
771 
772 	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
773 		return -EINVAL;
774 	down_read(&gmap->mm->mmap_sem);
775 	while (len) {
776 		/* Convert gmap address and connect the page tables */
777 		addr = __gmap_fault(start, gmap);
778 		if (IS_ERR_VALUE(addr)) {
779 			rc = addr;
780 			break;
781 		}
782 		/* Get the page mapped */
783 		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
784 			rc = -EFAULT;
785 			break;
786 		}
787 		/* Walk the process page table, lock and get pte pointer */
788 		ptep = get_locked_pte(gmap->mm, addr, &ptl);
789 		if (unlikely(!ptep))
790 			continue;
791 		/* Set notification bit in the pgste of the pte */
792 		entry = *ptep;
793 		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
794 			pgste = pgste_get_lock(ptep);
795 			pgste_val(pgste) |= PGSTE_IN_BIT;
796 			pgste_set_unlock(ptep, pgste);
797 			start += PAGE_SIZE;
798 			len -= PAGE_SIZE;
799 		}
800 		spin_unlock(ptl);
801 	}
802 	up_read(&gmap->mm->mmap_sem);
803 	return rc;
804 }
805 EXPORT_SYMBOL_GPL(gmap_ipte_notify);
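
/*
 * Illustrative sketch, not part of the original file: registering an
 * invalidation callback and arming it for a range of guest pages. The
 * names my_pte_invalidated, my_gmap_notifier and arm_ipte_notification
 * are hypothetical.
 */
static void my_pte_invalidated(struct gmap *gmap, unsigned long address)
{
	/* e.g. tell the vcpu that the page at "address" was invalidated */
}

static struct gmap_notifier my_gmap_notifier = {
	.notifier_call	= my_pte_invalidated,
};

static int __maybe_unused arm_ipte_notification(struct gmap *gmap,
						unsigned long start,
						unsigned long len)
{
	gmap_register_ipte_notifier(&my_gmap_notifier);
	return gmap_ipte_notify(gmap, start, len);	/* 0, -EFAULT or -ENOMEM */
}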
806 
807 /**
808  * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
809  * @mm: pointer to the process mm_struct
810  * @pte: pointer to the page table entry
811  *
812  * This function is assumed to be called with the page table lock held
813  * for the pte to notify.
814  */
815 void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte)
816 {
817 	unsigned long segment_offset;
818 	struct gmap_notifier *nb;
819 	struct gmap_pgtable *mp;
820 	struct gmap_rmap *rmap;
821 	struct page *page;
822 
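	/*
	 * Offset of the 4K page mapped by "pte" within its 1M segment:
	 * the index of the pte in its 256-entry page table times PAGE_SIZE.
	 */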
823 	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
824 	segment_offset = segment_offset * (4096 / sizeof(pte_t));
825 	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
826 	mp = (struct gmap_pgtable *) page->index;
827 	spin_lock(&gmap_notifier_lock);
828 	list_for_each_entry(rmap, &mp->mapper, list) {
829 		list_for_each_entry(nb, &gmap_notifier_list, list)
830 			nb->notifier_call(rmap->gmap,
831 					  rmap->vmaddr + segment_offset);
832 	}
833 	spin_unlock(&gmap_notifier_lock);
834 }
835 
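/*
 * page->_mapcount encodes how a page table page is used: -1 for a free
 * page, 0 for a page table with pgstes (2K of ptes plus 2K of pgstes,
 * see page_table_alloc_pgste) and a FRAG_MASK bit pattern for the 1K/2K
 * fragments handed out by page_table_alloc().
 */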
836 static inline int page_table_with_pgste(struct page *page)
837 {
838 	return atomic_read(&page->_mapcount) == 0;
839 }
840 
841 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
842 						    unsigned long vmaddr)
843 {
844 	struct page *page;
845 	unsigned long *table;
846 	struct gmap_pgtable *mp;
847 
848 	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
849 	if (!page)
850 		return NULL;
851 	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
852 	if (!mp) {
853 		__free_page(page);
854 		return NULL;
855 	}
856 	if (!pgtable_page_ctor(page)) {
857 		kfree(mp);
858 		__free_page(page);
859 		return NULL;
860 	}
861 	mp->vmaddr = vmaddr & PMD_MASK;
862 	INIT_LIST_HEAD(&mp->mapper);
863 	page->index = (unsigned long) mp;
864 	atomic_set(&page->_mapcount, 0);
865 	table = (unsigned long *) page_to_phys(page);
866 	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
867 	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
868 		    PAGE_SIZE/2);
869 	return table;
870 }
871 
872 static inline void page_table_free_pgste(unsigned long *table)
873 {
874 	struct page *page;
875 	struct gmap_pgtable *mp;
876 
877 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
878 	mp = (struct gmap_pgtable *) page->index;
879 	BUG_ON(!list_empty(&mp->mapper));
880 	pgtable_page_dtor(page);
881 	atomic_set(&page->_mapcount, -1);
882 	kfree(mp);
883 	__free_page(page);
884 }
885 
886 static inline unsigned long page_table_reset_pte(struct mm_struct *mm,
887 			pmd_t *pmd, unsigned long addr, unsigned long end)
888 {
889 	pte_t *start_pte, *pte;
890 	spinlock_t *ptl;
891 	pgste_t pgste;
892 
893 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
894 	pte = start_pte;
895 	do {
896 		pgste = pgste_get_lock(pte);
897 		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
898 		pgste_set_unlock(pte, pgste);
899 	} while (pte++, addr += PAGE_SIZE, addr != end);
900 	pte_unmap_unlock(start_pte, ptl);
901 
902 	return addr;
903 }
904 
905 static inline unsigned long page_table_reset_pmd(struct mm_struct *mm,
906 			pud_t *pud, unsigned long addr, unsigned long end)
907 {
908 	unsigned long next;
909 	pmd_t *pmd;
910 
911 	pmd = pmd_offset(pud, addr);
912 	do {
913 		next = pmd_addr_end(addr, end);
914 		if (pmd_none_or_clear_bad(pmd))
915 			continue;
916 		next = page_table_reset_pte(mm, pmd, addr, next);
917 	} while (pmd++, addr = next, addr != end);
918 
919 	return addr;
920 }
921 
922 static inline unsigned long page_table_reset_pud(struct mm_struct *mm,
923 			pgd_t *pgd, unsigned long addr, unsigned long end)
924 {
925 	unsigned long next;
926 	pud_t *pud;
927 
928 	pud = pud_offset(pgd, addr);
929 	do {
930 		next = pud_addr_end(addr, end);
931 		if (pud_none_or_clear_bad(pud))
932 			continue;
933 		next = page_table_reset_pmd(mm, pud, addr, next);
934 	} while (pud++, addr = next, addr != end);
935 
936 	return addr;
937 }
938 
939 void page_table_reset_pgste(struct mm_struct *mm,
940 			unsigned long start, unsigned long end)
941 {
942 	unsigned long addr, next;
943 	pgd_t *pgd;
944 
945 	addr = start;
946 	down_read(&mm->mmap_sem);
947 	pgd = pgd_offset(mm, addr);
948 	do {
949 		next = pgd_addr_end(addr, end);
950 		if (pgd_none_or_clear_bad(pgd))
951 			continue;
952 		next = page_table_reset_pud(mm, pgd, addr, next);
953 	} while (pgd++, addr = next, addr != end);
954 	up_read(&mm->mmap_sem);
955 }
956 EXPORT_SYMBOL(page_table_reset_pgste);
957 
958 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
959 			  unsigned long key, bool nq)
960 {
961 	spinlock_t *ptl;
962 	pgste_t old, new;
963 	pte_t *ptep;
964 
965 	down_read(&mm->mmap_sem);
966 	ptep = get_locked_pte(mm, addr, &ptl);
967 	if (unlikely(!ptep)) {
968 		up_read(&mm->mmap_sem);
969 		return -EFAULT;
970 	}
971 
972 	new = old = pgste_get_lock(ptep);
973 	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
974 			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
975 	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
976 	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
977 	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
978 		unsigned long address, bits, skey;
979 
980 		address = pte_val(*ptep) & PAGE_MASK;
981 		skey = (unsigned long) page_get_storage_key(address);
982 		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
983 		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
984 		/* Set storage key ACC and FP */
985 		page_set_storage_key(address, skey, !nq);
986 		/* Merge host changed & referenced into pgste  */
987 		pgste_val(new) |= bits << 52;
988 	}
989 	/* changing the guest storage key is considered a change of the page */
990 	if ((pgste_val(new) ^ pgste_val(old)) &
991 	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
992 		pgste_val(new) |= PGSTE_HC_BIT;
993 
994 	pgste_set_unlock(ptep, new);
995 	pte_unmap_unlock(*ptep, ptl);
996 	up_read(&mm->mmap_sem);
997 	return 0;
998 }
999 EXPORT_SYMBOL(set_guest_storage_key);
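
/*
 * Illustrative sketch, not part of the original file: setting the guest
 * storage key for a single page. set_one_guest_key() is a hypothetical
 * helper; uaddr is the user space address backing the guest page and
 * nq selects the nonquiescing key-setting variant, so false is the
 * conservative choice.
 */
static int __maybe_unused set_one_guest_key(struct mm_struct *mm,
					    unsigned long uaddr,
					    unsigned long key)
{
	return set_guest_storage_key(mm, uaddr, key, false);
}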
1000 
1001 #else /* CONFIG_PGSTE */
1002 
1003 static inline int page_table_with_pgste(struct page *page)
1004 {
1005 	return 0;
1006 }
1007 
1008 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
1009 						    unsigned long vmaddr)
1010 {
1011 	return NULL;
1012 }
1013 
1014 static inline void page_table_free_pgste(unsigned long *table)
1015 {
1016 }
1017 
1018 static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
1019 					   unsigned long *table)
1020 {
1021 }
1022 
1023 #endif /* CONFIG_PGSTE */
1024 
1025 static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
1026 {
1027 	unsigned int old, new;
1028 
1029 	do {
1030 		old = atomic_read(v);
1031 		new = old ^ bits;
1032 	} while (atomic_cmpxchg(v, old, new) != old);
1033 	return new;
1034 }
1035 
1036 /*
1037  * page table entry allocation/free routines.
1038  */
1039 unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
1040 {
1041 	unsigned long *uninitialized_var(table);
1042 	struct page *uninitialized_var(page);
1043 	unsigned int mask, bit;
1044 
1045 	if (mm_has_pgste(mm))
1046 		return page_table_alloc_pgste(mm, vmaddr);
1047 	/* Allocate fragments of a 4K page as 1K/2K page table */
1048 	spin_lock_bh(&mm->context.list_lock);
1049 	mask = FRAG_MASK;
1050 	if (!list_empty(&mm->context.pgtable_list)) {
1051 		page = list_first_entry(&mm->context.pgtable_list,
1052 					struct page, lru);
1053 		table = (unsigned long *) page_to_phys(page);
1054 		mask = atomic_read(&page->_mapcount);
1055 		mask = mask | (mask >> 4);
1056 	}
1057 	if ((mask & FRAG_MASK) == FRAG_MASK) {
1058 		spin_unlock_bh(&mm->context.list_lock);
1059 		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
1060 		if (!page)
1061 			return NULL;
1062 		if (!pgtable_page_ctor(page)) {
1063 			__free_page(page);
1064 			return NULL;
1065 		}
1066 		atomic_set(&page->_mapcount, 1);
1067 		table = (unsigned long *) page_to_phys(page);
1068 		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
1069 		spin_lock_bh(&mm->context.list_lock);
1070 		list_add(&page->lru, &mm->context.pgtable_list);
1071 	} else {
1072 		for (bit = 1; mask & bit; bit <<= 1)
1073 			table += PTRS_PER_PTE;
1074 		mask = atomic_xor_bits(&page->_mapcount, bit);
1075 		if ((mask & FRAG_MASK) == FRAG_MASK)
1076 			list_del(&page->lru);
1077 	}
1078 	spin_unlock_bh(&mm->context.list_lock);
1079 	return table;
1080 }
1081 
1082 void page_table_free(struct mm_struct *mm, unsigned long *table)
1083 {
1084 	struct page *page;
1085 	unsigned int bit, mask;
1086 
1087 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1088 	if (page_table_with_pgste(page)) {
1089 		gmap_disconnect_pgtable(mm, table);
1090 		return page_table_free_pgste(table);
1091 	}
1092 	/* Free 1K/2K page table fragment of a 4K page */
1093 	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
1094 	spin_lock_bh(&mm->context.list_lock);
1095 	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
1096 		list_del(&page->lru);
1097 	mask = atomic_xor_bits(&page->_mapcount, bit);
1098 	if (mask & FRAG_MASK)
1099 		list_add(&page->lru, &mm->context.pgtable_list);
1100 	spin_unlock_bh(&mm->context.list_lock);
1101 	if (mask == 0) {
1102 		pgtable_page_dtor(page);
1103 		atomic_set(&page->_mapcount, -1);
1104 		__free_page(page);
1105 	}
1106 }
1107 
1108 static void __page_table_free_rcu(void *table, unsigned bit)
1109 {
1110 	struct page *page;
1111 
1112 	if (bit == FRAG_MASK)
1113 		return page_table_free_pgste(table);
1114 	/* Free 1K/2K page table fragment of a 4K page */
1115 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1116 	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
1117 		pgtable_page_dtor(page);
1118 		atomic_set(&page->_mapcount, -1);
1119 		__free_page(page);
1120 	}
1121 }
1122 
1123 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
1124 {
1125 	struct mm_struct *mm;
1126 	struct page *page;
1127 	unsigned int bit, mask;
1128 
1129 	mm = tlb->mm;
1130 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1131 	if (page_table_with_pgste(page)) {
1132 		gmap_disconnect_pgtable(mm, table);
1133 		table = (unsigned long *) (__pa(table) | FRAG_MASK);
1134 		tlb_remove_table(tlb, table);
1135 		return;
1136 	}
1137 	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
1138 	spin_lock_bh(&mm->context.list_lock);
1139 	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
1140 		list_del(&page->lru);
1141 	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
1142 	if (mask & FRAG_MASK)
1143 		list_add_tail(&page->lru, &mm->context.pgtable_list);
1144 	spin_unlock_bh(&mm->context.list_lock);
1145 	table = (unsigned long *) (__pa(table) | (bit << 4));
1146 	tlb_remove_table(tlb, table);
1147 }
1148 
1149 static void __tlb_remove_table(void *_table)
1150 {
1151 	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
1152 	void *table = (void *)((unsigned long) _table & ~mask);
1153 	unsigned type = (unsigned long) _table & mask;
1154 
1155 	if (type)
1156 		__page_table_free_rcu(table, type);
1157 	else
1158 		free_pages((unsigned long) table, ALLOC_ORDER);
1159 }
1160 
1161 static void tlb_remove_table_smp_sync(void *arg)
1162 {
1163 	/* Simply deliver the interrupt */
1164 }
1165 
1166 static void tlb_remove_table_one(void *table)
1167 {
1168 	/*
1169 	 * This isn't an RCU grace period and hence the page-tables cannot be
1170 	 * assumed to be actually RCU-freed.
1171 	 *
1172 	 * It is however sufficient for software page-table walkers that rely
1173 	 * on IRQ disabling. See the comment near struct mmu_table_batch.
1174 	 */
1175 	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
1176 	__tlb_remove_table(table);
1177 }
1178 
1179 static void tlb_remove_table_rcu(struct rcu_head *head)
1180 {
1181 	struct mmu_table_batch *batch;
1182 	int i;
1183 
1184 	batch = container_of(head, struct mmu_table_batch, rcu);
1185 
1186 	for (i = 0; i < batch->nr; i++)
1187 		__tlb_remove_table(batch->tables[i]);
1188 
1189 	free_page((unsigned long)batch);
1190 }
1191 
1192 void tlb_table_flush(struct mmu_gather *tlb)
1193 {
1194 	struct mmu_table_batch **batch = &tlb->batch;
1195 
1196 	if (*batch) {
1197 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
1198 		*batch = NULL;
1199 	}
1200 }
1201 
1202 void tlb_remove_table(struct mmu_gather *tlb, void *table)
1203 {
1204 	struct mmu_table_batch **batch = &tlb->batch;
1205 
1206 	tlb->mm->context.flush_mm = 1;
1207 	if (*batch == NULL) {
1208 		*batch = (struct mmu_table_batch *)
1209 			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
1210 		if (*batch == NULL) {
1211 			__tlb_flush_mm_lazy(tlb->mm);
1212 			tlb_remove_table_one(table);
1213 			return;
1214 		}
1215 		(*batch)->nr = 0;
1216 	}
1217 	(*batch)->tables[(*batch)->nr++] = table;
1218 	if ((*batch)->nr == MAX_TABLE_BATCH)
1219 		tlb_flush_mmu(tlb);
1220 }
1221 
1222 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1223 static inline void thp_split_vma(struct vm_area_struct *vma)
1224 {
1225 	unsigned long addr;
1226 
1227 	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
1228 		follow_page(vma, addr, FOLL_SPLIT);
1229 }
1230 
1231 static inline void thp_split_mm(struct mm_struct *mm)
1232 {
1233 	struct vm_area_struct *vma;
1234 
1235 	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
1236 		thp_split_vma(vma);
1237 		vma->vm_flags &= ~VM_HUGEPAGE;
1238 		vma->vm_flags |= VM_NOHUGEPAGE;
1239 	}
1240 	mm->def_flags |= VM_NOHUGEPAGE;
1241 }
1242 #else
1243 static inline void thp_split_mm(struct mm_struct *mm)
1244 {
1245 }
1246 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1247 
1248 static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
1249 				struct mm_struct *mm, pud_t *pud,
1250 				unsigned long addr, unsigned long end)
1251 {
1252 	unsigned long next, *table, *new;
1253 	struct page *page;
1254 	pmd_t *pmd;
1255 
1256 	pmd = pmd_offset(pud, addr);
1257 	do {
1258 		next = pmd_addr_end(addr, end);
1259 again:
1260 		if (pmd_none_or_clear_bad(pmd))
1261 			continue;
1262 		table = (unsigned long *) pmd_deref(*pmd);
1263 		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1264 		if (page_table_with_pgste(page))
1265 			continue;
1266 		/* Allocate new page table with pgstes */
1267 		new = page_table_alloc_pgste(mm, addr);
1268 		if (!new)
1269 			return -ENOMEM;
1270 
1271 		spin_lock(&mm->page_table_lock);
1272 		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
1273 			/* Nuke pmd entry pointing to the "short" page table */
1274 			pmdp_flush_lazy(mm, addr, pmd);
1275 			pmd_clear(pmd);
1276 			/* Copy ptes from old table to new table */
1277 			memcpy(new, table, PAGE_SIZE/2);
1278 			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
1279 			/* Establish new table */
1280 			pmd_populate(mm, pmd, (pte_t *) new);
1281 			/* Free old table with rcu, there might be a walker! */
1282 			page_table_free_rcu(tlb, table);
1283 			new = NULL;
1284 		}
1285 		spin_unlock(&mm->page_table_lock);
1286 		if (new) {
1287 			page_table_free_pgste(new);
1288 			goto again;
1289 		}
1290 	} while (pmd++, addr = next, addr != end);
1291 
1292 	return addr;
1293 }
1294 
1295 static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
1296 				   struct mm_struct *mm, pgd_t *pgd,
1297 				   unsigned long addr, unsigned long end)
1298 {
1299 	unsigned long next;
1300 	pud_t *pud;
1301 
1302 	pud = pud_offset(pgd, addr);
1303 	do {
1304 		next = pud_addr_end(addr, end);
1305 		if (pud_none_or_clear_bad(pud))
1306 			continue;
1307 		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
1308 		if (unlikely(IS_ERR_VALUE(next)))
1309 			return next;
1310 	} while (pud++, addr = next, addr != end);
1311 
1312 	return addr;
1313 }
1314 
1315 static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
1316 					unsigned long addr, unsigned long end)
1317 {
1318 	unsigned long next;
1319 	pgd_t *pgd;
1320 
1321 	pgd = pgd_offset(mm, addr);
1322 	do {
1323 		next = pgd_addr_end(addr, end);
1324 		if (pgd_none_or_clear_bad(pgd))
1325 			continue;
1326 		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
1327 		if (unlikely(IS_ERR_VALUE(next)))
1328 			return next;
1329 	} while (pgd++, addr = next, addr != end);
1330 
1331 	return 0;
1332 }
1333 
1334 /*
1335  * Switch on pgstes for the current userspace process (for kvm).
1336  */
1337 int s390_enable_sie(void)
1338 {
1339 	struct task_struct *tsk = current;
1340 	struct mm_struct *mm = tsk->mm;
1341 	struct mmu_gather tlb;
1342 
1343 	/* Do we have pgstes? If yes, we are done. */
1344 	if (mm_has_pgste(tsk->mm))
1345 		return 0;
1346 
1347 	down_write(&mm->mmap_sem);
1348 	/* split thp mappings and disable thp for future mappings */
1349 	thp_split_mm(mm);
1350 	/* Reallocate the page tables with pgstes */
1351 	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
1352 	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
1353 		mm->context.has_pgste = 1;
1354 	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
1355 	up_write(&mm->mmap_sem);
1356 	return mm->context.has_pgste ? 0 : -ENOMEM;
1357 }
1358 EXPORT_SYMBOL_GPL(s390_enable_sie);
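
/*
 * Illustrative sketch, not part of the original file (and assuming
 * CONFIG_PGSTE): a guest address space needs pgste-backed page tables,
 * so a hypervisor would enable them before allocating the gmap.
 * create_guest_space() is a hypothetical helper name.
 */
static __maybe_unused struct gmap *create_guest_space(void)
{
	if (s390_enable_sie())			/* reallocate tables with pgstes */
		return NULL;
	return gmap_alloc(current->mm);		/* then build the guest space */
}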
1359 
1360 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1361 int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
1362 			   pmd_t *pmdp)
1363 {
1364 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1365 	/* No need to flush the TLB: on s390 the reference bits are kept in
1366 	 * the storage key and never in the TLB. */
1367 	return pmdp_test_and_clear_young(vma, address, pmdp);
1368 }
1369 
1370 int pmdp_set_access_flags(struct vm_area_struct *vma,
1371 			  unsigned long address, pmd_t *pmdp,
1372 			  pmd_t entry, int dirty)
1373 {
1374 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1375 
1376 	if (pmd_same(*pmdp, entry))
1377 		return 0;
1378 	pmdp_invalidate(vma, address, pmdp);
1379 	set_pmd_at(vma->vm_mm, address, pmdp, entry);
1380 	return 1;
1381 }
1382 
1383 static void pmdp_splitting_flush_sync(void *arg)
1384 {
1385 	/* Simply deliver the interrupt */
1386 }
1387 
1388 void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
1389 			  pmd_t *pmdp)
1390 {
1391 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1392 	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
1393 			      (unsigned long *) pmdp)) {
1394 		/* need to serialize against gup-fast (IRQ disabled) */
1395 		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
1396 	}
1397 }
1398 
1399 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1400 				pgtable_t pgtable)
1401 {
1402 	struct list_head *lh = (struct list_head *) pgtable;
1403 
1404 	assert_spin_locked(pmd_lockptr(mm, pmdp));
1405 
1406 	/* FIFO */
1407 	if (!pmd_huge_pte(mm, pmdp))
1408 		INIT_LIST_HEAD(lh);
1409 	else
1410 		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1411 	pmd_huge_pte(mm, pmdp) = pgtable;
1412 }
1413 
1414 pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1415 {
1416 	struct list_head *lh;
1417 	pgtable_t pgtable;
1418 	pte_t *ptep;
1419 
1420 	assert_spin_locked(pmd_lockptr(mm, pmdp));
1421 
1422 	/* FIFO */
1423 	pgtable = pmd_huge_pte(mm, pmdp);
1424 	lh = (struct list_head *) pgtable;
1425 	if (list_empty(lh))
1426 		pmd_huge_pte(mm, pmdp) = NULL;
1427 	else {
1428 		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1429 		list_del(lh);
1430 	}
1431 	ptep = (pte_t *) pgtable;
1432 	pte_val(*ptep) = _PAGE_INVALID;
1433 	ptep++;
1434 	pte_val(*ptep) = _PAGE_INVALID;
1435 	return pgtable;
1436 }
1437 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1438