xref: /openbmc/linux/mm/mmu_gather.c (revision 85716a80)
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

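/*
 * Grab the next free batch of page pointers, extending the gather's batch
 * list if needed. Returns false when batching must stop: delayed rmaps are
 * pending, MAX_GATHER_BATCH_COUNT has been reached, or the (non-blocking)
 * allocation of a new batch page failed.
 */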
static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	/* No more batching if we have delayed rmaps pending */
	if (tlb->delayed_rmap)
		return false;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

#ifdef CONFIG_SMP
/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: the memory area from which the gathered pages were unmapped
 *
 * Note that because of how tlb_next_batch() above works, we will
 * never start new batches with pending delayed rmaps, so we only
 * need to walk through the current active batch.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	struct mmu_gather_batch *batch;

	if (!tlb->delayed_rmap)
		return;

	batch = tlb->active;
	for (int i = 0; i < batch->nr; i++) {
		struct encoded_page *enc = batch->encoded_pages[i];

		if (encoded_page_flags(enc)) {
			struct page *page = encoded_page_ptr(enc);
			page_remove_rmap(page, vma, false);
		}
	}

	tlb->delayed_rmap = 0;
}
#endif

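/*
 * Release all pages queued in the gather's batches: drop them from the
 * swap cache and free them, in chunks with cond_resched() in between so
 * that large batches don't hog the CPU. Resets the active batch to the
 * local one; the extra batch pages themselves are freed later by
 * tlb_batch_list_free().
 */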
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		struct encoded_page **pages = batch->encoded_pages;

		do {
			/*
			 * limit free batch count when PAGE_SIZE > 4K
			 */
			unsigned int nr = min(512U, batch->nr);

			free_pages_and_swap_cache(pages, nr);
			pages += nr;
			batch->nr -= nr;

			cond_resched();
		} while (batch->nr);
	}
	tlb->active = &tlb->local;
}

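/*
 * Free the extra batch pages that were chained onto the local batch by
 * tlb_next_batch(). Called once the gathered pages themselves have been
 * released.
 */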
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch, *next;

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

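/*
 * Queue an (encoded) page for deferred freeing. Returns true when the
 * batches are full and no further batch could be obtained, signalling that
 * the caller should flush (tlb_flush_mmu()) before queueing more pages;
 * returns false while there is still room.
 */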
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	VM_WARN_ON(tlb->page_size != page_size);
#endif

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so,
	 * force a flush.
	 */
	batch->encoded_pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));

	return false;
}

#endif /* CONFIG_MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

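/*
 * Free every page-table page collected in a table batch, then free the
 * batch page itself. __tlb_remove_table() is the architecture-provided
 * hook that actually releases a table page.
 */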
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
	int i;

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, then free the page. Because the
 * disabling of IRQs delays the completion of the TLB flush, we can never
 * observe an already freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by
 * some other means; this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage; this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void tlb_remove_table_sync_one(void)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
}

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Issue the TLB invalidate that tlb_remove_table() is expected to imply on
 * architectures that need it; see tlb_needs_table_invalidate().
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (tlb_needs_table_invalidate()) {
		/*
		 * Invalidate page-table caches used by hardware walkers. Then
		 * we still need to RCU-sched wait while freeing the pages
		 * because software walkers can still be in-flight.
		 */
		tlb_flush_mmu_tlbonly(tlb);
	}
}

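/*
 * Fallback used when no batch storage could be allocated: synchronize with
 * concurrent lockless page-table walkers (which rely on IRQ disabling) via
 * an IPI broadcast, then free the single table immediately.
 */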
static void tlb_remove_table_one(void *table)
{
	tlb_remove_table_sync_one();
	__tlb_remove_table(table);
}

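/*
 * Flush the pending table batch: issue the table invalidate and hand the
 * batch off for (possibly RCU-deferred) freeing.
 */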
static void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		tlb_table_invalidate(tlb);
		tlb_remove_table_free(*batch);
		*batch = NULL;
	}
}

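/*
 * Queue a page-table page for deferred freeing. If no batch page can be
 * allocated (GFP_NOWAIT), fall back to freeing the single table right away
 * via tlb_remove_table_one(); a full batch is flushed immediately.
 */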
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

static inline void tlb_table_init(struct mmu_gather *tlb)
{
	tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

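/*
 * Free everything queued on this gather: pending page-table batches first,
 * then the gathered pages themselves.
 */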
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}

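/*
 * Flush the TLB for the gathered range, then release all queued pages and
 * page tables.
 */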
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

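/*
 * Common initialization for tlb_gather_mmu() and tlb_gather_mmu_fullmm():
 * reset all batching state, clear the flush range and mark a TLB flush as
 * pending on the mm.
 */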
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			     bool fullmm)
{
	tlb->mm = mm;
	tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;
#endif
	tlb->delayed_rmap = 0;

	tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	tlb->page_size = 0;
#endif

	__tlb_reset_range(tlb);
	inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
	/*
	 * If parallel threads are doing PTE changes on the same range under
	 * a non-exclusive lock (e.g., mmap_lock read-side) but defer the TLB
	 * flush by batching, one thread may end up seeing inconsistent PTEs
	 * and be left with stale TLB entries.  So flush the TLB forcefully
	 * if we detect parallel PTE batching threads.
	 *
	 * However, some syscalls, e.g. munmap(), may free page tables; this
	 * requires a forced flush of everything in the given range. Otherwise
	 * stale TLB entries may remain on architectures, e.g. aarch64, that
	 * can restrict the flush to a given page-table level.
	 */
	if (mm_tlb_flush_nested(tlb->mm)) {
		/*
		 * aarch64 yields better performance with fullmm by avoiding
		 * multiple CPUs spamming TLBI messages at the same time.
		 *
		 * On x86, non-fullmm doesn't yield a significant difference
		 * against fullmm.
		 */
		tlb->fullmm = 1;
		__tlb_reset_range(tlb);
		tlb->freed_tables = 1;
	}

	tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_list_free(tlb);
#endif
	dec_tlb_flush_pending(tlb->mm);
}
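
/*
 * A minimal sketch of the typical lifecycle, for reference only; the real
 * callers live in mm/memory.c and elsewhere, and the zap step below is
 * condensed:
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm);
 *	...zap PTEs, queueing pages with __tlb_remove_page_size() and page
 *	   tables with tlb_remove_table(), calling tlb_flush_mmu() whenever
 *	   the gather reports it is full...
 *	tlb_finish_mmu(&tlb);
 */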
394