xref: /openbmc/linux/mm/mmu_gather.c (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

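/*
 * Grab the next batch page for the gather, allocating a fresh one if
 * needed. Returns false when batching has to stop: rmap removals are
 * still pending, the batch count limit was hit, or the allocation failed.
 */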
static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	/* Limit batching if we have delayed rmaps pending */
	if (tlb->delayed_rmap && tlb->active != &tlb->local)
		return false;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

#ifdef CONFIG_SMP
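/*
 * Walk one gather batch and finish the rmap removal for every page that
 * was queued with a delayed rmap (flagged in the encoded page pointer).
 */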
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
	for (int i = 0; i < batch->nr; i++) {
		struct encoded_page *enc = batch->encoded_pages[i];

		if (encoded_page_flags(enc)) {
			struct page *page = encoded_page_ptr(enc);
			page_remove_rmap(page, vma, false);
		}
	}
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: The memory area from which the pages are being removed.
 *
 * Note that because of how tlb_next_batch() above works, we will
 * never start multiple new batches with pending delayed rmaps, so
 * we only need to walk through the current active batch and the
 * original local one.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	if (!tlb->delayed_rmap)
		return;

	tlb_flush_rmap_batch(&tlb->local, vma);
	if (tlb->active != &tlb->local)
		tlb_flush_rmap_batch(tlb->active, vma);
	tlb->delayed_rmap = 0;
}
#endif

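/*
 * Release all pages queued in the gather batches (together with their
 * swap-cache entries), then reset the gather to the on-stack local batch.
 */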
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		struct encoded_page **pages = batch->encoded_pages;

		do {
			/*
			 * limit free batch count when PAGE_SIZE > 4K
			 */
			unsigned int nr = min(512U, batch->nr);

			free_pages_and_swap_cache(pages, nr);
			pages += nr;
			batch->nr -= nr;

			cond_resched();
		} while (batch->nr);
	}
	tlb->active = &tlb->local;
}

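/*
 * Free the dynamically allocated batch pages themselves; the local
 * batch is embedded in the mmu_gather and is not freed here.
 */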
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch, *next;

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

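/*
 * Queue a page for deferred freeing. Returns true when the batches are
 * full and the caller must flush before queueing any further pages.
 */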
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	VM_WARN_ON(tlb->page_size != page_size);
#endif

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	batch->encoded_pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));

	return false;
}

#endif /* MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

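/* Hand each gathered page-table page back to the architecture, then free the batch itself. */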
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
	int i;

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means, and this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, and this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void tlb_remove_table_sync_one(void)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

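/* RCU callback: by now all lockless walkers that relied on IRQ disabling are done. */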
static void tlb_remove_table_rcu(struct rcu_head *head)
{
	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
}

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * If we want tlb_remove_table() to imply TLB invalidates.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (tlb_needs_table_invalidate()) {
		/*
		 * Invalidate page-table caches used by hardware walkers. Then
		 * we still need to RCU-sched wait while freeing the pages
		 * because software walkers can still be in-flight.
		 */
		tlb_flush_mmu_tlbonly(tlb);
	}
}

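/*
 * Fallback for when no batch storage could be allocated: synchronize with
 * concurrent software walkers, then free the single table directly.
 */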
static void tlb_remove_table_one(void *table)
{
	tlb_remove_table_sync_one();
	__tlb_remove_table(table);
}

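/* Invalidate and free any page-table batch accumulated so far. */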
static void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		tlb_table_invalidate(tlb);
		tlb_remove_table_free(*batch);
		*batch = NULL;
	}
}

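/*
 * Queue a page-table page for deferred freeing, allocating batch storage
 * on first use and flushing once the batch fills up.
 */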
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

static inline void tlb_table_init(struct mmu_gather *tlb)
{
	tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

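/* Free everything that was batched: page-table pages first, then ordinary pages. */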
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

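/* Common initialization for tlb_gather_mmu() and tlb_gather_mmu_fullmm(). */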
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			     bool fullmm)
{
	tlb->mm = mm;
	tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;
#endif
	tlb->delayed_rmap = 0;

	tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	tlb->page_size = 0;
#endif

	__tlb_reset_range(tlb);
	inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
	/*
	 * If parallel threads are doing PTE changes on the same range under a
	 * non-exclusive lock (e.g., mmap_lock read-side) but defer the TLB
	 * flush by batching, one thread may end up seeing inconsistent PTEs
	 * and be left with stale TLB entries.  So flush the TLB forcefully
	 * if we detect parallel PTE batching threads.
	 *
	 * However, some syscalls, e.g. munmap(), may free page tables; this
	 * needs to force flush everything in the given range. Otherwise we
	 * may be left with stale TLB entries on some architectures, e.g.
	 * aarch64, that can specify which level of the TLB to flush.
	 */
	if (mm_tlb_flush_nested(tlb->mm)) {
		/*
		 * aarch64 yields better performance with fullmm by avoiding
		 * multiple CPUs spamming TLBI messages at the same time.
		 *
		 * On x86, non-fullmm doesn't yield a significant difference
		 * compared to fullmm.
		 */
		tlb->fullmm = 1;
		__tlb_reset_range(tlb);
		tlb->freed_tables = 1;
	}

	tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_list_free(tlb);
#endif
	dec_tlb_flush_pending(tlb->mm);
}
399