#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr = 0;
	batch->max = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		struct page **pages = batch->pages;

		do {
			/*
			 * limit free batch count when PAGE_SIZE > 4K
			 */
			unsigned int nr = min(512U, batch->nr);

			free_pages_and_swap_cache(pages, nr);
			pages += nr;
			batch->nr -= nr;

			cond_resched();
		} while (batch->nr);
	}
	tlb->active = &tlb->local;
}

static void tlb_batch_list_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch, *next;

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	VM_WARN_ON(tlb->page_size != page_size);
#endif

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return false;
}

#endif /* CONFIG_MMU_GATHER_NO_GATHER */
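/*
 * Illustrative sketch only, not the actual implementation of zap_pte_range():
 * roughly how an unmap path is expected to feed pages into the batching above,
 * using the generic helpers from <asm-generic/tlb.h>. Locking, force_flush
 * handling and dirty/accessed accounting are omitted for brevity:
 *
 *	pte_t ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
 *	struct page *page = vm_normal_page(vma, addr, ptent);
 *
 *	tlb_remove_tlb_entry(tlb, pte, addr);	// record range to invalidate
 *	if (__tlb_remove_page(tlb, page))	// batch full, no next batch
 *		tlb_flush_mmu(tlb);		// flush TLBs, free gathered pages
 */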
#ifdef CONFIG_MMU_GATHER_TABLE_FREE

static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
	int i;

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means; this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage; this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_sync_one(void)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
}

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

static void tlb_remove_table_sync_one(void) { }

static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
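/*
 * The walker-side half of the contract described above, as a simplified
 * sketch loosely modeled on gup_fast(): the lockless walk runs with IRQs
 * disabled, which holds off both the IPI broadcast issued by
 * tlb_remove_table_sync_one() and an RCU-sched grace period. Once either of
 * those has completed, no such walker can still be dereferencing the table
 * being freed. (walk_pgd_range_lockless() is a made-up stand-in for the
 * actual walk.)
 *
 *	unsigned long flags;
 *
 *	local_irq_save(flags);
 *	walk_pgd_range_lockless(mm, start, end, pages);
 *	local_irq_restore(flags);
 */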
/*
 * If we want tlb_remove_table() to imply TLB invalidates.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (tlb_needs_table_invalidate()) {
		/*
		 * Invalidate page-table caches used by hardware walkers. Then
		 * we still need to RCU-sched wait while freeing the pages
		 * because software walkers can still be in-flight.
		 */
		tlb_flush_mmu_tlbonly(tlb);
	}
}

static void tlb_remove_table_one(void *table)
{
	tlb_remove_table_sync_one();
	__tlb_remove_table(table);
}

static void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		tlb_table_invalidate(tlb);
		tlb_remove_table_free(*batch);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

static inline void tlb_table_init(struct mmu_gather *tlb)
{
	tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
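/*
 * Rough sketch of how an architecture is expected to hand a page-table page
 * to the batching above from its __pte_free_tlb() hook; the exact helper
 * names and the dtor call vary per architecture and kernel version, and any
 * paravirt indirection is omitted. Assumes pgtable_t is a struct page
 * pointer, as on x86:
 *
 *	static inline void arch_pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte)
 *	{
 *		pgtable_pte_page_dtor(pte);	// undo page-table ctor accounting
 *		tlb_remove_table(tlb, pte);	// __tlb_remove_table() frees it later
 *	}
 */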
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			     bool fullmm)
{
	tlb->mm = mm;
	tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr = 0;
	tlb->local.max = ARRAY_SIZE(tlb->__pages);
	tlb->active = &tlb->local;
	tlb->batch_count = 0;
#endif

	tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	tlb->page_size = 0;
#endif

	__tlb_reset_range(tlb);
	inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
	/*
	 * If parallel threads are doing PTE changes on the same range under a
	 * non-exclusive lock (e.g., mmap_lock read-side) but defer the TLB
	 * flush by batching, one thread may end up seeing inconsistent PTEs
	 * and be left with stale TLB entries. So flush the TLB forcefully if
	 * we detect parallel PTE batching threads.
	 *
	 * However, some syscalls, e.g. munmap(), may free page tables; this
	 * requires a forced flush of everything in the given range. Otherwise
	 * we may end up with stale TLB entries on architectures, e.g. arm64,
	 * that can restrict the flush to a given page-table level.
	 */
	if (mm_tlb_flush_nested(tlb->mm)) {
		/*
		 * arm64 yields better performance with fullmm by avoiding
		 * multiple CPUs spamming TLBI messages at the same time.
		 *
		 * On x86, non-fullmm doesn't yield a significant difference
		 * against fullmm.
		 */
		tlb->fullmm = 1;
		__tlb_reset_range(tlb);
		tlb->freed_tables = 1;
	}

	tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_list_free(tlb);
#endif
	dec_tlb_flush_pending(tlb->mm);
}
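/*
 * Typical lifetime of a gather, as a simplified usage sketch (modeled on the
 * munmap()/unmap_region() style of caller; mmap_lock, VMA iteration and the
 * exact caller signatures, which vary across kernel versions, are omitted):
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm);			// start batching for this mm
 *	unmap_vmas(&tlb, vma, start, end);		// clear PTEs, gather pages
 *	free_pgtables(&tlb, vma, floor, ceiling);	// gather page-table pages
 *	tlb_finish_mmu(&tlb);				// flush TLBs, free everything
 */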