#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	/* Limit batching if we have delayed rmaps pending */
	if (tlb->delayed_rmap && tlb->active != &tlb->local)
		return false;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr = 0;
	batch->max = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

#ifdef CONFIG_SMP
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
	for (int i = 0; i < batch->nr; i++) {
		struct encoded_page *enc = batch->encoded_pages[i];

		if (encoded_page_flags(enc)) {
			struct page *page = encoded_page_ptr(enc);
			page_remove_rmap(page, vma, false);
		}
	}
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: the memory area the pages belong to
 *
 * Note that because of how tlb_next_batch() above works, we will
 * never start multiple new batches with pending delayed rmaps, so
 * we only need to walk through the current active batch and the
 * original local one.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	if (!tlb->delayed_rmap)
		return;

	tlb_flush_rmap_batch(&tlb->local, vma);
	if (tlb->active != &tlb->local)
		tlb_flush_rmap_batch(tlb->active, vma);
	tlb->delayed_rmap = 0;
}
#endif

static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		struct encoded_page **pages = batch->encoded_pages;

		do {
			/*
			 * limit free batch count when PAGE_SIZE > 4K
			 */
			unsigned int nr = min(512U, batch->nr);

			free_pages_and_swap_cache(pages, nr);
			pages += nr;
			batch->nr -= nr;

			cond_resched();
		} while (batch->nr);
	}
	tlb->active = &tlb->local;
}

static void tlb_batch_list_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch, *next;

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	VM_WARN_ON(tlb->page_size != page_size);
#endif

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	batch->encoded_pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));

	return false;
}
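
/*
 * Illustrative sketch (not part of this file): callers are expected to treat
 * a "true" return from __tlb_remove_page_size() as "the batches are full,
 * flush now". Roughly what a wrapper along the lines of the generic
 * tlb_remove_page_size() helper does; the name example_remove_page() is made
 * up for illustration:
 *
 *	static inline void example_remove_page(struct mmu_gather *tlb,
 *					       struct page *page, int page_size)
 *	{
 *		// encode_page(page, 0): no delayed-rmap flag set
 *		if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size))
 *			tlb_flush_mmu(tlb);
 *	}
 */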
132 */ 133 batch->encoded_pages[batch->nr++] = page; 134 if (batch->nr == batch->max) { 135 if (!tlb_next_batch(tlb)) 136 return true; 137 batch = tlb->active; 138 } 139 VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); 140 141 return false; 142 } 143 144 #endif /* MMU_GATHER_NO_GATHER */ 145 146 #ifdef CONFIG_MMU_GATHER_TABLE_FREE 147 148 static void __tlb_remove_table_free(struct mmu_table_batch *batch) 149 { 150 int i; 151 152 for (i = 0; i < batch->nr; i++) 153 __tlb_remove_table(batch->tables[i]); 154 155 free_page((unsigned long)batch); 156 } 157 158 #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE 159 160 /* 161 * Semi RCU freeing of the page directories. 162 * 163 * This is needed by some architectures to implement software pagetable walkers. 164 * 165 * gup_fast() and other software pagetable walkers do a lockless page-table 166 * walk and therefore needs some synchronization with the freeing of the page 167 * directories. The chosen means to accomplish that is by disabling IRQs over 168 * the walk. 169 * 170 * Architectures that use IPIs to flush TLBs will then automagically DTRT, 171 * since we unlink the page, flush TLBs, free the page. Since the disabling of 172 * IRQs delays the completion of the TLB flush we can never observe an already 173 * freed page. 174 * 175 * Architectures that do not have this (PPC) need to delay the freeing by some 176 * other means, this is that means. 177 * 178 * What we do is batch the freed directory pages (tables) and RCU free them. 179 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling 180 * holds off grace periods. 181 * 182 * However, in order to batch these pages we need to allocate storage, this 183 * allocation is deep inside the MM code and can thus easily fail on memory 184 * pressure. To guarantee progress we fall back to single table freeing, see 185 * the implementation of tlb_remove_table_one(). 186 * 187 */ 188 189 static void tlb_remove_table_smp_sync(void *arg) 190 { 191 /* Simply deliver the interrupt */ 192 } 193 194 void tlb_remove_table_sync_one(void) 195 { 196 /* 197 * This isn't an RCU grace period and hence the page-tables cannot be 198 * assumed to be actually RCU-freed. 199 * 200 * It is however sufficient for software page-table walkers that rely on 201 * IRQ disabling. 202 */ 203 smp_call_function(tlb_remove_table_smp_sync, NULL, 1); 204 } 205 206 static void tlb_remove_table_rcu(struct rcu_head *head) 207 { 208 __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); 209 } 210 211 static void tlb_remove_table_free(struct mmu_table_batch *batch) 212 { 213 call_rcu(&batch->rcu, tlb_remove_table_rcu); 214 } 215 216 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ 217 218 static void tlb_remove_table_free(struct mmu_table_batch *batch) 219 { 220 __tlb_remove_table_free(batch); 221 } 222 223 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ 224 225 /* 226 * If we want tlb_remove_table() to imply TLB invalidates. 227 */ 228 static inline void tlb_table_invalidate(struct mmu_gather *tlb) 229 { 230 if (tlb_needs_table_invalidate()) { 231 /* 232 * Invalidate page-table caches used by hardware walkers. Then 233 * we still need to RCU-sched wait while freeing the pages 234 * because software walkers can still be in-flight. 
235 */ 236 tlb_flush_mmu_tlbonly(tlb); 237 } 238 } 239 240 static void tlb_remove_table_one(void *table) 241 { 242 tlb_remove_table_sync_one(); 243 __tlb_remove_table(table); 244 } 245 246 static void tlb_table_flush(struct mmu_gather *tlb) 247 { 248 struct mmu_table_batch **batch = &tlb->batch; 249 250 if (*batch) { 251 tlb_table_invalidate(tlb); 252 tlb_remove_table_free(*batch); 253 *batch = NULL; 254 } 255 } 256 257 void tlb_remove_table(struct mmu_gather *tlb, void *table) 258 { 259 struct mmu_table_batch **batch = &tlb->batch; 260 261 if (*batch == NULL) { 262 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); 263 if (*batch == NULL) { 264 tlb_table_invalidate(tlb); 265 tlb_remove_table_one(table); 266 return; 267 } 268 (*batch)->nr = 0; 269 } 270 271 (*batch)->tables[(*batch)->nr++] = table; 272 if ((*batch)->nr == MAX_TABLE_BATCH) 273 tlb_table_flush(tlb); 274 } 275 276 static inline void tlb_table_init(struct mmu_gather *tlb) 277 { 278 tlb->batch = NULL; 279 } 280 281 #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ 282 283 static inline void tlb_table_flush(struct mmu_gather *tlb) { } 284 static inline void tlb_table_init(struct mmu_gather *tlb) { } 285 286 #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ 287 288 static void tlb_flush_mmu_free(struct mmu_gather *tlb) 289 { 290 tlb_table_flush(tlb); 291 #ifndef CONFIG_MMU_GATHER_NO_GATHER 292 tlb_batch_pages_flush(tlb); 293 #endif 294 } 295 296 void tlb_flush_mmu(struct mmu_gather *tlb) 297 { 298 tlb_flush_mmu_tlbonly(tlb); 299 tlb_flush_mmu_free(tlb); 300 } 301 302 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, 303 bool fullmm) 304 { 305 tlb->mm = mm; 306 tlb->fullmm = fullmm; 307 308 #ifndef CONFIG_MMU_GATHER_NO_GATHER 309 tlb->need_flush_all = 0; 310 tlb->local.next = NULL; 311 tlb->local.nr = 0; 312 tlb->local.max = ARRAY_SIZE(tlb->__pages); 313 tlb->active = &tlb->local; 314 tlb->batch_count = 0; 315 #endif 316 tlb->delayed_rmap = 0; 317 318 tlb_table_init(tlb); 319 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE 320 tlb->page_size = 0; 321 #endif 322 323 __tlb_reset_range(tlb); 324 inc_tlb_flush_pending(tlb->mm); 325 } 326 327 /** 328 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down 329 * @tlb: the mmu_gather structure to initialize 330 * @mm: the mm_struct of the target address space 331 * 332 * Called to initialize an (on-stack) mmu_gather structure for page-table 333 * tear-down from @mm. 334 */ 335 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) 336 { 337 __tlb_gather_mmu(tlb, mm, false); 338 } 339 340 /** 341 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down 342 * @tlb: the mmu_gather structure to initialize 343 * @mm: the mm_struct of the target address space 344 * 345 * In this case, @mm is without users and we're going to destroy the 346 * full address space (exit/execve). 347 * 348 * Called to initialize an (on-stack) mmu_gather structure for page-table 349 * tear-down from @mm. 350 */ 351 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) 352 { 353 __tlb_gather_mmu(tlb, mm, true); 354 } 355 356 /** 357 * tlb_finish_mmu - finish an mmu_gather structure 358 * @tlb: the mmu_gather structure to finish 359 * 360 * Called at the end of the shootdown operation to free up any resources that 361 * were required. 
362 */ 363 void tlb_finish_mmu(struct mmu_gather *tlb) 364 { 365 /* 366 * If there are parallel threads are doing PTE changes on same range 367 * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB 368 * flush by batching, one thread may end up seeing inconsistent PTEs 369 * and result in having stale TLB entries. So flush TLB forcefully 370 * if we detect parallel PTE batching threads. 371 * 372 * However, some syscalls, e.g. munmap(), may free page tables, this 373 * needs force flush everything in the given range. Otherwise this 374 * may result in having stale TLB entries for some architectures, 375 * e.g. aarch64, that could specify flush what level TLB. 376 */ 377 if (mm_tlb_flush_nested(tlb->mm)) { 378 /* 379 * The aarch64 yields better performance with fullmm by 380 * avoiding multiple CPUs spamming TLBI messages at the 381 * same time. 382 * 383 * On x86 non-fullmm doesn't yield significant difference 384 * against fullmm. 385 */ 386 tlb->fullmm = 1; 387 __tlb_reset_range(tlb); 388 tlb->freed_tables = 1; 389 } 390 391 tlb_flush_mmu(tlb); 392 393 #ifndef CONFIG_MMU_GATHER_NO_GATHER 394 tlb_batch_list_free(tlb); 395 #endif 396 dec_tlb_flush_pending(tlb->mm); 397 } 398