#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();

	switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * NB: The scheduler will call us with prev == next when
	 * switching from lazy TLB mode to normal mode if active_mm
	 * isn't changing.  When this happens, there is no guarantee
	 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);

	if (real_prev == next) {
		/*
		 * There's nothing to do: we always keep the per-mm control
		 * regs in sync with cpu_tlbstate.loaded_mm.  Just
		 * sanity-check mm_cpumask.
		 */
		if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));
		return;
	}

	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
		/*
		 * If our current stack is in vmalloc space and isn't
		 * mapped in the new pgd, we'll double-fault.  Forcibly
		 * map it.
		 */
		unsigned int stack_pgd_index = pgd_index(current_stack_pointer());

		pgd_t *pgd = next->pgd + stack_pgd_index;

		if (unlikely(pgd_none(*pgd)))
			set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
	}

	this_cpu_write(cpu_tlbstate.loaded_mm, next);

	WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
	cpumask_set_cpu(cpu, mm_cpumask(next));

	/*
	 * Re-load page tables.
	 *
	 * This logic has an ordering constraint:
	 *
	 *  CPU 0: Write to a PTE for 'next'
	 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
	 *  CPU 1: set bit 1 in next's mm_cpumask
	 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
	 *
	 * We need to prevent an outcome in which CPU 1 observes
	 * the new PTE value and CPU 0 observes bit 1 clear in
	 * mm_cpumask.  (If that occurs, then the IPI will never
	 * be sent, and CPU 0's TLB will contain a stale entry.)
	 *
	 * The bad outcome can occur if either CPU's load is
	 * reordered before that CPU's store, so both CPUs must
	 * execute full barriers to prevent this from happening.
	 *
	 * Thus, switch_mm needs a full barrier between the
	 * store to mm_cpumask and any operation that could load
	 * from next->pgd.  TLB fills are special and can happen
	 * due to instruction fetches or for no reason at all,
	 * and neither LOCK nor MFENCE orders them.
	 * Fortunately, load_cr3() is serializing and gives the
	 * ordering guarantee we need.
	 */
	load_cr3(next->pgd);

	/*
	 * This gets called via leave_mm() in the idle path where RCU
	 * functions differently.  Tracing normally uses RCU, so we have to
	 * call the tracepoint specially here.
	 */
	trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

	/* Stop flush ipis for the previous mm */
	WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
		     real_prev != &init_mm);
	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

	/* Load per-mm CR4 and LDTR state */
	load_mm_cr4(next);
	switch_ldt(real_prev, next);
}

static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
{
	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
		leave_mm(smp_processor_id());
		return;
	}

	if (f->end == TLB_FLUSH_ALL) {
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
	} else {
		unsigned long addr;
		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

		addr = f->start;
		while (addr < f->end) {
			__flush_tlb_single(addr);
			addr += PAGE_SIZE;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
		trace_tlb_flush(reason, nr_pages);
	}
}

static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
	const struct flush_tlb_info *f = info;

	flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func_remote,
					       (void *)info, 1);
		return;
	}
	smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned long vmflag)
{
	int cpu;

	struct flush_tlb_info info = {
		.mm = mm,
	};

	cpu = get_cpu();

	/* Synchronize with switch_mm. */
	smp_mb();

	/* Should we flush just the requested range? */
	if ((end != TLB_FLUSH_ALL) &&
	    !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info.start = start;
		info.end = end;
	} else {
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
	}

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}

static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
		leave_mm(smp_processor_id());
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* Flush the range one page at a time with 'invlpg'. */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_single(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Use the same single-page-flush heuristic as user space flushes, a bit conservatively. */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info info;

		info.start = start;
		info.end = end;
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &info);

	cpumask_clear(&batch->cpumask);

	put_cpu();
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
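
/*
 * Usage sketch for the debugfs knob created above (not compiled; the shell
 * commands assume debugfs is mounted at /sys/kernel/debug, so the file
 * registered under arch_debugfs_dir appears as
 * /sys/kernel/debug/x86/tlb_single_page_flush_ceiling):
 *
 *	# cat /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *	33
 *	# echo 64 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *
 * Ranges of at most this many pages are flushed with per-page 'invlpg';
 * larger ranges fall back to a full TLB flush.  See
 * Documentation/x86/tlb.txt for guidance on tuning the value.
 */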