#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 * TLB flushing, formerly SMP-only
 *	c/o Linus Torvalds.
 *
 * These mean you can really definitely utterly forget about
 * writing to user space from interrupts. (It's not allowed anyway).
 *
 * Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 * More scalable flush, from Andi Kleen
 *
 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 asid;

	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/*
	 * We don't currently own an ASID slot on this CPU.
	 * Allocate a slot.
	 */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	/* Warn if we're not lazy. */
	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));

	switch_mm(NULL, &init_mm, NULL);
}

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is. This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 * Architecturally, the CPU could prefetch something
		 * back into an incorrect ASID slot and leave it there
		 * to cause trouble down the road. It's better than
		 * nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif

	if (real_prev == next) {
		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			  next->context.ctx_id);

		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
			/*
			 * There's nothing to do: we weren't lazy, and we
			 * aren't changing our mm. We don't need to flush
			 * anything, nor do we need to update CR3, CR4, or
			 * LDTR.
			 */
			return;
		}

		/* Resume remote flushes and then read tlb_gen. */
		cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
		    next_tlb_gen) {
			/*
			 * Ideally, we'd have a flush_tlb() variant that
			 * takes the known CR3 value as input. This would
			 * be faster on Xen PV and on hypothetical CPUs
			 * on which INVPCID is fast.
			 */
			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
				       next_tlb_gen);
			write_cr3(build_cr3(next, prev_asid));
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
					TLB_FLUSH_ALL);
		}

		/*
		 * We just exited lazy mode, which means that CR4 and/or LDTR
		 * may be stale. (Changes to the required CR4 and LDTR states
		 * are not reflected in tlb_gen.)
		 */
	} else {
		u16 new_asid;
		bool need_flush;

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault. Forcibly
			 * map it.
			 */
			unsigned int index = pgd_index(current_stack_pointer);
			pgd_t *pgd = next->pgd + index;

			if (unlikely(pgd_none(*pgd)))
				set_pgd(pgd, init_mm.pgd[index]);
		}

		/* Stop remote flushes for the previous mm */
		if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

		VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		if (need_flush) {
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
			write_cr3(build_cr3(next, new_asid));
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
					TLB_FLUSH_ALL);
		} else {
			/* The new ASID is already up to date. */
			write_cr3(build_cr3_noflush(next, new_asid));
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
		}

		this_cpu_write(cpu_tlbstate.loaded_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
	}

	load_mm_cr4(next);
	switch_ldt(real_prev, next);
}
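
/*
 * Summary sketch (descriptive only) of the per-CPU bookkeeping used above.
 * The field names are the ones referenced in this file; see struct
 * tlb_state in <asm/tlbflush.h> for the full definition:
 *
 *	cpu_tlbstate.loaded_mm       - the mm currently loaded into CR3
 *	cpu_tlbstate.loaded_mm_asid  - the dynamic ASID slot CR3 is using
 *	cpu_tlbstate.next_asid       - round-robin cursor used by
 *	                               choose_new_asid() when no slot matches
 *	cpu_tlbstate.ctxs[asid]      - per-slot {ctx_id, tlb_gen} ownership
 */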

/*
 * Call this when reinitializing a CPU. It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear. CPU hotplug can do this.)
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/*
	 * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Force ASID 0 and force a TLB flush. */
	write_cr3(build_cr3(mm, 0));

	/* Reinitialize tlbstate. */
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen. We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
{
	/*
	 * We have three different tlb_gen values in here. They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);

	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);

	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
		/*
		 * We're in lazy mode -- don't flush. We can get here on
		 * remote flushes due to races and on local flushes if a
		 * kernel thread coincidentally flushes the mm it's lazily
		 * still using.
		 */
		return;
	}

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date. This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
		trace_tlb_flush(reason, 0);
		return;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen. If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes. The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2. The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3. If they get
	 *    processed on this CPU in reverse order, we'll see
	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
	 *    3, we'd break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date. By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
		unsigned long addr;
		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

		addr = f->start;
		while (addr < f->end) {
			__flush_tlb_single(addr);
			addr += PAGE_SIZE;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
		trace_tlb_flush(reason, nr_pages);
	} else {
		/* Full flush. */
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
	}

	/* Both paths above update our state to mm_tlb_gen. */
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}

static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
	const struct flush_tlb_info *f = info;

	flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		/*
		 * This whole special case is confused. UV has a "Broadcast
		 * Assist Unit", which seems to be a fancy way to send IPIs.
		 * Back when x86 used an explicit TLB flush IPI, UV was
		 * optimized to use its own mechanism. These days, x86 uses
		 * smp_call_function_many(), but UV still uses a manual IPI,
		 * and that IPI's action is out of date -- it does a manual
		 * flush instead of calling flush_tlb_func_remote(). This
		 * means that the percpu tlb_gen variables won't be updated
		 * and we'll do pointless flushes on future context switches.
		 *
		 * Rather than hooking native_flush_tlb_others() here, I think
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func_remote,
					       (void *)info, 1);
		return;
	}
	smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details. We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead. Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
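
/*
 * Worked example of the ceiling (illustrative only; assumes 4 KiB pages):
 * flush_tlb_mm_range() below compares (end - start) >> PAGE_SHIFT against
 * tlb_single_page_flush_ceiling. A 128 KiB range is 32 pages, which is
 * <= 33, so it gets 32 individual INVLPGs (roughly 32 * 100 ns = ~3.2 us).
 * A 1 MiB range is 256 pages, which exceeds the ceiling, so the whole TLB
 * is flushed instead.
 */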

void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned long vmflag)
{
	int cpu;

	struct flush_tlb_info info = {
		.mm = mm,
	};

	cpu = get_cpu();

	/* This is also a barrier that synchronizes with switch_mm(). */
	info.new_tlb_gen = inc_mm_tlb_gen(mm);

	/* Should we flush just the requested range? */
	if ((end != TLB_FLUSH_ALL) &&
	    !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info.start = start;
		info.end = end;
	} else {
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
	}

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}

static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* Flush the range one page at a time with 'invlpg'. */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_single(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Use the same heuristic as user space flushes; a bit conservative. */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info info;
		info.start = start;
		info.end = end;
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &info);

	cpumask_clear(&batch->cpumask);

	put_cpu();
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
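
/*
 * Usage sketch for the knob above (assuming debugfs is mounted at
 * /sys/kernel/debug and arch_debugfs_dir is the usual "x86" directory):
 *
 *	# cat /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *	33
 *	# echo 64 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *
 * Larger values make flush_tlb_mm_range() and flush_tlb_kernel_range()
 * favor per-page INVLPGs for bigger ranges; smaller values make the full
 * flush path more likely. See Documentation/x86/tlb.txt.
 */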