#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 * TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 * These mean you can really definitely utterly forget about
 * writing to user space from interrupts. (It's not allowed anyway).
 *
 * Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 * More scalable flush, from Andi Kleen
 *
 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 asid;

	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/*
	 * We don't currently own an ASID slot on this CPU.
	 * Allocate a slot.
	 */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	/* Warn if we're not lazy. */
	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));

	switch_mm(NULL, &init_mm, NULL);
}

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() !=
			 (__sme_pa(real_prev->pgd) | prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif

	if (real_prev == next) {
		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			  next->context.ctx_id);

		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
			/*
			 * There's nothing to do: we weren't lazy, and we
			 * aren't changing our mm.  We don't need to flush
			 * anything, nor do we need to update CR3, CR4, or
			 * LDTR.
			 */
			return;
		}

		/* Resume remote flushes and then read tlb_gen. */
		cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
		    next_tlb_gen) {
			/*
			 * Ideally, we'd have a flush_tlb() variant that
			 * takes the known CR3 value as input.  This would
			 * be faster on Xen PV and on hypothetical CPUs
			 * on which INVPCID is fast.
			 */
			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
				       next_tlb_gen);
			write_cr3(__sme_pa(next->pgd) | prev_asid);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
					TLB_FLUSH_ALL);
		}

		/*
		 * We just exited lazy mode, which means that CR4 and/or LDTR
		 * may be stale.  (Changes to the required CR4 and LDTR states
		 * are not reflected in tlb_gen.)
		 */
	} else {
		u16 new_asid;
		bool need_flush;

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			unsigned int index = pgd_index(current_stack_pointer());
			pgd_t *pgd = next->pgd + index;

			if (unlikely(pgd_none(*pgd)))
				set_pgd(pgd, init_mm.pgd[index]);
		}

		/* Stop remote flushes for the previous mm */
		if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

		VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		if (need_flush) {
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
			write_cr3(__sme_pa(next->pgd) | new_asid);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
					TLB_FLUSH_ALL);
		} else {
			/* The new ASID is already up to date. */
			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
		}

		this_cpu_write(cpu_tlbstate.loaded_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
	}

	load_mm_cr4(next);
	switch_ldt(real_prev, next);
}

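/*
 * Illustrative sketch of the ASID handling above (hypothetical slot numbers,
 * assuming PCID is available, both mms already own ASID slots on this CPU,
 * and no remote flush raises their tlb_gen in between): when switching
 * A -> B -> A on one CPU,
 *
 *   switch to B: choose_new_asid() finds B's ctx_id in slot 2 with an
 *                up-to-date tlb_gen, so need_flush is false and CR3 is
 *                written with ASID 2 and CR3_NOFLUSH set -- B's cached
 *                TLB entries survive the switch.
 *   switch to A: A's ctx_id is still in slot 1, so A's entries tagged
 *                with ASID 1 are reused the same way.
 *
 * If a slot's tlb_gen is stale, or the mm has no slot yet (one is then taken
 * round-robin via cpu_tlbstate.next_asid), need_flush is true and CR3 is
 * written without CR3_NOFLUSH, which discards that ASID's entries.
 */
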
/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear.  CPU hotplug can do this.)
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/*
	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Force ASID 0 and force a TLB flush. */
	write_cr3(cr3 & ~CR3_PCID_MASK);

	/* Reinitialize tlbstate. */
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
{
	/*
	 * We have three different tlb_gen values in here.  They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);

	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);

	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
		/*
		 * We're in lazy mode -- don't flush.  We can get here on
		 * remote flushes due to races and on local flushes if a
		 * kernel thread coincidentally flushes the mm it's lazily
		 * still using.
		 */
		return;
	}

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
		trace_tlb_flush(reason, 0);
		return;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes.  The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
	 *    processed on this CPU in reverse order, we'll see
	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
	 *    3, we'd break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date.  By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
		unsigned long addr;
		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

		addr = f->start;
		while (addr < f->end) {
			__flush_tlb_single(addr);
			addr += PAGE_SIZE;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
		trace_tlb_flush(reason, nr_pages);
	} else {
		/* Full flush. */
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
	}

	/* Both paths above update our state to mm_tlb_gen. */
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}

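/*
 * Worked example for the partial-vs-full decision above (a sketch with
 * hypothetical numbers): this CPU has local_tlb_gen == 2 and the mm's
 * context.tlb_gen has reached 3, and a request arrives with
 * f->new_tlb_gen == 3 for a two-page range.  Condition 1 holds
 * (3 == 2 + 1) and condition 2 holds (3 == mm_tlb_gen), so the range is
 * flushed with two __flush_tlb_single() calls and local_tlb_gen advances
 * to 3.  Had local_tlb_gen been 1, the full-flush branch would have run
 * instead.
 */
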
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
	const struct flush_tlb_info *f = info;

	flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		/*
		 * This whole special case is confused.  UV has a "Broadcast
		 * Assist Unit", which seems to be a fancy way to send IPIs.
		 * Back when x86 used an explicit TLB flush IPI, UV was
		 * optimized to use its own mechanism.  These days, x86 uses
		 * smp_call_function_many(), but UV still uses a manual IPI,
		 * and that IPI's action is out of date -- it does a manual
		 * flush instead of calling flush_tlb_func_remote().  This
		 * means that the percpu tlb_gen variables won't be updated
		 * and we'll do pointless flushes on future context switches.
		 *
		 * Rather than hooking native_flush_tlb_others() here, I think
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func_remote,
					       (void *)info, 1);
		return;
	}
	smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

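/*
 * For example (rough numbers, using the ~100 ns per-page estimate above):
 * flushing a 64 KiB range covers 16 pages, which is under the ceiling of
 * 33, so flush_tlb_mm_range() below issues 16 single-page flushes (roughly
 * 1.6 us).  A 1 MiB range covers 256 pages, exceeds the ceiling, and is
 * handled as a full TLB flush instead.
 */
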
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned long vmflag)
{
	int cpu;

	struct flush_tlb_info info = {
		.mm = mm,
	};

	cpu = get_cpu();

	/* This is also a barrier that synchronizes with switch_mm(). */
	info.new_tlb_gen = inc_mm_tlb_gen(mm);

	/* Should we flush just the requested range? */
	if ((end != TLB_FLUSH_ALL) &&
	    !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info.start = start;
		info.end = end;
	} else {
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
	}

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}

static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* flush the range one page at a time with 'invlpg' */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_single(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Balance as user space task's flush, a bit conservative */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info info;

		info.start = start;
		info.end = end;
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &info);

	cpumask_clear(&batch->cpumask);

	put_cpu();
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
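
/*
 * Example of tuning the ceiling at run time via the file created above
 * (assuming debugfs is mounted at /sys/kernel/debug; see
 * Documentation/x86/tlb.txt):
 *
 *   # cat /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *   33
 *   # echo 0 > /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
 *
 * Writing 0 makes flush_tlb_mm_range() fall back to a full flush for any
 * nonzero range; larger values allow longer ranges to be flushed one page
 * at a time.
 */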