// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

#include "mm_internal.h"

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
 * stored in cpu_tlbstate.last_user_mm_ibpb.
 */
#define LAST_USER_MM_IBPB	0x1UL

/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
static void clear_asid_other(void)
{
	u16 asid;

	/*
	 * This is only expected to be set if we have disabled
	 * kernel _PAGE_GLOBAL pages.
	 */
	if (!static_cpu_has(X86_FEATURE_PTI)) {
		WARN_ON_ONCE(1);
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		/* Do not need to flush the current asid */
		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
			continue;
		/*
		 * Make sure the next time we go to switch to
		 * this asid, we do a flush:
		 */
		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
	}
	this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
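
/*
 * Pick an ASID for 'next' on this CPU: reuse the dynamic ASID slot that
 * already holds next's context if there is one, otherwise grab the next
 * slot round-robin.  *need_flush is set when the TLB contents for that
 * ASID cannot be trusted (stale tlb_gen, freshly allocated slot, or no
 * PCID support) and a flushing CR3 write is required.
 */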
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 asid;

	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	if (this_cpu_read(cpu_tlbstate.invalidate_other))
		clear_asid_other();

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/*
	 * We don't currently own an ASID slot on this CPU.
	 * Allocate a slot.
	 */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}

static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
	unsigned long new_mm_cr3;

	if (need_flush) {
		invalidate_user_asid(new_asid);
		new_mm_cr3 = build_cr3(pgdir, new_asid);
	} else {
		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
	}

	/*
	 * Caution: many callers of this function expect
	 * that load_cr3() is serializing and orders TLB
	 * fills with respect to the mm_cpumask writes.
	 */
	write_cr3(new_mm_cr3);
}

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	/* Warn if we're not lazy. */
	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));

	switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
{
	unsigned long next_tif = task_thread_info(next)->flags;
	unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;

	return (unsigned long)next->mm | ibpb;
}
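
/*
 * Conditionally issue an Indirect Branch Prediction Barrier (IBPB) when
 * switching to a different user mm, so that one user process cannot poison
 * the branch predictor state (user/user Spectre v2 / BTB poisoning) for the
 * next one.  Kernel threads (next->mm == NULL) never trigger a barrier.
 */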
static void cond_ibpb(struct task_struct *next)
{
	if (!next || !next->mm)
		return;

	/*
	 * Both the conditional and the always IBPB mode use the mm
	 * pointer to avoid the IBPB when switching between tasks of the
	 * same process. Using the mm pointer instead of mm->context.ctx_id
	 * opens a hypothetical hole vs. mm_struct reuse, which is more or
	 * less impossible to control by an attacker. Aside from that it
	 * would only affect the first schedule so the theoretically
	 * exposed data is not really interesting.
	 */
	if (static_branch_likely(&switch_mm_cond_ibpb)) {
		unsigned long prev_mm, next_mm;

		/*
		 * This is a bit more complex than the always mode because
		 * it has to handle two cases:
		 *
		 * 1) Switch from a user space task (potential attacker)
		 *    which has TIF_SPEC_IB set to a user space task
		 *    (potential victim) which has TIF_SPEC_IB not set.
		 *
		 * 2) Switch from a user space task (potential attacker)
		 *    which has TIF_SPEC_IB not set to a user space task
		 *    (potential victim) which has TIF_SPEC_IB set.
		 *
		 * This could be done by unconditionally issuing IBPB when
		 * a task which has TIF_SPEC_IB set is either scheduled in
		 * or out. Though that results in two flushes when:
		 *
		 * - the same user space task is scheduled out and later
		 *   scheduled in again and only a kernel thread ran in
		 *   between.
		 *
		 * - a user space task belonging to the same process is
		 *   scheduled in after a kernel thread ran in between
		 *
		 * - a user space task belonging to the same process is
		 *   scheduled in immediately.
		 *
		 * Optimize this with reasonably small overhead for the
		 * above cases. Mangle the TIF_SPEC_IB bit into the mm
		 * pointer of the incoming task which is stored in
		 * cpu_tlbstate.last_user_mm_ibpb for comparison.
		 */
		next_mm = mm_mangle_tif_spec_ib(next);
		prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);

		/*
		 * Issue IBPB only if the mm's are different and one or
		 * both have the IBPB bit set.
		 */
		if (next_mm != prev_mm &&
		    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
			indirect_branch_prediction_barrier();

		this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
	}

	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
		/*
		 * Only flush when switching to a user space task with a
		 * different context than the user space task which ran
		 * last on this CPU.
		 */
		if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
			indirect_branch_prediction_barrier();
			this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
		}
	}
}
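
/*
 * Switch this CPU's loaded mm to 'next': pick an ASID via choose_new_asid()
 * and write CR3 with or without a TLB flush, depending on whether the cached
 * tlb_gen for that ASID is up to date.  Must be called with interrupts
 * disabled; switch_mm() above is the irq-enabled wrapper.
 */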
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;
	bool need_flush;
	u16 new_asid;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
	this_cpu_write(cpu_tlbstate.is_lazy, false);

	/*
	 * The membarrier system call requires a full memory barrier and
	 * core serialization before returning to user-space, after
	 * storing to rq->curr. Writing to CR3 provides that full
	 * memory barrier and core serializing instruction.
	 */
	if (real_prev == next) {
		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			   next->context.ctx_id);

		/*
		 * Even in lazy TLB mode, the CPU should stay set in the
		 * mm_cpumask. The TLB shootdown code can figure out from
		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * If the CPU is not in lazy TLB mode, we are just switching
		 * from one thread in a process to another thread in the same
		 * process. No TLB flush required.
		 */
		if (!was_lazy)
			return;

		/*
		 * Read the tlb_gen to check whether a flush is needed.
		 * If the TLB is up to date, just use it.
		 * The barrier synchronizes with the tlb_gen increment in
		 * the TLB shootdown code.
		 */
		smp_mb();
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				next_tlb_gen)
			return;

		/*
		 * TLB contents went out of date while we were in lazy
		 * mode. Fall through to the TLB switching code below.
		 */
		new_asid = prev_asid;
		need_flush = true;
	} else {
		/*
		 * Avoid user/user BTB poisoning by flushing the branch
		 * predictor when switching between processes. This stops
		 * one process from doing Spectre-v2 attacks on another.
		 */
		cond_ibpb(tsk);

		/*
		 * Stop remote flushes for the previous mm.
		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
		 * but the bitmap manipulation can cause cache line contention.
		 */
		if (real_prev != &init_mm) {
			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
						mm_cpumask(real_prev)));
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
		}

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		if (next != &init_mm)
			cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		/* Let nmi_uaccess_okay() know that we're changing CR3. */
		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
		barrier();
	}

	if (need_flush) {
		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
		load_new_mm_cr3(next->pgd, new_asid, true);

		/*
		 * NB: This gets called via leave_mm() in the idle path
		 * where RCU functions differently.  Tracing normally
		 * uses RCU, so we need to use the _rcuidle variant.
		 *
		 * (There is no good reason for this.  The idle code should
		 *  be rearranged to call this before rcu_idle_enter().)
		 */
		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
	} else {
		/* The new ASID is already up to date. */
		load_new_mm_cr3(next->pgd, new_asid, false);

		/* See above wrt _rcuidle. */
		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
	}

	/* Make sure we write CR3 before loaded_mm. */
	barrier();

	this_cpu_write(cpu_tlbstate.loaded_mm, next);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

	if (next != real_prev) {
		load_mm_cr4_irqsoff(next);
		switch_ldt(real_prev, next);
	}
}

/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

	this_cpu_write(cpu_tlbstate.is_lazy, true);
}

/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear.  CPU hotplug can do this.)
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/*
	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Force ASID 0 and force a TLB flush. */
	write_cr3(build_cr3(mm->pgd, 0));

	/* Reinitialize tlbstate. */
	this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
{
	/*
	 * We have three different tlb_gen values in here.  They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);

	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	if (unlikely(loaded_mm == &init_mm))
		return;

	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);

	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
		/*
		 * We're in lazy mode.  We need to at least flush our
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 *
		 * This should be rare, with native_flush_tlb_others skipping
		 * IPIs to lazy TLB mode CPUs.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
	}

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
		trace_tlb_flush(reason, 0);
		return;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes.  The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
	 *    processed on this CPU in reverse order, we'll see
	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
	 *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
	 *    3, we'd break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date.  By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
		unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
		unsigned long addr = f->start;

		while (addr < f->end) {
			__flush_tlb_one_user(addr);
			addr += 1UL << f->stride_shift;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
		trace_tlb_flush(reason, nr_invalidate);
	} else {
		/* Full flush. */
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
	}

	/* Both paths above update our state to mm_tlb_gen. */
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
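
/*
 * flush_tlb_func_local() and flush_tlb_func_remote() are thin wrappers that
 * feed a flush_tlb_info into flush_tlb_func_common().  The remote variant
 * runs in IPI context via smp_call_function_many()/on_each_cpu_cond_mask()
 * and bails out early if this CPU has since switched to an unrelated mm.
 */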
static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
{
	const struct flush_tlb_info *f = info;

	flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

static bool tlb_is_not_lazy(int cpu, void *data)
{
	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		/*
		 * This whole special case is confused.  UV has a "Broadcast
		 * Assist Unit", which seems to be a fancy way to send IPIs.
		 * Back when x86 used an explicit TLB flush IPI, UV was
		 * optimized to use its own mechanism.  These days, x86 uses
		 * smp_call_function_many(), but UV still uses a manual IPI,
		 * and that IPI's action is out of date -- it does a manual
		 * flush instead of calling flush_tlb_func_remote().  This
		 * means that the percpu tlb_gen variables won't be updated
		 * and we'll do pointless flushes on future context switches.
		 *
		 * Rather than hooking native_flush_tlb_others() here, I think
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func_remote,
					       (void *)info, 1);
		return;
	}

	/*
	 * If no page tables were freed, we can skip sending IPIs to
	 * CPUs in lazy TLB mode. They will flush the TLB themselves
	 * at the next context switch.
	 *
	 * However, if page tables are getting freed, we need to send the
	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
	 * up on the new contents of what used to be page tables, while
	 * doing a speculative memory access.
	 */
	if (info->freed_tables)
		smp_call_function_many(cpumask, flush_tlb_func_remote,
				       (void *)info, 1);
	else
		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
				      (void *)info, 1, cpumask);
}

/*
 * See Documentation/x86/tlb.rst for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);

#ifdef CONFIG_DEBUG_VM
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif

static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
			unsigned long start, unsigned long end,
			unsigned int stride_shift, bool freed_tables,
			u64 new_tlb_gen)
{
	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);

#ifdef CONFIG_DEBUG_VM
	/*
	 * Ensure that the following code is non-reentrant and flush_tlb_info
	 * is not overwritten. This means no TLB flushing is initiated by
	 * interrupt handlers and machine-check exception handlers.
	 */
	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif

	info->start		= start;
	info->end		= end;
	info->mm		= mm;
	info->stride_shift	= stride_shift;
	info->freed_tables	= freed_tables;
	info->new_tlb_gen	= new_tlb_gen;

	return info;
}

static inline void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
	/* Complete reentrancy prevention checks */
	barrier();
	this_cpu_dec(flush_tlb_info_idx);
#endif
}
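
/*
 * flush_tlb_mm_range() - flush a range of user address space on every CPU
 * that might be caching translations for @mm.  Small ranges are flushed in
 * units of 1 << @stride_shift; anything larger than
 * tlb_single_page_flush_ceiling pages, or TLB_FLUSH_ALL, becomes a full
 * flush.  An illustrative call for a single 4k page would be, e.g.:
 *
 *	flush_tlb_mm_range(mm, addr, addr + PAGE_SIZE, PAGE_SHIFT, false);
 */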
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned int stride_shift,
				bool freed_tables)
{
	struct flush_tlb_info *info;
	u64 new_tlb_gen;
	int cpu;

	cpu = get_cpu();

	/* Should we flush just the requested range? */
	if ((end == TLB_FLUSH_ALL) ||
	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
		start = 0;
		end = TLB_FLUSH_ALL;
	}

	/* This is also a barrier that synchronizes with switch_mm(). */
	new_tlb_gen = inc_mm_tlb_gen(mm);

	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
				  new_tlb_gen);

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		lockdep_assert_irqs_enabled();
		local_irq_disable();
		flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), info);

	put_flush_tlb_info();
	put_cpu();
}

static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* Flush the range one page at a time with 'invlpg'. */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_one_kernel(addr);
}
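
/*
 * Flush a range of kernel (global) mappings on all CPUs.  This uses the same
 * tlb_single_page_flush_ceiling heuristic as user space flushes: large
 * ranges fall back to a full flush via do_flush_tlb_all().
 */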
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Use the same heuristic as for user space flushes; a bit conservative. */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info *info;

		preempt_disable();
		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);

		on_each_cpu(do_kernel_range_flush, info, 1);

		put_flush_tlb_info();
		preempt_enable();
	}
}

/*
 * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
 * This means that the 'struct flush_tlb_info' that describes which mappings to
 * flush is actually fixed. We therefore set a single fixed struct and use it in
 * arch_tlbbatch_flush().
 */
static const struct flush_tlb_info full_flush_tlb_info = {
	.mm = NULL,
	.start = 0,
	.end = TLB_FLUSH_ALL,
};

void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		lockdep_assert_irqs_enabled();
		local_irq_disable();
		flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);

	cpumask_clear(&batch->cpumask);

	put_cpu();
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);