#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/ptrace.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

/*
 * We get here when we do something requiring a TLB invalidation
 * but could not go invalidate all of the contexts.  We do the
 * necessary invalidation by clearing out the 'ctx_id' which
 * forces a TLB flush when the context is loaded.
 */
static void clear_asid_other(void)
{
	u16 asid;

	/*
	 * This is only expected to be set if we have disabled
	 * kernel _PAGE_GLOBAL pages.
	 */
	if (!static_cpu_has(X86_FEATURE_PTI)) {
		WARN_ON_ONCE(1);
		return;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		/* Do not need to flush the current asid */
		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
			continue;
		/*
		 * Make sure the next time we go to switch to
		 * this asid, we do a flush:
		 */
		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
	}
	this_cpu_write(cpu_tlbstate.invalidate_other, false);
}

atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);

static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
{
	u16 asid;

	if (!static_cpu_has(X86_FEATURE_PCID)) {
		*new_asid = 0;
		*need_flush = true;
		return;
	}

	if (this_cpu_read(cpu_tlbstate.invalidate_other))
		clear_asid_other();

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
		    next->context.ctx_id)
			continue;

		*new_asid = asid;
		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
			       next_tlb_gen);
		return;
	}

	/*
	 * We don't currently own an ASID slot on this CPU.
	 * Allocate a slot.
	 */
	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (*new_asid >= TLB_NR_DYN_ASIDS) {
		*new_asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}
	*need_flush = true;
}

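/*
 * Switch CR3 to a new pgd/ASID pair.  When need_flush is set,
 * invalidate_user_asid() first marks the user counterpart of the ASID
 * for invalidation (only relevant with PTI) and build_cr3() produces a
 * CR3 value that flushes this ASID's TLB entries; otherwise
 * build_cr3_noflush() keeps whatever is already tagged with this ASID.
 */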
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
	unsigned long new_mm_cr3;

	if (need_flush) {
		invalidate_user_asid(new_asid);
		new_mm_cr3 = build_cr3(pgdir, new_asid);
	} else {
		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
	}

	/*
	 * Caution: many callers of this function expect
	 * that load_cr3() is serializing and orders TLB
	 * fills with respect to the mm_cpumask writes.
	 */
	write_cr3(new_mm_cr3);
}

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/*
	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
	 * If so, our callers still expect us to flush the TLB, but there
	 * aren't any user TLB entries in init_mm to worry about.
	 *
	 * This needs to happen before any other sanity checks due to
	 * intel_idle's shenanigans.
	 */
	if (loaded_mm == &init_mm)
		return;

	/* Warn if we're not lazy. */
	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));

	switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

static void sync_current_stack_to_mm(struct mm_struct *mm)
{
	unsigned long sp = current_stack_pointer;
	pgd_t *pgd = pgd_offset(mm, sp);

	if (pgtable_l5_enabled()) {
		if (unlikely(pgd_none(*pgd))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);

			set_pgd(pgd, *pgd_ref);
		}
	} else {
		/*
		 * "pgd" is faked.  The top level entries are "p4d"s, so sync
		 * the p4d.  This compiles to approximately the same code as
		 * the 5-level case.
		 */
		p4d_t *p4d = p4d_offset(pgd, sp);

		if (unlikely(p4d_none(*p4d))) {
			pgd_t *pgd_ref = pgd_offset_k(sp);
			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);

			set_p4d(p4d, *p4d_ref);
		}
	}
}

static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
{
	/*
	 * Check if the current (previous) task has access to the memory
	 * of the @tsk (next) task. If access is denied, make sure to
	 * issue an IBPB to stop user->user Spectre-v2 attacks.
	 *
	 * Note: __ptrace_may_access() returns 0 or -ERRNO.
	 */
	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
}

void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
	this_cpu_write(cpu_tlbstate.is_lazy, false);

	/*
	 * The membarrier system call requires a full memory barrier and
	 * core serialization before returning to user-space, after
	 * storing to rq->curr. Writing to CR3 provides that full
	 * memory barrier and core serializing instruction.
	 */
	if (real_prev == next) {
		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			   next->context.ctx_id);

		/*
		 * We don't currently support having a real mm loaded without
		 * our cpu set in mm_cpumask().  We have all the bookkeeping
		 * in place to figure out whether we would need to flush
		 * if our cpu were cleared in mm_cpumask(), but we don't
		 * currently use it.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		return;
	} else {
		u16 new_asid;
		bool need_flush;
		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

		/*
		 * Avoid user/user BTB poisoning by flushing the branch
		 * predictor when switching between processes. This stops
		 * one process from doing Spectre-v2 attacks on another.
		 *
		 * As an optimization, flush indirect branches only when
		 * switching into a process that can't be ptraced by the
		 * current one (in that case the attacker has a much more
		 * convenient way to tamper with the next process than
		 * branch buffer poisoning).
		 */
		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
		    ibpb_needed(tsk, last_ctx_id))
			indirect_branch_prediction_barrier();

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			sync_current_stack_to_mm(next);
		}

		/*
		 * Stop remote flushes for the previous mm.
		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
		 * but the bitmap manipulation can cause cache line contention.
		 */
		if (real_prev != &init_mm) {
			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
						mm_cpumask(real_prev)));
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
		}

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		if (next != &init_mm)
			cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		/* Let nmi_uaccess_okay() know that we're changing CR3. */
		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
		barrier();

		if (need_flush) {
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
			load_new_mm_cr3(next->pgd, new_asid, true);

			/*
			 * NB: This gets called via leave_mm() in the idle path
			 * where RCU functions differently.  Tracing normally
			 * uses RCU, so we need to use the _rcuidle variant.
			 *
			 * (There is no good reason for this.  The idle code should
			 *  be rearranged to call this before rcu_idle_enter().)
			 */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
		} else {
			/* The new ASID is already up to date. */
			load_new_mm_cr3(next->pgd, new_asid, false);

			/* See above wrt _rcuidle. */
			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
		}

		/*
		 * Record last user mm's context id, so we can avoid
		 * flushing branch buffer with IBPB if we switch back
		 * to the same user.
		 */
		if (next != &init_mm)
			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);

		/* Make sure we write CR3 before loaded_mm. */
		barrier();

		this_cpu_write(cpu_tlbstate.loaded_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
	}

	load_mm_cr4(next);
	switch_ldt(real_prev, next);
}

/*
 * Please ignore the name of this function.  It should be called
 * switch_to_kernel_thread().
 *
 * enter_lazy_tlb() is a hint from the scheduler that we are entering a
 * kernel thread or other context without an mm.  Acceptable implementations
 * include doing nothing whatsoever, switching to init_mm, or various clever
 * lazy tricks to try to minimize TLB flushes.
 *
 * The scheduler reserves the right to call enter_lazy_tlb() several times
 * in a row.  It will notify us that we're going back to a real mm by
 * calling switch_mm_irqs_off().
 */
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		return;

	if (tlb_defer_switch_to_init_mm()) {
		/*
		 * There's a significant optimization that may be possible
		 * here.  We have accurate enough TLB flush tracking that we
		 * don't need to maintain coherence of TLB per se when we're
		 * lazy.  We do, however, need to maintain coherence of
		 * paging-structure caches.  We could, in principle, leave our
		 * old mm loaded and only switch to init_mm when
		 * tlb_remove_page() happens.
		 */
		this_cpu_write(cpu_tlbstate.is_lazy, true);
	} else {
		switch_mm(NULL, &init_mm, NULL);
	}
}

/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
 *   because the CPU was taken down and came back up with CR3's PCID
 *   bits clear; CPU hotplug can do this).
 *
 * - The TLB contains junk in slots corresponding to inactive ASIDs.
 *
 * - The CPU went so far out to lunch that it may have missed a TLB
 *   flush.
 */
void initialize_tlbstate_and_flush(void)
{
	int i;
	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
	unsigned long cr3 = __read_cr3();

	/* Assert that CR3 already references the right mm. */
	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

	/*
	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
	 * doesn't work like other CR4 bits because it can only be set from
	 * long mode.)
	 */
	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
		!(cr4_read_shadow() & X86_CR4_PCIDE));

	/* Force ASID 0 and force a TLB flush. */
	write_cr3(build_cr3(mm->pgd, 0));

	/* Reinitialize tlbstate. */
	this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
	this_cpu_write(cpu_tlbstate.next_asid, 1);
	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);

	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}

/*
 * flush_tlb_func_common()'s memory ordering requirement is that any
 * TLB fills that happen after we flush the TLB are ordered after we
 * read active_mm's tlb_gen.  We don't need any explicit barriers
 * because all x86 flush operations are serializing and the
 * atomic64_read operation won't be reordered by the compiler.
 */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
				  bool local, enum tlb_flush_reason reason)
{
	/*
	 * We have three different tlb_gen values in here.  They are:
	 *
	 * - mm_tlb_gen:     the latest generation.
	 * - local_tlb_gen:  the generation that this CPU has already caught
	 *                   up to.
	 * - f->new_tlb_gen: the generation that the requester of the flush
	 *                   wants us to catch up to.
	 */
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);

	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());

	if (unlikely(loaded_mm == &init_mm))
		return;

	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);

	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
		/*
		 * We're in lazy mode.  We need to at least flush our
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
	}

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * happen if two concurrent flushes happen -- the first flush to
		 * be handled can catch us all the way up, leaving no work for
		 * the second flush.
		 */
		trace_tlb_flush(reason, 0);
		return;
	}

	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);

	/*
	 * If we get to this point, we know that our TLB is out of date.
	 * This does not strictly imply that we need to flush (it's
	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
	 * going to need to flush in the very near future, so we might
	 * as well get it over with.
	 *
	 * The only question is whether to do a full or partial flush.
	 *
	 * We do a partial flush if requested and two extra conditions
	 * are met:
	 *
	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
	 *    we've always done all needed flushes to catch up to
	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
	 *    us up to date for tlb_gen 3 is the partial flush we're
	 *    processing.
	 *
	 *    As an example of why this check is needed, suppose that there
	 *    are two concurrent flushes.  The first is a full flush that
	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
	 *    processed on this CPU in reverse order, we'll see
	 *    local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
	 *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
	 *    3, we'd break the invariant: we'd update local_tlb_gen above
	 *    1 without the full flush that's needed for tlb_gen 2.
	 *
	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
	 *    Partial TLB flushes are not all that much cheaper than full TLB
	 *    flushes, so it seems unlikely that it would be a performance win
	 *    to do a partial flush if that won't bring our TLB fully up to
	 *    date.  By doing a full flush instead, we can increase
	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
	 *    avoid another flush in the very near future.
	 */
	if (f->end != TLB_FLUSH_ALL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen) {
		/* Partial flush */
		unsigned long addr;
		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

		addr = f->start;
		while (addr < f->end) {
			__flush_tlb_one_user(addr);
			addr += PAGE_SIZE;
		}
		if (local)
			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
		trace_tlb_flush(reason, nr_pages);
	} else {
		/* Full flush. */
		local_flush_tlb();
		if (local)
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		trace_tlb_flush(reason, TLB_FLUSH_ALL);
	}

	/* Both paths above update our state to mm_tlb_gen. */
	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}

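/*
 * flush_tlb_func_common() has two wrappers: flush_tlb_func_local() runs
 * on the CPU that requested the flush, with interrupts disabled by the
 * caller, while flush_tlb_func_remote() runs from the SMP function-call
 * IPI on the other CPUs.  Only the remote path accounts the IPI in
 * irq_tlb_count and bails out early if the target mm is no longer
 * loaded on this CPU.
 */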
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
	const struct flush_tlb_info *f = info;

	flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
	const struct flush_tlb_info *f = info;

	inc_irq_stat(irq_tlb_count);

	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
		return;

	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

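/*
 * Bare-metal implementation behind flush_tlb_others().  On paravirtualized
 * kernels (Xen, Hyper-V, KVM with PV TLB flush) this may be replaced by a
 * hypercall-assisted variant, which is why the callers in this file go
 * through the flush_tlb_others() wrapper instead of calling it directly.
 */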
void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	if (info->end == TLB_FLUSH_ALL)
		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
	else
		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
				(info->end - info->start) >> PAGE_SHIFT);

	if (is_uv_system()) {
		/*
		 * This whole special case is confused.  UV has a "Broadcast
		 * Assist Unit", which seems to be a fancy way to send IPIs.
		 * Back when x86 used an explicit TLB flush IPI, UV was
		 * optimized to use its own mechanism.  These days, x86 uses
		 * smp_call_function_many(), but UV still uses a manual IPI,
		 * and that IPI's action is out of date -- it does a manual
		 * flush instead of calling flush_tlb_func_remote().  This
		 * means that the percpu tlb_gen variables won't be updated
		 * and we'll do pointless flushes on future context switches.
		 *
		 * Rather than hooking native_flush_tlb_others() here, I think
		 * that UV should be updated so that smp_call_function_many(),
		 * etc, are optimal on UV.
		 */
		unsigned int cpu;

		cpu = smp_processor_id();
		cpumask = uv_flush_tlb_others(cpumask, info);
		if (cpumask)
			smp_call_function_many(cpumask, flush_tlb_func_remote,
					       (void *)info, 1);
		return;
	}
	smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

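/*
 * flush_tlb_mm_range() - flush a range of user addresses on every CPU
 * that may be caching translations for @mm.
 *
 * @vmflag carries the VM_* flags of the mapping; for VM_HUGETLB (or when
 * the range exceeds tlb_single_page_flush_ceiling pages) a full flush is
 * used instead of a page-by-page one.  The mm's tlb_gen is bumped first,
 * the local CPU is flushed if it has @mm loaded, and every other CPU in
 * mm_cpumask(@mm) gets a flush request via flush_tlb_others().
 */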
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned long vmflag)
{
	int cpu;

	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
		.mm = mm,
	};

	cpu = get_cpu();

	/* This is also a barrier that synchronizes with switch_mm(). */
	info.new_tlb_gen = inc_mm_tlb_gen(mm);

	/* Should we flush just the requested range? */
	if ((end != TLB_FLUSH_ALL) &&
	    !(vmflag & VM_HUGETLB) &&
	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
		info.start = start;
		info.end = end;
	} else {
		info.start = 0UL;
		info.end = TLB_FLUSH_ALL;
	}

	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}


static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}

void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *f = info;
	unsigned long addr;

	/* Flush the range one page at a time with 'invlpg'. */
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_one_kernel(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	/* Balance as in the user space flush path; a bit conservative. */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
	} else {
		struct flush_tlb_info info;
		info.start = start;
		info.end = end;
		on_each_cpu(do_kernel_range_flush, &info, 1);
	}
}

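/*
 * arch_tlbbatch_flush() is the arch hook for the batched TLB flush used
 * by page reclaim: mm/rmap.c records the CPUs that still need a flush in
 * batch->cpumask (via arch_tlbbatch_add_mm()) and then calls this once,
 * instead of issuing one round of IPIs per unmapped page.
 */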
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
	struct flush_tlb_info info = {
		.mm = NULL,
		.start = 0UL,
		.end = TLB_FLUSH_ALL,
	};

	int cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
		VM_WARN_ON(irqs_disabled());
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
		local_irq_enable();
	}

	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&batch->cpumask, &info);

	cpumask_clear(&batch->cpumask);

	put_cpu();
}

static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char buf[32];
	ssize_t len;
	int ceiling;

	len = min(count, sizeof(buf) - 1);
	if (copy_from_user(buf, user_buf, len))
		return -EFAULT;

	buf[len] = '\0';
	if (kstrtoint(buf, 0, &ceiling))
		return -EINVAL;

	if (ceiling < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = ceiling;
	return count;
}

static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
			    arch_debugfs_dir, NULL, &fops_tlbflush);
	return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);