1457c8996SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only 255f4949fSIngo Molnar #include <linux/init.h> 355f4949fSIngo Molnar 455f4949fSIngo Molnar #include <linux/mm.h> 555f4949fSIngo Molnar #include <linux/spinlock.h> 655f4949fSIngo Molnar #include <linux/smp.h> 755f4949fSIngo Molnar #include <linux/interrupt.h> 84b599fedSPaul Gortmaker #include <linux/export.h> 993296720SShaohua Li #include <linux/cpu.h> 1018bf3c3eSTim Chen #include <linux/debugfs.h> 11b5f06f64SBalbir Singh #include <linux/sched/smt.h> 128ca07e17SEric W. Biederman #include <linux/task_work.h> 136bbd42e2SAlistair Popple #include <linux/mmu_notifier.h> 1455f4949fSIngo Molnar 1555f4949fSIngo Molnar #include <asm/tlbflush.h> 1655f4949fSIngo Molnar #include <asm/mmu_context.h> 1718bf3c3eSTim Chen #include <asm/nospec-branch.h> 18350f8f56SJan Beulich #include <asm/cache.h> 19b5f06f64SBalbir Singh #include <asm/cacheflush.h> 2055f4949fSIngo Molnar #include <asm/apic.h> 215471eea5SKan Liang #include <asm/perf_event.h> 22*8322a66fSBorislav Petkov (AMD) #include <asm/tlb.h> 2355f4949fSIngo Molnar 24935f5839SPeter Zijlstra #include "mm_internal.h" 25935f5839SPeter Zijlstra 262faf153bSThomas Gleixner #ifdef CONFIG_PARAVIRT 272faf153bSThomas Gleixner # define STATIC_NOPV 282faf153bSThomas Gleixner #else 292faf153bSThomas Gleixner # define STATIC_NOPV static 302faf153bSThomas Gleixner # define __flush_tlb_local native_flush_tlb_local 31cd30d26cSThomas Gleixner # define __flush_tlb_global native_flush_tlb_global 32127ac915SThomas Gleixner # define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) 334ce94eabSNadav Amit # define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info) 342faf153bSThomas Gleixner #endif 352faf153bSThomas Gleixner 3655f4949fSIngo Molnar /* 37ce4a4e56SAndy Lutomirski * TLB flushing, formerly SMP-only 3855f4949fSIngo Molnar * c/o Linus Torvalds. 3955f4949fSIngo Molnar * 4055f4949fSIngo Molnar * These mean you can really definitely utterly forget about 4155f4949fSIngo Molnar * writing to user space from interrupts. (Its not allowed anyway). 4255f4949fSIngo Molnar * 4355f4949fSIngo Molnar * Optimizations Manfred Spraul <manfred@colorfullife.com> 4455f4949fSIngo Molnar * 4555f4949fSIngo Molnar * More scalable flush, from Andi Kleen 4655f4949fSIngo Molnar * 4752aec330SAlex Shi * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 4855f4949fSIngo Molnar */ 4955f4949fSIngo Molnar 502ea907c4SDave Hansen /* 51b5f06f64SBalbir Singh * Bits to mangle the TIF_SPEC_* state into the mm pointer which is 52371b09c6SBalbir Singh * stored in cpu_tlb_state.last_user_mm_spec. 534c71a2b6SThomas Gleixner */ 544c71a2b6SThomas Gleixner #define LAST_USER_MM_IBPB 0x1UL 55b5f06f64SBalbir Singh #define LAST_USER_MM_L1D_FLUSH 0x2UL 56b5f06f64SBalbir Singh #define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) 57371b09c6SBalbir Singh 58371b09c6SBalbir Singh /* Bits to set when tlbstate and flush is (re)initialized */ 59371b09c6SBalbir Singh #define LAST_USER_MM_INIT LAST_USER_MM_IBPB 604c71a2b6SThomas Gleixner 614c71a2b6SThomas Gleixner /* 626c9b7d79SThomas Gleixner * The x86 feature is called PCID (Process Context IDentifier). It is similar 636c9b7d79SThomas Gleixner * to what is traditionally called ASID on the RISC processors. 646c9b7d79SThomas Gleixner * 656c9b7d79SThomas Gleixner * We don't use the traditional ASID implementation, where each process/mm gets 666c9b7d79SThomas Gleixner * its own ASID and flush/restart when we run out of ASID space. 
676c9b7d79SThomas Gleixner * 686c9b7d79SThomas Gleixner * Instead we have a small per-cpu array of ASIDs and cache the last few mm's 696c9b7d79SThomas Gleixner * that came by on this CPU, allowing cheaper switch_mm between processes on 706c9b7d79SThomas Gleixner * this CPU. 716c9b7d79SThomas Gleixner * 726c9b7d79SThomas Gleixner * We end up with different spaces for different things. To avoid confusion we 736c9b7d79SThomas Gleixner * use different names for each of them: 746c9b7d79SThomas Gleixner * 756c9b7d79SThomas Gleixner * ASID - [0, TLB_NR_DYN_ASIDS-1] 766c9b7d79SThomas Gleixner * the canonical identifier for an mm 776c9b7d79SThomas Gleixner * 786c9b7d79SThomas Gleixner * kPCID - [1, TLB_NR_DYN_ASIDS] 796c9b7d79SThomas Gleixner * the value we write into the PCID part of CR3; corresponds to the 806c9b7d79SThomas Gleixner * ASID+1, because PCID 0 is special. 816c9b7d79SThomas Gleixner * 826c9b7d79SThomas Gleixner * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] 836c9b7d79SThomas Gleixner * for KPTI each mm has two address spaces and thus needs two 846c9b7d79SThomas Gleixner * PCID values, but we can still do with a single ASID denomination 856c9b7d79SThomas Gleixner * for each mm. Corresponds to kPCID + 2048. 866c9b7d79SThomas Gleixner * 876c9b7d79SThomas Gleixner */ 886c9b7d79SThomas Gleixner 896c9b7d79SThomas Gleixner /* There are 12 bits of space for ASIDS in CR3 */ 906c9b7d79SThomas Gleixner #define CR3_HW_ASID_BITS 12 916c9b7d79SThomas Gleixner 926c9b7d79SThomas Gleixner /* 936c9b7d79SThomas Gleixner * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for 946c9b7d79SThomas Gleixner * user/kernel switches 956c9b7d79SThomas Gleixner */ 966c9b7d79SThomas Gleixner #ifdef CONFIG_PAGE_TABLE_ISOLATION 976c9b7d79SThomas Gleixner # define PTI_CONSUMED_PCID_BITS 1 986c9b7d79SThomas Gleixner #else 996c9b7d79SThomas Gleixner # define PTI_CONSUMED_PCID_BITS 0 1006c9b7d79SThomas Gleixner #endif 1016c9b7d79SThomas Gleixner 1026c9b7d79SThomas Gleixner #define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) 1036c9b7d79SThomas Gleixner 1046c9b7d79SThomas Gleixner /* 1056c9b7d79SThomas Gleixner * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account 1066c9b7d79SThomas Gleixner * for them being zero-based. Another -1 is because PCID 0 is reserved for 1076c9b7d79SThomas Gleixner * use by non-PCID-aware users. 1086c9b7d79SThomas Gleixner */ 1096c9b7d79SThomas Gleixner #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) 1106c9b7d79SThomas Gleixner 1116c9b7d79SThomas Gleixner /* 1126c9b7d79SThomas Gleixner * Given @asid, compute kPCID 1136c9b7d79SThomas Gleixner */ 1146c9b7d79SThomas Gleixner static inline u16 kern_pcid(u16 asid) 1156c9b7d79SThomas Gleixner { 1166c9b7d79SThomas Gleixner VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); 1176c9b7d79SThomas Gleixner 1186c9b7d79SThomas Gleixner #ifdef CONFIG_PAGE_TABLE_ISOLATION 1196c9b7d79SThomas Gleixner /* 120d9f6e12fSIngo Molnar * Make sure that the dynamic ASID space does not conflict with the 1216c9b7d79SThomas Gleixner * bit we are using to switch between user and kernel ASIDs. 1226c9b7d79SThomas Gleixner */ 1236c9b7d79SThomas Gleixner BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); 1246c9b7d79SThomas Gleixner 1256c9b7d79SThomas Gleixner /* 1266c9b7d79SThomas Gleixner * The ASID being passed in here should have respected the 1276c9b7d79SThomas Gleixner * MAX_ASID_AVAILABLE and thus never have the switch bit set. 
1286c9b7d79SThomas Gleixner */ 1296c9b7d79SThomas Gleixner VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); 1306c9b7d79SThomas Gleixner #endif 1316c9b7d79SThomas Gleixner /* 1326c9b7d79SThomas Gleixner * The dynamically-assigned ASIDs that get passed in are small 1336c9b7d79SThomas Gleixner * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, 1346c9b7d79SThomas Gleixner * so do not bother to clear it. 1356c9b7d79SThomas Gleixner * 1366c9b7d79SThomas Gleixner * If PCID is on, ASID-aware code paths put the ASID+1 into the 1376c9b7d79SThomas Gleixner * PCID bits. This serves two purposes. It prevents a nasty 1386c9b7d79SThomas Gleixner * situation in which PCID-unaware code saves CR3, loads some other 1396c9b7d79SThomas Gleixner * value (with PCID == 0), and then restores CR3, thus corrupting 1406c9b7d79SThomas Gleixner * the TLB for ASID 0 if the saved ASID was nonzero. It also means 1416c9b7d79SThomas Gleixner * that any bugs involving loading a PCID-enabled CR3 with 1426c9b7d79SThomas Gleixner * CR4.PCIDE off will trigger deterministically. 1436c9b7d79SThomas Gleixner */ 1446c9b7d79SThomas Gleixner return asid + 1; 1456c9b7d79SThomas Gleixner } 1466c9b7d79SThomas Gleixner 1476c9b7d79SThomas Gleixner /* 1486c9b7d79SThomas Gleixner * Given @asid, compute uPCID 1496c9b7d79SThomas Gleixner */ 1506c9b7d79SThomas Gleixner static inline u16 user_pcid(u16 asid) 1516c9b7d79SThomas Gleixner { 1526c9b7d79SThomas Gleixner u16 ret = kern_pcid(asid); 1536c9b7d79SThomas Gleixner #ifdef CONFIG_PAGE_TABLE_ISOLATION 1546c9b7d79SThomas Gleixner ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; 1556c9b7d79SThomas Gleixner #endif 1566c9b7d79SThomas Gleixner return ret; 1576c9b7d79SThomas Gleixner } 1586c9b7d79SThomas Gleixner 15982721d8bSKirill A. Shutemov static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) 1606c9b7d79SThomas Gleixner { 16182721d8bSKirill A. Shutemov unsigned long cr3 = __sme_pa(pgd) | lam; 16282721d8bSKirill A. Shutemov 1636c9b7d79SThomas Gleixner if (static_cpu_has(X86_FEATURE_PCID)) { 16482721d8bSKirill A. Shutemov VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); 16582721d8bSKirill A. Shutemov cr3 |= kern_pcid(asid); 1666c9b7d79SThomas Gleixner } else { 1676c9b7d79SThomas Gleixner VM_WARN_ON_ONCE(asid != 0); 1686c9b7d79SThomas Gleixner } 1696c9b7d79SThomas Gleixner 17082721d8bSKirill A. Shutemov return cr3; 17182721d8bSKirill A. Shutemov } 17282721d8bSKirill A. Shutemov 17382721d8bSKirill A. Shutemov static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, 17482721d8bSKirill A. Shutemov unsigned long lam) 1756c9b7d79SThomas Gleixner { 1766c9b7d79SThomas Gleixner /* 1776c9b7d79SThomas Gleixner * Use boot_cpu_has() instead of this_cpu_has() as this function 1786c9b7d79SThomas Gleixner * might be called during early boot. This should work even after 1796c9b7d79SThomas Gleixner * boot because all CPU's the have same capabilities: 1806c9b7d79SThomas Gleixner */ 1816c9b7d79SThomas Gleixner VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); 18282721d8bSKirill A. Shutemov return build_cr3(pgd, asid, lam) | CR3_NOFLUSH; 1836c9b7d79SThomas Gleixner } 1846c9b7d79SThomas Gleixner 1856c9b7d79SThomas Gleixner /* 1862ea907c4SDave Hansen * We get here when we do something requiring a TLB invalidation 1872ea907c4SDave Hansen * but could not go invalidate all of the contexts. We do the 1882ea907c4SDave Hansen * necessary invalidation by clearing out the 'ctx_id' which 1892ea907c4SDave Hansen * forces a TLB flush when the context is loaded. 
1902ea907c4SDave Hansen */ 191387048f5Szhong jiang static void clear_asid_other(void) 1922ea907c4SDave Hansen { 1932ea907c4SDave Hansen u16 asid; 1942ea907c4SDave Hansen 1952ea907c4SDave Hansen /* 1962ea907c4SDave Hansen * This is only expected to be set if we have disabled 1972ea907c4SDave Hansen * kernel _PAGE_GLOBAL pages. 1982ea907c4SDave Hansen */ 1992ea907c4SDave Hansen if (!static_cpu_has(X86_FEATURE_PTI)) { 2002ea907c4SDave Hansen WARN_ON_ONCE(1); 2012ea907c4SDave Hansen return; 2022ea907c4SDave Hansen } 2032ea907c4SDave Hansen 2042ea907c4SDave Hansen for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { 2052ea907c4SDave Hansen /* Do not need to flush the current asid */ 2062ea907c4SDave Hansen if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) 2072ea907c4SDave Hansen continue; 2082ea907c4SDave Hansen /* 2092ea907c4SDave Hansen * Make sure the next time we go to switch to 2102ea907c4SDave Hansen * this asid, we do a flush: 2112ea907c4SDave Hansen */ 2122ea907c4SDave Hansen this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); 2132ea907c4SDave Hansen } 2142ea907c4SDave Hansen this_cpu_write(cpu_tlbstate.invalidate_other, false); 2152ea907c4SDave Hansen } 2162ea907c4SDave Hansen 217f39681edSAndy Lutomirski atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); 218f39681edSAndy Lutomirski 219b956575bSAndy Lutomirski 22010af6235SAndy Lutomirski static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, 22110af6235SAndy Lutomirski u16 *new_asid, bool *need_flush) 22210af6235SAndy Lutomirski { 22310af6235SAndy Lutomirski u16 asid; 22410af6235SAndy Lutomirski 22510af6235SAndy Lutomirski if (!static_cpu_has(X86_FEATURE_PCID)) { 22610af6235SAndy Lutomirski *new_asid = 0; 22710af6235SAndy Lutomirski *need_flush = true; 22810af6235SAndy Lutomirski return; 22910af6235SAndy Lutomirski } 23010af6235SAndy Lutomirski 2312ea907c4SDave Hansen if (this_cpu_read(cpu_tlbstate.invalidate_other)) 2322ea907c4SDave Hansen clear_asid_other(); 2332ea907c4SDave Hansen 23410af6235SAndy Lutomirski for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { 23510af6235SAndy Lutomirski if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != 23610af6235SAndy Lutomirski next->context.ctx_id) 23710af6235SAndy Lutomirski continue; 23810af6235SAndy Lutomirski 23910af6235SAndy Lutomirski *new_asid = asid; 24010af6235SAndy Lutomirski *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < 24110af6235SAndy Lutomirski next_tlb_gen); 24210af6235SAndy Lutomirski return; 24310af6235SAndy Lutomirski } 24410af6235SAndy Lutomirski 24510af6235SAndy Lutomirski /* 24610af6235SAndy Lutomirski * We don't currently own an ASID slot on this CPU. 24710af6235SAndy Lutomirski * Allocate a slot. 24810af6235SAndy Lutomirski */ 24910af6235SAndy Lutomirski *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; 25010af6235SAndy Lutomirski if (*new_asid >= TLB_NR_DYN_ASIDS) { 25110af6235SAndy Lutomirski *new_asid = 0; 25210af6235SAndy Lutomirski this_cpu_write(cpu_tlbstate.next_asid, 1); 25310af6235SAndy Lutomirski } 25410af6235SAndy Lutomirski *need_flush = true; 25510af6235SAndy Lutomirski } 25610af6235SAndy Lutomirski 257127ac915SThomas Gleixner /* 258127ac915SThomas Gleixner * Given an ASID, flush the corresponding user ASID. We can delay this 259127ac915SThomas Gleixner * until the next time we switch to it. 260127ac915SThomas Gleixner * 261127ac915SThomas Gleixner * See SWITCH_TO_USER_CR3. 
262127ac915SThomas Gleixner */ 263127ac915SThomas Gleixner static inline void invalidate_user_asid(u16 asid) 264127ac915SThomas Gleixner { 265127ac915SThomas Gleixner /* There is no user ASID if address space separation is off */ 266127ac915SThomas Gleixner if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) 267127ac915SThomas Gleixner return; 268127ac915SThomas Gleixner 269127ac915SThomas Gleixner /* 270127ac915SThomas Gleixner * We only have a single ASID if PCID is off and the CR3 271127ac915SThomas Gleixner * write will have flushed it. 272127ac915SThomas Gleixner */ 273127ac915SThomas Gleixner if (!cpu_feature_enabled(X86_FEATURE_PCID)) 274127ac915SThomas Gleixner return; 275127ac915SThomas Gleixner 276127ac915SThomas Gleixner if (!static_cpu_has(X86_FEATURE_PTI)) 277127ac915SThomas Gleixner return; 278127ac915SThomas Gleixner 279127ac915SThomas Gleixner __set_bit(kern_pcid(asid), 280127ac915SThomas Gleixner (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); 281127ac915SThomas Gleixner } 282127ac915SThomas Gleixner 28382721d8bSKirill A. Shutemov static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, 28482721d8bSKirill A. Shutemov bool need_flush) 28548e11198SDave Hansen { 28648e11198SDave Hansen unsigned long new_mm_cr3; 28748e11198SDave Hansen 28848e11198SDave Hansen if (need_flush) { 2896fd166aaSPeter Zijlstra invalidate_user_asid(new_asid); 29082721d8bSKirill A. Shutemov new_mm_cr3 = build_cr3(pgdir, new_asid, lam); 29148e11198SDave Hansen } else { 29282721d8bSKirill A. Shutemov new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam); 29348e11198SDave Hansen } 29448e11198SDave Hansen 29548e11198SDave Hansen /* 29648e11198SDave Hansen * Caution: many callers of this function expect 29748e11198SDave Hansen * that load_cr3() is serializing and orders TLB 29848e11198SDave Hansen * fills with respect to the mm_cpumask writes. 29948e11198SDave Hansen */ 30048e11198SDave Hansen write_cr3(new_mm_cr3); 30148e11198SDave Hansen } 30248e11198SDave Hansen 30355f4949fSIngo Molnar void leave_mm(int cpu) 30455f4949fSIngo Molnar { 3053d28ebceSAndy Lutomirski struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); 3063d28ebceSAndy Lutomirski 3073d28ebceSAndy Lutomirski /* 3083d28ebceSAndy Lutomirski * It's plausible that we're in lazy TLB mode while our mm is init_mm. 3093d28ebceSAndy Lutomirski * If so, our callers still expect us to flush the TLB, but there 3103d28ebceSAndy Lutomirski * aren't any user TLB entries in init_mm to worry about. 3113d28ebceSAndy Lutomirski * 3123d28ebceSAndy Lutomirski * This needs to happen before any other sanity checks due to 3133d28ebceSAndy Lutomirski * intel_idle's shenanigans. 3143d28ebceSAndy Lutomirski */ 3153d28ebceSAndy Lutomirski if (loaded_mm == &init_mm) 3163d28ebceSAndy Lutomirski return; 3173d28ebceSAndy Lutomirski 31894b1b03bSAndy Lutomirski /* Warn if we're not lazy. 
*/ 3192f4305b1SNadav Amit WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy)); 3203d28ebceSAndy Lutomirski 3213d28ebceSAndy Lutomirski switch_mm(NULL, &init_mm, NULL); 322a6fca40fSSuresh Siddha } 32367535736SAndy Lutomirski EXPORT_SYMBOL_GPL(leave_mm); 32455f4949fSIngo Molnar 32569c0319aSAndy Lutomirski void switch_mm(struct mm_struct *prev, struct mm_struct *next, 32669c0319aSAndy Lutomirski struct task_struct *tsk) 32769c0319aSAndy Lutomirski { 328078194f8SAndy Lutomirski unsigned long flags; 329078194f8SAndy Lutomirski 330078194f8SAndy Lutomirski local_irq_save(flags); 331078194f8SAndy Lutomirski switch_mm_irqs_off(prev, next, tsk); 332078194f8SAndy Lutomirski local_irq_restore(flags); 333078194f8SAndy Lutomirski } 334078194f8SAndy Lutomirski 335b5f06f64SBalbir Singh /* 336b5f06f64SBalbir Singh * Invoked from return to user/guest by a task that opted-in to L1D 337b5f06f64SBalbir Singh * flushing but ended up running on an SMT enabled core due to wrong 338b5f06f64SBalbir Singh * affinity settings or CPU hotplug. This is part of the paranoid L1D flush 339b5f06f64SBalbir Singh * contract which this task requested. 340b5f06f64SBalbir Singh */ 341b5f06f64SBalbir Singh static void l1d_flush_force_sigbus(struct callback_head *ch) 342b5f06f64SBalbir Singh { 343b5f06f64SBalbir Singh force_sig(SIGBUS); 344b5f06f64SBalbir Singh } 345b5f06f64SBalbir Singh 346b5f06f64SBalbir Singh static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, 347b5f06f64SBalbir Singh struct task_struct *next) 348b5f06f64SBalbir Singh { 349b5f06f64SBalbir Singh /* Flush L1D if the outgoing task requests it */ 350b5f06f64SBalbir Singh if (prev_mm & LAST_USER_MM_L1D_FLUSH) 351b5f06f64SBalbir Singh wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 352b5f06f64SBalbir Singh 353b5f06f64SBalbir Singh /* Check whether the incoming task opted in for L1D flush */ 354b5f06f64SBalbir Singh if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) 355b5f06f64SBalbir Singh return; 356b5f06f64SBalbir Singh 357b5f06f64SBalbir Singh /* 358b5f06f64SBalbir Singh * Validate that it is not running on an SMT sibling as this would 359b5f06f64SBalbir Singh * make the excercise pointless because the siblings share L1D. If 360b5f06f64SBalbir Singh * it runs on a SMT sibling, notify it with SIGBUS on return to 361b5f06f64SBalbir Singh * user/guest 362b5f06f64SBalbir Singh */ 363b5f06f64SBalbir Singh if (this_cpu_read(cpu_info.smt_active)) { 364b5f06f64SBalbir Singh clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); 365b5f06f64SBalbir Singh next->l1d_flush_kill.func = l1d_flush_force_sigbus; 366b5f06f64SBalbir Singh task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); 367b5f06f64SBalbir Singh } 368b5f06f64SBalbir Singh } 369b5f06f64SBalbir Singh 370371b09c6SBalbir Singh static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) 371dbfe2953SJiri Kosina { 372dca99fb6SMark Rutland unsigned long next_tif = read_task_thread_flags(next); 373371b09c6SBalbir Singh unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; 3744c71a2b6SThomas Gleixner 375b5f06f64SBalbir Singh /* 376b5f06f64SBalbir Singh * Ensure that the bit shift above works as expected and the two flags 377b5f06f64SBalbir Singh * end up in bit 0 and 1. 
378b5f06f64SBalbir Singh */ 379b5f06f64SBalbir Singh BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1); 380b5f06f64SBalbir Singh 381371b09c6SBalbir Singh return (unsigned long)next->mm | spec_bits; 3824c71a2b6SThomas Gleixner } 3834c71a2b6SThomas Gleixner 384371b09c6SBalbir Singh static void cond_mitigation(struct task_struct *next) 3854c71a2b6SThomas Gleixner { 386371b09c6SBalbir Singh unsigned long prev_mm, next_mm; 387371b09c6SBalbir Singh 3884c71a2b6SThomas Gleixner if (!next || !next->mm) 3894c71a2b6SThomas Gleixner return; 3904c71a2b6SThomas Gleixner 391371b09c6SBalbir Singh next_mm = mm_mangle_tif_spec_bits(next); 392371b09c6SBalbir Singh prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); 393371b09c6SBalbir Singh 394dbfe2953SJiri Kosina /* 395371b09c6SBalbir Singh * Avoid user/user BTB poisoning by flushing the branch predictor 396371b09c6SBalbir Singh * when switching between processes. This stops one process from 397371b09c6SBalbir Singh * doing Spectre-v2 attacks on another. 398371b09c6SBalbir Singh * 3994c71a2b6SThomas Gleixner * Both, the conditional and the always IBPB mode use the mm 4004c71a2b6SThomas Gleixner * pointer to avoid the IBPB when switching between tasks of the 4014c71a2b6SThomas Gleixner * same process. Using the mm pointer instead of mm->context.ctx_id 4024c71a2b6SThomas Gleixner * opens a hypothetical hole vs. mm_struct reuse, which is more or 4034c71a2b6SThomas Gleixner * less impossible to control by an attacker. Aside of that it 4044c71a2b6SThomas Gleixner * would only affect the first schedule so the theoretically 4054c71a2b6SThomas Gleixner * exposed data is not really interesting. 406dbfe2953SJiri Kosina */ 4074c71a2b6SThomas Gleixner if (static_branch_likely(&switch_mm_cond_ibpb)) { 4084c71a2b6SThomas Gleixner /* 4094c71a2b6SThomas Gleixner * This is a bit more complex than the always mode because 4104c71a2b6SThomas Gleixner * it has to handle two cases: 4114c71a2b6SThomas Gleixner * 4124c71a2b6SThomas Gleixner * 1) Switch from a user space task (potential attacker) 4134c71a2b6SThomas Gleixner * which has TIF_SPEC_IB set to a user space task 4144c71a2b6SThomas Gleixner * (potential victim) which has TIF_SPEC_IB not set. 4154c71a2b6SThomas Gleixner * 4164c71a2b6SThomas Gleixner * 2) Switch from a user space task (potential attacker) 4174c71a2b6SThomas Gleixner * which has TIF_SPEC_IB not set to a user space task 4184c71a2b6SThomas Gleixner * (potential victim) which has TIF_SPEC_IB set. 4194c71a2b6SThomas Gleixner * 4204c71a2b6SThomas Gleixner * This could be done by unconditionally issuing IBPB when 4214c71a2b6SThomas Gleixner * a task which has TIF_SPEC_IB set is either scheduled in 4224c71a2b6SThomas Gleixner * or out. Though that results in two flushes when: 4234c71a2b6SThomas Gleixner * 4244c71a2b6SThomas Gleixner * - the same user space task is scheduled out and later 4254c71a2b6SThomas Gleixner * scheduled in again and only a kernel thread ran in 4264c71a2b6SThomas Gleixner * between. 4274c71a2b6SThomas Gleixner * 4284c71a2b6SThomas Gleixner * - a user space task belonging to the same process is 4294c71a2b6SThomas Gleixner * scheduled in after a kernel thread ran in between 4304c71a2b6SThomas Gleixner * 4314c71a2b6SThomas Gleixner * - a user space task belonging to the same process is 4324c71a2b6SThomas Gleixner * scheduled in immediately. 4334c71a2b6SThomas Gleixner * 4344c71a2b6SThomas Gleixner * Optimize this with reasonably small overhead for the 4354c71a2b6SThomas Gleixner * above cases. 
Mangle the TIF_SPEC_IB bit into the mm 4364c71a2b6SThomas Gleixner * pointer of the incoming task which is stored in 437371b09c6SBalbir Singh * cpu_tlbstate.last_user_mm_spec for comparison. 438371b09c6SBalbir Singh * 4394c71a2b6SThomas Gleixner * Issue IBPB only if the mm's are different and one or 4404c71a2b6SThomas Gleixner * both have the IBPB bit set. 4414c71a2b6SThomas Gleixner */ 4424c71a2b6SThomas Gleixner if (next_mm != prev_mm && 4434c71a2b6SThomas Gleixner (next_mm | prev_mm) & LAST_USER_MM_IBPB) 4444c71a2b6SThomas Gleixner indirect_branch_prediction_barrier(); 4454c71a2b6SThomas Gleixner } 4464c71a2b6SThomas Gleixner 4474c71a2b6SThomas Gleixner if (static_branch_unlikely(&switch_mm_always_ibpb)) { 4484c71a2b6SThomas Gleixner /* 4494c71a2b6SThomas Gleixner * Only flush when switching to a user space task with a 4504c71a2b6SThomas Gleixner * different context than the user space task which ran 4514c71a2b6SThomas Gleixner * last on this CPU. 4524c71a2b6SThomas Gleixner */ 453371b09c6SBalbir Singh if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != 454371b09c6SBalbir Singh (unsigned long)next->mm) 4554c71a2b6SThomas Gleixner indirect_branch_prediction_barrier(); 4564c71a2b6SThomas Gleixner } 457371b09c6SBalbir Singh 458b5f06f64SBalbir Singh if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { 459b5f06f64SBalbir Singh /* 460b5f06f64SBalbir Singh * Flush L1D when the outgoing task requested it and/or 461b5f06f64SBalbir Singh * check whether the incoming task requested L1D flushing 462b5f06f64SBalbir Singh * and ended up on an SMT sibling. 463b5f06f64SBalbir Singh */ 464b5f06f64SBalbir Singh if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH)) 465b5f06f64SBalbir Singh l1d_flush_evaluate(prev_mm, next_mm, next); 466b5f06f64SBalbir Singh } 467b5f06f64SBalbir Singh 468371b09c6SBalbir Singh this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); 469dbfe2953SJiri Kosina } 470dbfe2953SJiri Kosina 471cb2a0235SThomas Gleixner #ifdef CONFIG_PERF_EVENTS 472cb2a0235SThomas Gleixner static inline void cr4_update_pce_mm(struct mm_struct *mm) 473cb2a0235SThomas Gleixner { 474cb2a0235SThomas Gleixner if (static_branch_unlikely(&rdpmc_always_available_key) || 475cb2a0235SThomas Gleixner (!static_branch_unlikely(&rdpmc_never_available_key) && 4765471eea5SKan Liang atomic_read(&mm->context.perf_rdpmc_allowed))) { 4775471eea5SKan Liang /* 4785471eea5SKan Liang * Clear the existing dirty counters to 4795471eea5SKan Liang * prevent the leak for an RDPMC task. 
4805471eea5SKan Liang */ 4815471eea5SKan Liang perf_clear_dirty_counters(); 482cb2a0235SThomas Gleixner cr4_set_bits_irqsoff(X86_CR4_PCE); 4835471eea5SKan Liang } else 484cb2a0235SThomas Gleixner cr4_clear_bits_irqsoff(X86_CR4_PCE); 485cb2a0235SThomas Gleixner } 486cb2a0235SThomas Gleixner 487cb2a0235SThomas Gleixner void cr4_update_pce(void *ignored) 488cb2a0235SThomas Gleixner { 489cb2a0235SThomas Gleixner cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm)); 490cb2a0235SThomas Gleixner } 491cb2a0235SThomas Gleixner 492cb2a0235SThomas Gleixner #else 493cb2a0235SThomas Gleixner static inline void cr4_update_pce_mm(struct mm_struct *mm) { } 494cb2a0235SThomas Gleixner #endif 495cb2a0235SThomas Gleixner 496078194f8SAndy Lutomirski void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, 497078194f8SAndy Lutomirski struct task_struct *tsk) 498078194f8SAndy Lutomirski { 4993d28ebceSAndy Lutomirski struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); 50010af6235SAndy Lutomirski u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 5012f4305b1SNadav Amit bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); 50294b1b03bSAndy Lutomirski unsigned cpu = smp_processor_id(); 50317c72808SYosry Ahmed unsigned long new_lam; 50494b1b03bSAndy Lutomirski u64 next_tlb_gen; 50512c4d978SRik van Riel bool need_flush; 50612c4d978SRik van Riel u16 new_asid; 50769c0319aSAndy Lutomirski 5083d28ebceSAndy Lutomirski /* 50994b1b03bSAndy Lutomirski * NB: The scheduler will call us with prev == next when switching 51094b1b03bSAndy Lutomirski * from lazy TLB mode to normal mode if active_mm isn't changing. 51194b1b03bSAndy Lutomirski * When this happens, we don't assume that CR3 (and hence 51294b1b03bSAndy Lutomirski * cpu_tlbstate.loaded_mm) matches next. 5133d28ebceSAndy Lutomirski * 5143d28ebceSAndy Lutomirski * NB: leave_mm() calls us with prev == NULL and tsk == NULL. 5153d28ebceSAndy Lutomirski */ 5163d28ebceSAndy Lutomirski 5174c1ba392SNadav Amit /* We don't want flush_tlb_func() to run concurrently with us. */ 51894b1b03bSAndy Lutomirski if (IS_ENABLED(CONFIG_PROVE_LOCKING)) 51994b1b03bSAndy Lutomirski WARN_ON_ONCE(!irqs_disabled()); 52094b1b03bSAndy Lutomirski 52194b1b03bSAndy Lutomirski /* 52294b1b03bSAndy Lutomirski * Verify that CR3 is what we think it is. This will catch 52394b1b03bSAndy Lutomirski * hypothetical buggy code that directly switches to swapper_pg_dir 52410af6235SAndy Lutomirski * without going through leave_mm() / switch_mm_irqs_off() or that 52510af6235SAndy Lutomirski * does something like write_cr3(read_cr3_pa()). 526a376e7f9SAndy Lutomirski * 527a376e7f9SAndy Lutomirski * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() 528a376e7f9SAndy Lutomirski * isn't free. 52994b1b03bSAndy Lutomirski */ 530a376e7f9SAndy Lutomirski #ifdef CONFIG_DEBUG_VM 53182721d8bSKirill A. Shutemov if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, 53282721d8bSKirill A. Shutemov tlbstate_lam_cr3_mask()))) { 533a376e7f9SAndy Lutomirski /* 534a376e7f9SAndy Lutomirski * If we were to BUG here, we'd be very likely to kill 535a376e7f9SAndy Lutomirski * the system so hard that we don't see the call trace. 536a376e7f9SAndy Lutomirski * Try to recover instead by ignoring the error and doing 537a376e7f9SAndy Lutomirski * a global flush to minimize the chance of corruption. 538a376e7f9SAndy Lutomirski * 539a376e7f9SAndy Lutomirski * (This is far from being a fully correct recovery. 
540a376e7f9SAndy Lutomirski * Architecturally, the CPU could prefetch something 541a376e7f9SAndy Lutomirski * back into an incorrect ASID slot and leave it there 542a376e7f9SAndy Lutomirski * to cause trouble down the road. It's better than 543a376e7f9SAndy Lutomirski * nothing, though.) 544a376e7f9SAndy Lutomirski */ 545a376e7f9SAndy Lutomirski __flush_tlb_all(); 546a376e7f9SAndy Lutomirski } 547a376e7f9SAndy Lutomirski #endif 54809c5272eSNadav Amit if (was_lazy) 5492f4305b1SNadav Amit this_cpu_write(cpu_tlbstate_shared.is_lazy, false); 5503d28ebceSAndy Lutomirski 551306e0604SMathieu Desnoyers /* 55210bcc80eSMathieu Desnoyers * The membarrier system call requires a full memory barrier and 55310bcc80eSMathieu Desnoyers * core serialization before returning to user-space, after 554a493d1caSAndy Lutomirski * storing to rq->curr, when changing mm. This is because 555a493d1caSAndy Lutomirski * membarrier() sends IPIs to all CPUs that are in the target mm 556a493d1caSAndy Lutomirski * to make them issue memory barriers. However, if another CPU 557a493d1caSAndy Lutomirski * switches to/from the target mm concurrently with 558a493d1caSAndy Lutomirski * membarrier(), it can cause that CPU not to receive an IPI 559a493d1caSAndy Lutomirski * when it really should issue a memory barrier. Writing to CR3 560a493d1caSAndy Lutomirski * provides that full memory barrier and core serializing 561a493d1caSAndy Lutomirski * instruction. 562306e0604SMathieu Desnoyers */ 5633d28ebceSAndy Lutomirski if (real_prev == next) { 56482721d8bSKirill A. Shutemov /* Not actually switching mm's */ 565e8b9b0ccSAndy Lutomirski VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != 56694b1b03bSAndy Lutomirski next->context.ctx_id); 56794b1b03bSAndy Lutomirski 5683d28ebceSAndy Lutomirski /* 56982721d8bSKirill A. Shutemov * If this races with another thread that enables lam, 'new_lam' 57082721d8bSKirill A. Shutemov * might not match tlbstate_lam_cr3_mask(). 57182721d8bSKirill A. Shutemov */ 57282721d8bSKirill A. Shutemov 57382721d8bSKirill A. Shutemov /* 574145f573bSRik van Riel * Even in lazy TLB mode, the CPU should stay set in the 575145f573bSRik van Riel * mm_cpumask. The TLB shootdown code can figure out from 5762f4305b1SNadav Amit * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. 5773d28ebceSAndy Lutomirski */ 578b956575bSAndy Lutomirski if (WARN_ON_ONCE(real_prev != &init_mm && 579b956575bSAndy Lutomirski !cpumask_test_cpu(cpu, mm_cpumask(next)))) 58094b1b03bSAndy Lutomirski cpumask_set_cpu(cpu, mm_cpumask(next)); 58194b1b03bSAndy Lutomirski 582145f573bSRik van Riel /* 583145f573bSRik van Riel * If the CPU is not in lazy TLB mode, we are just switching 584145f573bSRik van Riel * from one thread in a process to another thread in the same 585145f573bSRik van Riel * process. No TLB flush required. 586145f573bSRik van Riel */ 587145f573bSRik van Riel if (!was_lazy) 588b956575bSAndy Lutomirski return; 589145f573bSRik van Riel 590145f573bSRik van Riel /* 591145f573bSRik van Riel * Read the tlb_gen to check whether a flush is needed. 592145f573bSRik van Riel * If the TLB is up to date, just use it. 593145f573bSRik van Riel * The barrier synchronizes with the tlb_gen increment in 594145f573bSRik van Riel * the TLB shootdown code. 
595145f573bSRik van Riel */ 596145f573bSRik van Riel smp_mb(); 597145f573bSRik van Riel next_tlb_gen = atomic64_read(&next->context.tlb_gen); 598145f573bSRik van Riel if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == 599145f573bSRik van Riel next_tlb_gen) 600145f573bSRik van Riel return; 601145f573bSRik van Riel 602145f573bSRik van Riel /* 603145f573bSRik van Riel * TLB contents went out of date while we were in lazy 604145f573bSRik van Riel * mode. Fall through to the TLB switching code below. 605145f573bSRik van Riel */ 606145f573bSRik van Riel new_asid = prev_asid; 607145f573bSRik van Riel need_flush = true; 60894b1b03bSAndy Lutomirski } else { 60918bf3c3eSTim Chen /* 610371b09c6SBalbir Singh * Apply process to process speculation vulnerability 611371b09c6SBalbir Singh * mitigations if applicable. 61218bf3c3eSTim Chen */ 613371b09c6SBalbir Singh cond_mitigation(tsk); 61494b1b03bSAndy Lutomirski 615e9d8c615SRik van Riel /* 616e9d8c615SRik van Riel * Stop remote flushes for the previous mm. 617e9d8c615SRik van Riel * Skip kernel threads; we never send init_mm TLB flushing IPIs, 618e9d8c615SRik van Riel * but the bitmap manipulation can cause cache line contention. 619e9d8c615SRik van Riel */ 620e9d8c615SRik van Riel if (real_prev != &init_mm) { 621e9d8c615SRik van Riel VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, 622e9d8c615SRik van Riel mm_cpumask(real_prev))); 62394b1b03bSAndy Lutomirski cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); 624e9d8c615SRik van Riel } 625e37e43a4SAndy Lutomirski 62617c72808SYosry Ahmed /* Start receiving IPIs and then read tlb_gen (and LAM below) */ 627e9d8c615SRik van Riel if (next != &init_mm) 62894b1b03bSAndy Lutomirski cpumask_set_cpu(cpu, mm_cpumask(next)); 62994b1b03bSAndy Lutomirski next_tlb_gen = atomic64_read(&next->context.tlb_gen); 63094b1b03bSAndy Lutomirski 63110af6235SAndy Lutomirski choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); 63269c0319aSAndy Lutomirski 6334012e77aSAndy Lutomirski /* Let nmi_uaccess_okay() know that we're changing CR3. */ 6344012e77aSAndy Lutomirski this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); 6354012e77aSAndy Lutomirski barrier(); 63612c4d978SRik van Riel } 6374012e77aSAndy Lutomirski 63817c72808SYosry Ahmed new_lam = mm_lam_cr3_mask(next); 63982721d8bSKirill A. Shutemov set_tlbstate_lam_mode(next); 64010af6235SAndy Lutomirski if (need_flush) { 64110af6235SAndy Lutomirski this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 64210af6235SAndy Lutomirski this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 64382721d8bSKirill A. Shutemov load_new_mm_cr3(next->pgd, new_asid, new_lam, true); 64467535736SAndy Lutomirski 645bf9282dcSPeter Zijlstra trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 64610af6235SAndy Lutomirski } else { 64710af6235SAndy Lutomirski /* The new ASID is already up to date. */ 64882721d8bSKirill A. Shutemov load_new_mm_cr3(next->pgd, new_asid, new_lam, false); 64967535736SAndy Lutomirski 650bf9282dcSPeter Zijlstra trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); 65110af6235SAndy Lutomirski } 65210af6235SAndy Lutomirski 6534012e77aSAndy Lutomirski /* Make sure we write CR3 before loaded_mm. 
*/ 6544012e77aSAndy Lutomirski barrier(); 6554012e77aSAndy Lutomirski 65610af6235SAndy Lutomirski this_cpu_write(cpu_tlbstate.loaded_mm, next); 65710af6235SAndy Lutomirski this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); 65869c0319aSAndy Lutomirski 659145f573bSRik van Riel if (next != real_prev) { 660cb2a0235SThomas Gleixner cr4_update_pce_mm(next); 66173534258SAndy Lutomirski switch_ldt(real_prev, next); 66269c0319aSAndy Lutomirski } 663145f573bSRik van Riel } 66469c0319aSAndy Lutomirski 665b0579adeSAndy Lutomirski /* 6664e57b946SAndy Lutomirski * Please ignore the name of this function. It should be called 6674e57b946SAndy Lutomirski * switch_to_kernel_thread(). 6684e57b946SAndy Lutomirski * 669b956575bSAndy Lutomirski * enter_lazy_tlb() is a hint from the scheduler that we are entering a 670b956575bSAndy Lutomirski * kernel thread or other context without an mm. Acceptable implementations 671b956575bSAndy Lutomirski * include doing nothing whatsoever, switching to init_mm, or various clever 672b956575bSAndy Lutomirski * lazy tricks to try to minimize TLB flushes. 673b956575bSAndy Lutomirski * 674b956575bSAndy Lutomirski * The scheduler reserves the right to call enter_lazy_tlb() several times 675b956575bSAndy Lutomirski * in a row. It will notify us that we're going back to a real mm by 676b956575bSAndy Lutomirski * calling switch_mm_irqs_off(). 677b956575bSAndy Lutomirski */ 678b956575bSAndy Lutomirski void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 679b956575bSAndy Lutomirski { 680b956575bSAndy Lutomirski if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 681b956575bSAndy Lutomirski return; 682b956575bSAndy Lutomirski 6832f4305b1SNadav Amit this_cpu_write(cpu_tlbstate_shared.is_lazy, true); 684b956575bSAndy Lutomirski } 685b956575bSAndy Lutomirski 686b956575bSAndy Lutomirski /* 68772c0098dSAndy Lutomirski * Call this when reinitializing a CPU. It fixes the following potential 68872c0098dSAndy Lutomirski * problems: 68972c0098dSAndy Lutomirski * 69072c0098dSAndy Lutomirski * - The ASID changed from what cpu_tlbstate thinks it is (most likely 69172c0098dSAndy Lutomirski * because the CPU was taken down and came back up with CR3's PCID 69272c0098dSAndy Lutomirski * bits clear. CPU hotplug can do this. 69372c0098dSAndy Lutomirski * 69472c0098dSAndy Lutomirski * - The TLB contains junk in slots corresponding to inactive ASIDs. 69572c0098dSAndy Lutomirski * 69672c0098dSAndy Lutomirski * - The CPU went so far out to lunch that it may have missed a TLB 69772c0098dSAndy Lutomirski * flush. 69872c0098dSAndy Lutomirski */ 69972c0098dSAndy Lutomirski void initialize_tlbstate_and_flush(void) 70072c0098dSAndy Lutomirski { 70172c0098dSAndy Lutomirski int i; 70272c0098dSAndy Lutomirski struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); 70372c0098dSAndy Lutomirski u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); 70472c0098dSAndy Lutomirski unsigned long cr3 = __read_cr3(); 70572c0098dSAndy Lutomirski 70672c0098dSAndy Lutomirski /* Assert that CR3 already references the right mm. */ 70772c0098dSAndy Lutomirski WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); 70872c0098dSAndy Lutomirski 70982721d8bSKirill A. Shutemov /* LAM expected to be disabled */ 71082721d8bSKirill A. Shutemov WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); 71182721d8bSKirill A. Shutemov WARN_ON(mm_lam_cr3_mask(mm)); 71282721d8bSKirill A. Shutemov 71372c0098dSAndy Lutomirski /* 71472c0098dSAndy Lutomirski * Assert that CR4.PCIDE is set if needed. 
(CR4.PCIDE initialization 71572c0098dSAndy Lutomirski * doesn't work like other CR4 bits because it can only be set from 71672c0098dSAndy Lutomirski * long mode.) 71772c0098dSAndy Lutomirski */ 7187898f796SAndy Lutomirski WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && 71972c0098dSAndy Lutomirski !(cr4_read_shadow() & X86_CR4_PCIDE)); 72072c0098dSAndy Lutomirski 72182721d8bSKirill A. Shutemov /* Disable LAM, force ASID 0 and force a TLB flush. */ 72282721d8bSKirill A. Shutemov write_cr3(build_cr3(mm->pgd, 0, 0)); 72372c0098dSAndy Lutomirski 72472c0098dSAndy Lutomirski /* Reinitialize tlbstate. */ 725371b09c6SBalbir Singh this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); 72672c0098dSAndy Lutomirski this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); 72772c0098dSAndy Lutomirski this_cpu_write(cpu_tlbstate.next_asid, 1); 72872c0098dSAndy Lutomirski this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); 72972c0098dSAndy Lutomirski this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); 73082721d8bSKirill A. Shutemov set_tlbstate_lam_mode(mm); 73172c0098dSAndy Lutomirski 73272c0098dSAndy Lutomirski for (i = 1; i < TLB_NR_DYN_ASIDS; i++) 73372c0098dSAndy Lutomirski this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); 73472c0098dSAndy Lutomirski } 73572c0098dSAndy Lutomirski 73672c0098dSAndy Lutomirski /* 7374c1ba392SNadav Amit * flush_tlb_func()'s memory ordering requirement is that any 738b0579adeSAndy Lutomirski * TLB fills that happen after we flush the TLB are ordered after we 739b0579adeSAndy Lutomirski * read active_mm's tlb_gen. We don't need any explicit barriers 740b0579adeSAndy Lutomirski * because all x86 flush operations are serializing and the 741b0579adeSAndy Lutomirski * atomic64_read operation won't be reordered by the compiler. 742b0579adeSAndy Lutomirski */ 7434c1ba392SNadav Amit static void flush_tlb_func(void *info) 74455f4949fSIngo Molnar { 745b0579adeSAndy Lutomirski /* 746b0579adeSAndy Lutomirski * We have three different tlb_gen values in here. They are: 747b0579adeSAndy Lutomirski * 748b0579adeSAndy Lutomirski * - mm_tlb_gen: the latest generation. 749b0579adeSAndy Lutomirski * - local_tlb_gen: the generation that this CPU has already caught 750b0579adeSAndy Lutomirski * up to. 751b0579adeSAndy Lutomirski * - f->new_tlb_gen: the generation that the requester of the flush 752b0579adeSAndy Lutomirski * wants us to catch up to. 753b0579adeSAndy Lutomirski */ 7544c1ba392SNadav Amit const struct flush_tlb_info *f = info; 755b0579adeSAndy Lutomirski struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); 75610af6235SAndy Lutomirski u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 75710af6235SAndy Lutomirski u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); 7584c1ba392SNadav Amit bool local = smp_processor_id() == f->initiating_cpu; 7594c1ba392SNadav Amit unsigned long nr_invalidate = 0; 760aa442849SNadav Amit u64 mm_tlb_gen; 761b0579adeSAndy Lutomirski 762bc0d5a89SAndy Lutomirski /* This code cannot presently handle being reentered. 
*/ 763bc0d5a89SAndy Lutomirski VM_WARN_ON(!irqs_disabled()); 764bc0d5a89SAndy Lutomirski 7654c1ba392SNadav Amit if (!local) { 7664c1ba392SNadav Amit inc_irq_stat(irq_tlb_count); 7674c1ba392SNadav Amit count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 7684c1ba392SNadav Amit 7694c1ba392SNadav Amit /* Can only happen on remote CPUs */ 7704c1ba392SNadav Amit if (f->mm && f->mm != loaded_mm) 7714c1ba392SNadav Amit return; 7724c1ba392SNadav Amit } 7734c1ba392SNadav Amit 774b956575bSAndy Lutomirski if (unlikely(loaded_mm == &init_mm)) 775b956575bSAndy Lutomirski return; 776b956575bSAndy Lutomirski 77710af6235SAndy Lutomirski VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != 778b0579adeSAndy Lutomirski loaded_mm->context.ctx_id); 779b0579adeSAndy Lutomirski 7802f4305b1SNadav Amit if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) { 781b0579adeSAndy Lutomirski /* 782b956575bSAndy Lutomirski * We're in lazy mode. We need to at least flush our 783b956575bSAndy Lutomirski * paging-structure cache to avoid speculatively reading 784b956575bSAndy Lutomirski * garbage into our TLB. Since switching to init_mm is barely 785b956575bSAndy Lutomirski * slower than a minimal flush, just switch to init_mm. 786145f573bSRik van Riel * 7874ce94eabSNadav Amit * This should be rare, with native_flush_tlb_multi() skipping 788145f573bSRik van Riel * IPIs to lazy TLB mode CPUs. 789b0579adeSAndy Lutomirski */ 790b956575bSAndy Lutomirski switch_mm_irqs_off(NULL, &init_mm, NULL); 791b3b90e5aSAndy Lutomirski return; 792b3b90e5aSAndy Lutomirski } 793b3b90e5aSAndy Lutomirski 7948f1d56f6SNadav Amit if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && 7958f1d56f6SNadav Amit f->new_tlb_gen <= local_tlb_gen)) { 796aa442849SNadav Amit /* 797aa442849SNadav Amit * The TLB is already up to date in respect to f->new_tlb_gen. 798aa442849SNadav Amit * While the core might be still behind mm_tlb_gen, checking 799aa442849SNadav Amit * mm_tlb_gen unnecessarily would have negative caching effects 800aa442849SNadav Amit * so avoid it. 801aa442849SNadav Amit */ 802aa442849SNadav Amit return; 803aa442849SNadav Amit } 804aa442849SNadav Amit 805aa442849SNadav Amit /* 806aa442849SNadav Amit * Defer mm_tlb_gen reading as long as possible to avoid cache 807aa442849SNadav Amit * contention. 808aa442849SNadav Amit */ 809aa442849SNadav Amit mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); 810aa442849SNadav Amit 811b0579adeSAndy Lutomirski if (unlikely(local_tlb_gen == mm_tlb_gen)) { 812b0579adeSAndy Lutomirski /* 813b0579adeSAndy Lutomirski * There's nothing to do: we're already up to date. This can 814b0579adeSAndy Lutomirski * happen if two concurrent flushes happen -- the first flush to 815b0579adeSAndy Lutomirski * be handled can catch us all the way up, leaving no work for 816b0579adeSAndy Lutomirski * the second flush. 817b0579adeSAndy Lutomirski */ 8184c1ba392SNadav Amit goto done; 819b0579adeSAndy Lutomirski } 820b0579adeSAndy Lutomirski 821b0579adeSAndy Lutomirski WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); 822b0579adeSAndy Lutomirski WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); 823b0579adeSAndy Lutomirski 824b0579adeSAndy Lutomirski /* 825b0579adeSAndy Lutomirski * If we get to this point, we know that our TLB is out of date. 
826b0579adeSAndy Lutomirski * This does not strictly imply that we need to flush (it's 827b0579adeSAndy Lutomirski * possible that f->new_tlb_gen <= local_tlb_gen), but we're 828b0579adeSAndy Lutomirski * going to need to flush in the very near future, so we might 829b0579adeSAndy Lutomirski * as well get it over with. 830b0579adeSAndy Lutomirski * 831b0579adeSAndy Lutomirski * The only question is whether to do a full or partial flush. 832b0579adeSAndy Lutomirski * 833b0579adeSAndy Lutomirski * We do a partial flush if requested and two extra conditions 834b0579adeSAndy Lutomirski * are met: 835b0579adeSAndy Lutomirski * 836b0579adeSAndy Lutomirski * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that 837b0579adeSAndy Lutomirski * we've always done all needed flushes to catch up to 838b0579adeSAndy Lutomirski * local_tlb_gen. If, for example, local_tlb_gen == 2 and 839b0579adeSAndy Lutomirski * f->new_tlb_gen == 3, then we know that the flush needed to bring 840b0579adeSAndy Lutomirski * us up to date for tlb_gen 3 is the partial flush we're 841b0579adeSAndy Lutomirski * processing. 842b0579adeSAndy Lutomirski * 843b0579adeSAndy Lutomirski * As an example of why this check is needed, suppose that there 844b0579adeSAndy Lutomirski * are two concurrent flushes. The first is a full flush that 845b0579adeSAndy Lutomirski * changes context.tlb_gen from 1 to 2. The second is a partial 846b0579adeSAndy Lutomirski * flush that changes context.tlb_gen from 2 to 3. If they get 847b0579adeSAndy Lutomirski * processed on this CPU in reverse order, we'll see 848b0579adeSAndy Lutomirski * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. 8491299ef1dSAndy Lutomirski * If we were to use __flush_tlb_one_user() and set local_tlb_gen to 850b0579adeSAndy Lutomirski * 3, we'd be break the invariant: we'd update local_tlb_gen above 851b0579adeSAndy Lutomirski * 1 without the full flush that's needed for tlb_gen 2. 852b0579adeSAndy Lutomirski * 853d9f6e12fSIngo Molnar * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. 854b0579adeSAndy Lutomirski * Partial TLB flushes are not all that much cheaper than full TLB 855b0579adeSAndy Lutomirski * flushes, so it seems unlikely that it would be a performance win 856b0579adeSAndy Lutomirski * to do a partial flush if that won't bring our TLB fully up to 857b0579adeSAndy Lutomirski * date. By doing a full flush instead, we can increase 858b0579adeSAndy Lutomirski * local_tlb_gen all the way to mm_tlb_gen and we can probably 859b0579adeSAndy Lutomirski * avoid another flush in the very near future. 
860b0579adeSAndy Lutomirski */ 861b0579adeSAndy Lutomirski if (f->end != TLB_FLUSH_ALL && 862b0579adeSAndy Lutomirski f->new_tlb_gen == local_tlb_gen + 1 && 863b0579adeSAndy Lutomirski f->new_tlb_gen == mm_tlb_gen) { 864b0579adeSAndy Lutomirski /* Partial flush */ 865a31acd3eSPeter Zijlstra unsigned long addr = f->start; 866b0579adeSAndy Lutomirski 8678f1d56f6SNadav Amit /* Partial flush cannot have invalid generations */ 8688f1d56f6SNadav Amit VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID); 8698f1d56f6SNadav Amit 8708f1d56f6SNadav Amit /* Partial flush must have valid mm */ 8718f1d56f6SNadav Amit VM_WARN_ON(f->mm == NULL); 8728f1d56f6SNadav Amit 8734c1ba392SNadav Amit nr_invalidate = (f->end - f->start) >> f->stride_shift; 8744c1ba392SNadav Amit 875a2055abeSAndy Lutomirski while (addr < f->end) { 876127ac915SThomas Gleixner flush_tlb_one_user(addr); 877a31acd3eSPeter Zijlstra addr += 1UL << f->stride_shift; 878e7b52ffdSAlex Shi } 879454bbad9SAndy Lutomirski if (local) 880a31acd3eSPeter Zijlstra count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); 881b0579adeSAndy Lutomirski } else { 882b0579adeSAndy Lutomirski /* Full flush. */ 8834c1ba392SNadav Amit nr_invalidate = TLB_FLUSH_ALL; 8844c1ba392SNadav Amit 8852faf153bSThomas Gleixner flush_tlb_local(); 886b0579adeSAndy Lutomirski if (local) 887b0579adeSAndy Lutomirski count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 888e7b52ffdSAlex Shi } 889b0579adeSAndy Lutomirski 890b0579adeSAndy Lutomirski /* Both paths above update our state to mm_tlb_gen. */ 89110af6235SAndy Lutomirski this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); 89255f4949fSIngo Molnar 8934c1ba392SNadav Amit /* Tracing is done in a unified manner to reduce the code size */ 8944c1ba392SNadav Amit done: 8954c1ba392SNadav Amit trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN : 8964c1ba392SNadav Amit (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : 8974c1ba392SNadav Amit TLB_LOCAL_MM_SHOOTDOWN, 8984c1ba392SNadav Amit nr_invalidate); 89955f4949fSIngo Molnar } 90055f4949fSIngo Molnar 901d39268adSDave Hansen static bool tlb_is_not_lazy(int cpu, void *data) 902454bbad9SAndy Lutomirski { 9032f4305b1SNadav Amit return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu); 904454bbad9SAndy Lutomirski } 905454bbad9SAndy Lutomirski 9062f4305b1SNadav Amit DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); 9072f4305b1SNadav Amit EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared); 908454bbad9SAndy Lutomirski 9094ce94eabSNadav Amit STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, 910a2055abeSAndy Lutomirski const struct flush_tlb_info *info) 91155f4949fSIngo Molnar { 9124ce94eabSNadav Amit /* 9134ce94eabSNadav Amit * Do accounting and tracing. Note that there are (and have always been) 9144ce94eabSNadav Amit * cases in which a remote TLB flush will be traced, but eventually 9154ce94eabSNadav Amit * would not happen. 9164ce94eabSNadav Amit */ 917ec659934SMel Gorman count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); 918a2055abeSAndy Lutomirski if (info->end == TLB_FLUSH_ALL) 91918c98243SNadav Amit trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); 92018c98243SNadav Amit else 92118c98243SNadav Amit trace_tlb_flush(TLB_REMOTE_SEND_IPI, 922a2055abeSAndy Lutomirski (info->end - info->start) >> PAGE_SHIFT); 92318c98243SNadav Amit 924145f573bSRik van Riel /* 925145f573bSRik van Riel * If no page tables were freed, we can skip sending IPIs to 926145f573bSRik van Riel * CPUs in lazy TLB mode. 
They will flush the CPU themselves 927145f573bSRik van Riel * at the next context switch. 928145f573bSRik van Riel * 929145f573bSRik van Riel * However, if page tables are getting freed, we need to send the 930145f573bSRik van Riel * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping 931145f573bSRik van Riel * up on the new contents of what used to be page tables, while 932145f573bSRik van Riel * doing a speculative memory access. 933145f573bSRik van Riel */ 934d39268adSDave Hansen if (info->freed_tables) 9354ce94eabSNadav Amit on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); 936d39268adSDave Hansen else 937d39268adSDave Hansen on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, 938d39268adSDave Hansen (void *)info, 1, cpumask); 93955f4949fSIngo Molnar } 94055f4949fSIngo Molnar 9414ce94eabSNadav Amit void flush_tlb_multi(const struct cpumask *cpumask, 94229def599SThomas Gleixner const struct flush_tlb_info *info) 94329def599SThomas Gleixner { 9444ce94eabSNadav Amit __flush_tlb_multi(cpumask, info); 94529def599SThomas Gleixner } 94629def599SThomas Gleixner 947a5102476SDave Hansen /* 948ff61f079SJonathan Corbet * See Documentation/arch/x86/tlb.rst for details. We choose 33 949a5102476SDave Hansen * because it is large enough to cover the vast majority (at 950a5102476SDave Hansen * least 95%) of allocations, and is small enough that we are 951a5102476SDave Hansen * confident it will not cause too much overhead. Each single 952a5102476SDave Hansen * flush is about 100 ns, so this caps the maximum overhead at 953a5102476SDave Hansen * _about_ 3,000 ns. 954a5102476SDave Hansen * 955a5102476SDave Hansen * This is in units of pages. 956a5102476SDave Hansen */ 957935f5839SPeter Zijlstra unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; 958e9f4e0a9SDave Hansen 9593db6d5a5SNadav Amit static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); 9603db6d5a5SNadav Amit 9613db6d5a5SNadav Amit #ifdef CONFIG_DEBUG_VM 9623db6d5a5SNadav Amit static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); 9633db6d5a5SNadav Amit #endif 9643db6d5a5SNadav Amit 9651608e4cfSNadav Amit static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, 9663db6d5a5SNadav Amit unsigned long start, unsigned long end, 9673db6d5a5SNadav Amit unsigned int stride_shift, bool freed_tables, 9683db6d5a5SNadav Amit u64 new_tlb_gen) 9693db6d5a5SNadav Amit { 9703db6d5a5SNadav Amit struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); 9713db6d5a5SNadav Amit 9723db6d5a5SNadav Amit #ifdef CONFIG_DEBUG_VM 9733db6d5a5SNadav Amit /* 9743db6d5a5SNadav Amit * Ensure that the following code is non-reentrant and flush_tlb_info 9753db6d5a5SNadav Amit * is not overwritten. This means no TLB flushing is initiated by 9763db6d5a5SNadav Amit * interrupt handlers and machine-check exception handlers. 
9773db6d5a5SNadav Amit */ 9783db6d5a5SNadav Amit BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); 9793db6d5a5SNadav Amit #endif 9803db6d5a5SNadav Amit 9813db6d5a5SNadav Amit info->start = start; 9823db6d5a5SNadav Amit info->end = end; 9833db6d5a5SNadav Amit info->mm = mm; 9843db6d5a5SNadav Amit info->stride_shift = stride_shift; 9853db6d5a5SNadav Amit info->freed_tables = freed_tables; 9863db6d5a5SNadav Amit info->new_tlb_gen = new_tlb_gen; 9874c1ba392SNadav Amit info->initiating_cpu = smp_processor_id(); 9883db6d5a5SNadav Amit 9893db6d5a5SNadav Amit return info; 9903db6d5a5SNadav Amit } 9913db6d5a5SNadav Amit 9921608e4cfSNadav Amit static void put_flush_tlb_info(void) 9933db6d5a5SNadav Amit { 9943db6d5a5SNadav Amit #ifdef CONFIG_DEBUG_VM 995d9f6e12fSIngo Molnar /* Complete reentrancy prevention checks */ 9963db6d5a5SNadav Amit barrier(); 9973db6d5a5SNadav Amit this_cpu_dec(flush_tlb_info_idx); 9983db6d5a5SNadav Amit #endif 9993db6d5a5SNadav Amit } 10003db6d5a5SNadav Amit 1001611ae8e3SAlex Shi void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 1002016c4d92SRik van Riel unsigned long end, unsigned int stride_shift, 1003016c4d92SRik van Riel bool freed_tables) 1004611ae8e3SAlex Shi { 10053db6d5a5SNadav Amit struct flush_tlb_info *info; 10063db6d5a5SNadav Amit u64 new_tlb_gen; 1007454bbad9SAndy Lutomirski int cpu; 1008e7b52ffdSAlex Shi 1009454bbad9SAndy Lutomirski cpu = get_cpu(); 101071b3c126SAndy Lutomirski 1011454bbad9SAndy Lutomirski /* Should we flush just the requested range? */ 10123db6d5a5SNadav Amit if ((end == TLB_FLUSH_ALL) || 10133db6d5a5SNadav Amit ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { 10143db6d5a5SNadav Amit start = 0; 10153db6d5a5SNadav Amit end = TLB_FLUSH_ALL; 10164995ab9cSDave Hansen } 1017454bbad9SAndy Lutomirski 10183db6d5a5SNadav Amit /* This is also a barrier that synchronizes with switch_mm(). */ 10193db6d5a5SNadav Amit new_tlb_gen = inc_mm_tlb_gen(mm); 10203db6d5a5SNadav Amit 10213db6d5a5SNadav Amit info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, 10223db6d5a5SNadav Amit new_tlb_gen); 10233db6d5a5SNadav Amit 10244ce94eabSNadav Amit /* 10254ce94eabSNadav Amit * flush_tlb_multi() is not optimized for the common case in which only 10264ce94eabSNadav Amit * a local TLB flush is needed. Optimize this use-case by calling 10274ce94eabSNadav Amit * flush_tlb_func_local() directly in this case. 
1043a2055abeSAndy Lutomirski
104455f4949fSIngo Molnar static void do_flush_tlb_all(void *info)
104555f4949fSIngo Molnar {
1046ec659934SMel Gorman	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
104755f4949fSIngo Molnar	__flush_tlb_all();
104855f4949fSIngo Molnar }
104955f4949fSIngo Molnar
105055f4949fSIngo Molnar void flush_tlb_all(void)
105155f4949fSIngo Molnar {
1052ec659934SMel Gorman	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
105355f4949fSIngo Molnar	on_each_cpu(do_flush_tlb_all, NULL, 1);
105455f4949fSIngo Molnar }
10553df3212fSAlex Shi
1056effee4b9SAlex Shi static void do_kernel_range_flush(void *info)
1057effee4b9SAlex Shi {
1058effee4b9SAlex Shi	struct flush_tlb_info *f = info;
1059effee4b9SAlex Shi	unsigned long addr;
1060effee4b9SAlex Shi
1061effee4b9SAlex Shi	/* Flush the range one page at a time with 'invlpg' */
1062a2055abeSAndy Lutomirski	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
106358430c5dSThomas Gleixner		flush_tlb_one_kernel(addr);
1064effee4b9SAlex Shi }
1065effee4b9SAlex Shi
1066effee4b9SAlex Shi void flush_tlb_kernel_range(unsigned long start, unsigned long end)
1067effee4b9SAlex Shi {
1068effee4b9SAlex Shi	/* Balance as user space task's flush, a bit conservative */
1069e9f4e0a9SDave Hansen	if (end == TLB_FLUSH_ALL ||
1070be4ffc0dSAndy Lutomirski	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
1071effee4b9SAlex Shi		on_each_cpu(do_flush_tlb_all, NULL, 1);
1072e9f4e0a9SDave Hansen	} else {
10733db6d5a5SNadav Amit		struct flush_tlb_info *info;
10743db6d5a5SNadav Amit
10753db6d5a5SNadav Amit		preempt_disable();
10768f1d56f6SNadav Amit		info = get_flush_tlb_info(NULL, start, end, 0, false,
10778f1d56f6SNadav Amit					  TLB_GENERATION_INVALID);
10783db6d5a5SNadav Amit
10793db6d5a5SNadav Amit		on_each_cpu(do_kernel_range_flush, info, 1);
10803db6d5a5SNadav Amit
10813db6d5a5SNadav Amit		put_flush_tlb_info();
10823db6d5a5SNadav Amit		preempt_enable();
1083effee4b9SAlex Shi	}
1084effee4b9SAlex Shi }
10852d040a1cSDave Hansen
10863db6d5a5SNadav Amit /*
10878c5cc19eSThomas Gleixner  * This can be used from process context to figure out what the value of
10888c5cc19eSThomas Gleixner  * CR3 is without needing to do a (slow) __read_cr3().
10898c5cc19eSThomas Gleixner  *
10908c5cc19eSThomas Gleixner  * It's intended to be used for code like KVM that sneakily changes CR3
10918c5cc19eSThomas Gleixner  * and needs to restore it.  It needs to be used very carefully.
10928c5cc19eSThomas Gleixner  */
10938c5cc19eSThomas Gleixner unsigned long __get_current_cr3_fast(void)
10948c5cc19eSThomas Gleixner {
109582721d8bSKirill A. Shutemov	unsigned long cr3 =
109682721d8bSKirill A. Shutemov		build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
109782721d8bSKirill A. Shutemov			  this_cpu_read(cpu_tlbstate.loaded_mm_asid),
109882721d8bSKirill A. Shutemov			  tlbstate_lam_cr3_mask());
10998c5cc19eSThomas Gleixner
11008c5cc19eSThomas Gleixner	/* For now, be very restrictive about when this can be called. */
11018c5cc19eSThomas Gleixner	VM_WARN_ON(in_nmi() || preemptible());
11028c5cc19eSThomas Gleixner
11038c5cc19eSThomas Gleixner	VM_BUG_ON(cr3 != __read_cr3());
11048c5cc19eSThomas Gleixner	return cr3;
11058c5cc19eSThomas Gleixner }
11068c5cc19eSThomas Gleixner EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
11078c5cc19eSThomas Gleixner
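/*
 * Illustrative sketch only: the save/restore pattern the comment above has
 * in mind for code that temporarily loads a different CR3 value.
 * example_with_temporary_cr3() is a made-up helper, not kernel API, and
 * assumes it runs with preemption disabled, as __get_current_cr3_fast()
 * requires.
 */
static inline void example_with_temporary_cr3(unsigned long tmp_cr3)
{
	unsigned long saved_cr3 = __get_current_cr3_fast();

	native_write_cr3(tmp_cr3);	/* do work under the temporary CR3 */
	native_write_cr3(saved_cr3);	/* put the original value back */
}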
11088c5cc19eSThomas Gleixner /*
110958430c5dSThomas Gleixner  * Flush one page in the kernel mapping
111058430c5dSThomas Gleixner  */
111158430c5dSThomas Gleixner void flush_tlb_one_kernel(unsigned long addr)
111258430c5dSThomas Gleixner {
111358430c5dSThomas Gleixner	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
111458430c5dSThomas Gleixner
111558430c5dSThomas Gleixner	/*
111658430c5dSThomas Gleixner	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
111758430c5dSThomas Gleixner	 * paravirt equivalent.  Even with PCID, this is sufficient: we only
111858430c5dSThomas Gleixner	 * use PCID if we also use global PTEs for the kernel mapping, and
111958430c5dSThomas Gleixner	 * INVLPG flushes global translations across all address spaces.
112058430c5dSThomas Gleixner	 *
112158430c5dSThomas Gleixner	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
112258430c5dSThomas Gleixner	 * __flush_tlb_one_user() will flush the given address for the current
112358430c5dSThomas Gleixner	 * kernel address space and for its usermode counterpart, but it does
112458430c5dSThomas Gleixner	 * not flush it for other address spaces.
112558430c5dSThomas Gleixner	 */
112658430c5dSThomas Gleixner	flush_tlb_one_user(addr);
112758430c5dSThomas Gleixner
112858430c5dSThomas Gleixner	if (!static_cpu_has(X86_FEATURE_PTI))
112958430c5dSThomas Gleixner		return;
113058430c5dSThomas Gleixner
113158430c5dSThomas Gleixner	/*
113258430c5dSThomas Gleixner	 * See above.  We need to propagate the flush to all other address
113358430c5dSThomas Gleixner	 * spaces.  In principle, we only need to propagate it to kernelmode
113458430c5dSThomas Gleixner	 * address spaces, but the extra bookkeeping we would need is not
113558430c5dSThomas Gleixner	 * worth it.
113658430c5dSThomas Gleixner	 */
113758430c5dSThomas Gleixner	this_cpu_write(cpu_tlbstate.invalidate_other, true);
113858430c5dSThomas Gleixner }
113958430c5dSThomas Gleixner
114058430c5dSThomas Gleixner /*
1141127ac915SThomas Gleixner  * Flush one page in the user mapping
1142127ac915SThomas Gleixner  */
1143127ac915SThomas Gleixner STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
1144127ac915SThomas Gleixner {
114554e3d943SDave Hansen	u32 loaded_mm_asid;
114654e3d943SDave Hansen	bool cpu_pcide;
1147127ac915SThomas Gleixner
114854e3d943SDave Hansen	/* Flush 'addr' from the kernel PCID: */
1149*8322a66fSBorislav Petkov (AMD)	invlpg(addr);
1150127ac915SThomas Gleixner
115154e3d943SDave Hansen	/* If PTI is off there is no user PCID and nothing to flush. */
1152127ac915SThomas Gleixner	if (!static_cpu_has(X86_FEATURE_PTI))
1153127ac915SThomas Gleixner		return;
1154127ac915SThomas Gleixner
115554e3d943SDave Hansen	loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
115654e3d943SDave Hansen	cpu_pcide      = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE;
115754e3d943SDave Hansen
1158127ac915SThomas Gleixner	/*
115954e3d943SDave Hansen	 * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0.  Check
116054e3d943SDave Hansen	 * 'cpu_pcide' to ensure that *this* CPU will not trigger those
116154e3d943SDave Hansen	 * #GP's even if called before CR4.PCIDE has been initialized.
1162127ac915SThomas Gleixner	 */
116354e3d943SDave Hansen	if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
1164127ac915SThomas Gleixner		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
116554e3d943SDave Hansen	else
116654e3d943SDave Hansen		invalidate_user_asid(loaded_mm_asid);
1167127ac915SThomas Gleixner }
1168127ac915SThomas Gleixner
1169127ac915SThomas Gleixner void flush_tlb_one_user(unsigned long addr)
1170127ac915SThomas Gleixner {
1171127ac915SThomas Gleixner	__flush_tlb_one_user(addr);
1172127ac915SThomas Gleixner }
1173127ac915SThomas Gleixner
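/*
 * Illustrative sketch only: roughly what the INVPCID "individual address"
 * invalidation used above boils down to. The real helpers
 * (invpcid_flush_one() and friends) live in asm/invpcid.h;
 * example_invpcid_one_addr() is a simplified stand-in, not kernel API.
 */
static inline void example_invpcid_one_addr(unsigned long pcid,
					    unsigned long addr)
{
	/* Descriptor: PCID in the low 12 bits of d[0], linear address in d[1]. */
	struct { u64 d[2]; } desc = { .d = { pcid, addr } };
	unsigned long type = 0;	/* type 0: invalidate one address in one PCID */

	asm volatile("invpcid %[desc], %[type]"
		     :: [desc] "m" (desc), [type] "r" (type) : "memory");
}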
1174127ac915SThomas Gleixner /*
1175cd30d26cSThomas Gleixner  * Flush everything
1176cd30d26cSThomas Gleixner  */
1177cd30d26cSThomas Gleixner STATIC_NOPV void native_flush_tlb_global(void)
1178cd30d26cSThomas Gleixner {
1179f154f290SJoerg Roedel	unsigned long flags;
1180cd30d26cSThomas Gleixner
1181cd30d26cSThomas Gleixner	if (static_cpu_has(X86_FEATURE_INVPCID)) {
1182cd30d26cSThomas Gleixner		/*
1183cd30d26cSThomas Gleixner		 * Using INVPCID is considerably faster than a pair of writes
1184cd30d26cSThomas Gleixner		 * to CR4 sandwiched inside an IRQ flag save/restore.
1185cd30d26cSThomas Gleixner		 *
1186cd30d26cSThomas Gleixner		 * Note, this works with CR4.PCIDE=0 or 1.
1187cd30d26cSThomas Gleixner		 */
1188cd30d26cSThomas Gleixner		invpcid_flush_all();
1189cd30d26cSThomas Gleixner		return;
1190cd30d26cSThomas Gleixner	}
1191cd30d26cSThomas Gleixner
1192cd30d26cSThomas Gleixner	/*
1193cd30d26cSThomas Gleixner	 * Read-modify-write to CR4 - protect it from preemption and
1194cd30d26cSThomas Gleixner	 * from interrupts. (Use the raw variant because this code can
1195cd30d26cSThomas Gleixner	 * be called from deep inside debugging code.)
1196cd30d26cSThomas Gleixner	 */
1197cd30d26cSThomas Gleixner	raw_local_irq_save(flags);
1198cd30d26cSThomas Gleixner
1199f154f290SJoerg Roedel	__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
1200cd30d26cSThomas Gleixner
1201cd30d26cSThomas Gleixner	raw_local_irq_restore(flags);
1202cd30d26cSThomas Gleixner }
1203cd30d26cSThomas Gleixner
1204cd30d26cSThomas Gleixner /*
12052faf153bSThomas Gleixner  * Flush the entire current user mapping
12062faf153bSThomas Gleixner  */
12072faf153bSThomas Gleixner STATIC_NOPV void native_flush_tlb_local(void)
12082faf153bSThomas Gleixner {
12092faf153bSThomas Gleixner	/*
12102faf153bSThomas Gleixner	 * Preemption or interrupts must be disabled to protect the access
12112faf153bSThomas Gleixner	 * to the per CPU variable and to prevent being preempted between
12122faf153bSThomas Gleixner	 * read_cr3() and write_cr3().
12132faf153bSThomas Gleixner	 */
12142faf153bSThomas Gleixner	WARN_ON_ONCE(preemptible());
12152faf153bSThomas Gleixner
12162faf153bSThomas Gleixner	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
12172faf153bSThomas Gleixner
12182faf153bSThomas Gleixner	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
12192faf153bSThomas Gleixner	native_write_cr3(__native_read_cr3());
12202faf153bSThomas Gleixner }
12212faf153bSThomas Gleixner
12222faf153bSThomas Gleixner void flush_tlb_local(void)
12232faf153bSThomas Gleixner {
12242faf153bSThomas Gleixner	__flush_tlb_local();
12252faf153bSThomas Gleixner }
12264b04e6c2SThomas Gleixner
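/*
 * Illustrative sketch only: the CR4-based fallback that
 * native_flush_tlb_global() above uses when INVPCID is not available.
 * __native_tlb_flush_global() works by toggling CR4.PGE, which forces the
 * CPU to drop all TLB entries, including global ones.
 * example_toggle_pge_flush() is a simplified stand-in, not kernel API, and
 * must run with interrupts off, as in the caller above.
 */
static inline void example_toggle_pge_flush(unsigned long cr4)
{
	native_write_cr4(cr4 ^ X86_CR4_PGE);	/* clear PGE: flush everything */
	native_write_cr4(cr4);			/* restore the original CR4 */
}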
12274b04e6c2SThomas Gleixner /*
12284b04e6c2SThomas Gleixner  * Flush everything
12294b04e6c2SThomas Gleixner  */
12304b04e6c2SThomas Gleixner void __flush_tlb_all(void)
12314b04e6c2SThomas Gleixner {
12324b04e6c2SThomas Gleixner	/*
12334b04e6c2SThomas Gleixner	 * This is to catch users with preemption enabled and the PGE feature
12344b04e6c2SThomas Gleixner	 * who would not otherwise trigger the warning in native_flush_tlb_local().
12354b04e6c2SThomas Gleixner	 */
12364b04e6c2SThomas Gleixner	VM_WARN_ON_ONCE(preemptible());
12374b04e6c2SThomas Gleixner
1238ebd3ad60SBorislav Petkov (AMD)	if (cpu_feature_enabled(X86_FEATURE_PGE)) {
12394b04e6c2SThomas Gleixner		__flush_tlb_global();
12404b04e6c2SThomas Gleixner	} else {
12414b04e6c2SThomas Gleixner		/*
12424b04e6c2SThomas Gleixner		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
12434b04e6c2SThomas Gleixner		 */
12444b04e6c2SThomas Gleixner		flush_tlb_local();
12454b04e6c2SThomas Gleixner	}
12464b04e6c2SThomas Gleixner }
12474b04e6c2SThomas Gleixner EXPORT_SYMBOL_GPL(__flush_tlb_all);
12482faf153bSThomas Gleixner
12493db6d5a5SNadav Amit void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
12503db6d5a5SNadav Amit {
12514c1ba392SNadav Amit	struct flush_tlb_info *info;
12524c1ba392SNadav Amit
1253e73ad5ffSAndy Lutomirski	int cpu = get_cpu();
1254e73ad5ffSAndy Lutomirski
12558f1d56f6SNadav Amit	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
12568f1d56f6SNadav Amit				  TLB_GENERATION_INVALID);
12574ce94eabSNadav Amit	/*
12584ce94eabSNadav Amit	 * flush_tlb_multi() is not optimized for the common case in which only
12594ce94eabSNadav Amit	 * a local TLB flush is needed. Optimize this use case by calling
12604ce94eabSNadav Amit	 * flush_tlb_func() directly in this case.
12614ce94eabSNadav Amit	 */
12624ce94eabSNadav Amit	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
12634ce94eabSNadav Amit		flush_tlb_multi(&batch->cpumask, info);
12644ce94eabSNadav Amit	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
12653db6d5a5SNadav Amit		lockdep_assert_irqs_enabled();
1266bc0d5a89SAndy Lutomirski		local_irq_disable();
12674c1ba392SNadav Amit		flush_tlb_func(info);
1268bc0d5a89SAndy Lutomirski		local_irq_enable();
1269bc0d5a89SAndy Lutomirski	}
1270bc0d5a89SAndy Lutomirski
1271e73ad5ffSAndy Lutomirski	cpumask_clear(&batch->cpumask);
1272e73ad5ffSAndy Lutomirski
12734c1ba392SNadav Amit	put_flush_tlb_info();
1274e73ad5ffSAndy Lutomirski	put_cpu();
1275e73ad5ffSAndy Lutomirski }
1276e73ad5ffSAndy Lutomirski
1277af5c40c6SThomas Gleixner /*
1278af5c40c6SThomas Gleixner  * Blindly accessing user memory from NMI context can be dangerous
1279af5c40c6SThomas Gleixner  * if we're in the middle of switching the current user task or
1280af5c40c6SThomas Gleixner  * switching the loaded mm.  It can also be dangerous if we
1281af5c40c6SThomas Gleixner  * interrupted some kernel code that was temporarily using a
1282af5c40c6SThomas Gleixner  * different mm.
1283af5c40c6SThomas Gleixner  */
1284af5c40c6SThomas Gleixner bool nmi_uaccess_okay(void)
1285af5c40c6SThomas Gleixner {
1286af5c40c6SThomas Gleixner	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1287af5c40c6SThomas Gleixner	struct mm_struct *current_mm = current->mm;
1288af5c40c6SThomas Gleixner
1289af5c40c6SThomas Gleixner	VM_WARN_ON_ONCE(!loaded_mm);
1290af5c40c6SThomas Gleixner
1291af5c40c6SThomas Gleixner	/*
1292af5c40c6SThomas Gleixner	 * The condition we want to check is
1293af5c40c6SThomas Gleixner	 * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
1294af5c40c6SThomas Gleixner	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
1295af5c40c6SThomas Gleixner	 * is supposed to be reasonably fast.
1296af5c40c6SThomas Gleixner	 *
1297af5c40c6SThomas Gleixner	 * Instead, we check the almost equivalent but somewhat conservative
1298af5c40c6SThomas Gleixner	 * condition below, and we rely on the fact that switch_mm_irqs_off()
1299af5c40c6SThomas Gleixner	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
1300af5c40c6SThomas Gleixner	 */
1301af5c40c6SThomas Gleixner	if (loaded_mm != current_mm)
1302af5c40c6SThomas Gleixner		return false;
1303af5c40c6SThomas Gleixner
1304af5c40c6SThomas Gleixner	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
1305af5c40c6SThomas Gleixner
1306af5c40c6SThomas Gleixner	return true;
1307af5c40c6SThomas Gleixner }
1308af5c40c6SThomas Gleixner
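/*
 * Illustrative sketch only: the guard pattern the comment above describes
 * for NMI-context code that wants to touch user memory.
 * example_nmi_read_user() is a made-up helper; it assumes
 * copy_from_user_nmi() (declared in asm/uaccess.h) is visible here, and that
 * helper in fact performs the same nmi_uaccess_okay() check internally.
 */
static inline unsigned long example_nmi_read_user(void *dst,
						  const void __user *src,
						  unsigned long n)
{
	if (!nmi_uaccess_okay())
		return n;	/* report "nothing copied" */

	return copy_from_user_nmi(dst, src, n);
}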
13092d040a1cSDave Hansen static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
13102d040a1cSDave Hansen				  size_t count, loff_t *ppos)
13112d040a1cSDave Hansen {
13122d040a1cSDave Hansen	char buf[32];
13132d040a1cSDave Hansen	unsigned int len;
13142d040a1cSDave Hansen
13152d040a1cSDave Hansen	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
13162d040a1cSDave Hansen	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
13172d040a1cSDave Hansen }
13182d040a1cSDave Hansen
13192d040a1cSDave Hansen static ssize_t tlbflush_write_file(struct file *file,
13202d040a1cSDave Hansen		 const char __user *user_buf, size_t count, loff_t *ppos)
13212d040a1cSDave Hansen {
13222d040a1cSDave Hansen	char buf[32];
13232d040a1cSDave Hansen	ssize_t len;
13242d040a1cSDave Hansen	int ceiling;
13252d040a1cSDave Hansen
13262d040a1cSDave Hansen	len = min(count, sizeof(buf) - 1);
13272d040a1cSDave Hansen	if (copy_from_user(buf, user_buf, len))
13282d040a1cSDave Hansen		return -EFAULT;
13292d040a1cSDave Hansen
13302d040a1cSDave Hansen	buf[len] = '\0';
13312d040a1cSDave Hansen	if (kstrtoint(buf, 0, &ceiling))
13322d040a1cSDave Hansen		return -EINVAL;
13332d040a1cSDave Hansen
13342d040a1cSDave Hansen	if (ceiling < 0)
13352d040a1cSDave Hansen		return -EINVAL;
13362d040a1cSDave Hansen
13372d040a1cSDave Hansen	tlb_single_page_flush_ceiling = ceiling;
13382d040a1cSDave Hansen	return count;
13392d040a1cSDave Hansen }
13402d040a1cSDave Hansen
13412d040a1cSDave Hansen static const struct file_operations fops_tlbflush = {
13422d040a1cSDave Hansen	.read = tlbflush_read_file,
13432d040a1cSDave Hansen	.write = tlbflush_write_file,
13442d040a1cSDave Hansen	.llseek = default_llseek,
13452d040a1cSDave Hansen };
13462d040a1cSDave Hansen
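/*
 * Illustrative sketch only (userspace, not kernel code, hence the #if 0):
 * with debugfs mounted at /sys/kernel/debug, the knob registered below is
 * expected to appear as /sys/kernel/debug/x86/tlb_single_page_flush_ceiling;
 * the exact path is an assumption based on arch_debugfs_dir. Writing a new
 * value changes how large a ranged flush may get before it is promoted to a
 * full TLB flush.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/x86/tlb_single_page_flush_ceiling",
		      O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Raise the ceiling from the default 33 to 64 pages. */
	if (write(fd, "64", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}
#endif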
13472d040a1cSDave Hansen static int __init create_tlb_single_page_flush_ceiling(void)
13482d040a1cSDave Hansen {
13492d040a1cSDave Hansen	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
13502d040a1cSDave Hansen			    arch_debugfs_dir, NULL, &fops_tlbflush);
13512d040a1cSDave Hansen	return 0;
13522d040a1cSDave Hansen }
13532d040a1cSDave Hansen late_initcall(create_tlb_single_page_flush_ceiling);
1354