#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H

#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>

#include <trace/events/tlb.h>

#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/mpx.h>
#ifndef CONFIG_PARAVIRT
static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
{
}
#endif	/* !CONFIG_PARAVIRT */

#ifdef CONFIG_PERF_EVENTS
extern struct static_key rdpmc_always_available;

static inline void load_mm_cr4(struct mm_struct *mm)
{
	if (static_key_false(&rdpmc_always_available) ||
	    atomic_read(&mm->context.perf_rdpmc_allowed))
		cr4_set_bits(X86_CR4_PCE);
	else
		cr4_clear_bits(X86_CR4_PCE);
}
#else
static inline void load_mm_cr4(struct mm_struct *mm) {}
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
struct ldt_struct {
	/*
	 * Xen requires page-aligned LDTs with special permissions.  This is
	 * needed to prevent us from installing evil descriptors such as
	 * call gates.  On native, we could merge the ldt_struct and LDT
	 * allocations, but it's not worth trying to optimize.
	 */
	struct desc_struct *entries;
	int size;
};
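/*
 * Because a live ldt_struct is never modified, a writer can publish a
 * replacement table with a single release store.  A simplified sketch of
 * the update path (the real one lives in arch/x86/kernel/ldt.c; details
 * here are illustrative, not the exact implementation):
 *
 *	new_ldt = alloc_ldt_struct(new_size);
 *	... fill new_ldt->entries ...
 *	smp_store_release(&mm->context.ldt, new_ldt);
 *	... IPI all CPUs running this mm so they reload the LDT ...
 *	free_ldt_struct(old_ldt);
 *
 * The release store pairs with the lockless_dereference() in
 * load_mm_ldt() below, so a reader always sees either the old table or
 * the fully initialized new one, never a partially written one.
 */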
/*
 * Used for LDT copy/destruction.
 */
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
#else	/* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	return 0;
}
static inline void destroy_context(struct mm_struct *mm) {}
#endif

static inline void load_mm_ldt(struct mm_struct *mm)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	struct ldt_struct *ldt;

	/* lockless_dereference synchronizes with smp_store_release */
	ldt = lockless_dereference(mm->context.ldt);

	/*
	 * Any change to mm->context.ldt is followed by an IPI to all
	 * CPUs with the mm active.  The LDT will not be freed until
	 * after the IPI is handled by all such CPUs.  This means that,
	 * if the ldt_struct changes before we return, the values we see
	 * will be safe, and the new values will be loaded before we run
	 * any user code.
	 *
	 * NB: don't try to convert this to use RCU without extreme care.
	 * We would still need IRQs off, because we don't want to change
	 * the local LDT after an IPI loaded a newer value than the one
	 * that we can see.
	 */

	if (unlikely(ldt))
		set_ldt(ldt->entries, ldt->size);
	else
		clear_LDT();
#else
	clear_LDT();
#endif

	DEBUG_LOCKS_WARN_ON(preemptible());
}

static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
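/*
 * A CPU in lazy TLB mode keeps running on its old mm's page tables.  If
 * it then receives a flush IPI it doesn't actually need, leave_mm() (in
 * arch/x86/mm/tlb.c) switches it to init_mm's page tables and drops it
 * from mm_cpumask so that no further flush IPIs are sent to it.  A
 * simplified, illustrative sketch (not the exact implementation):
 *
 *	void leave_mm(int cpu)
 *	{
 *		struct mm_struct *active_mm =
 *			this_cpu_read(cpu_tlbstate.active_mm);
 *
 *		BUG_ON(this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
 *		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
 *		load_cr3(swapper_pg_dir);
 *	}
 *
 * switch_mm() below must cope with the aftermath: a CPU that left an mm
 * this way has to reload CR3 (and possibly the LDT) when it switches
 * back to that mm.
 */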
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
#ifdef CONFIG_SMP
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
#endif
		cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * Re-load page tables.
		 *
		 * This logic has an ordering constraint:
		 *
		 *  CPU 0: Write to a PTE for 'next'
		 *  CPU 0: load bit 1 in mm_cpumask.  If nonzero, send IPI.
		 *  CPU 1: set bit 1 in next's mm_cpumask
		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
		 *
		 * We need to prevent an outcome in which CPU 1 observes
		 * the new PTE value and CPU 0 observes bit 1 clear in
		 * mm_cpumask.  (If that occurs, then the IPI will never
		 * be sent, and CPU 0's TLB will contain a stale entry.)
		 *
		 * The bad outcome can occur if either CPU's load is
		 * reordered before that CPU's store, so both CPUs must
		 * execute full barriers to prevent this from happening.
		 *
		 * Thus, switch_mm needs a full barrier between the
		 * store to mm_cpumask and any operation that could load
		 * from next->pgd.  This barrier synchronizes with
		 * remote TLB flushers.  Fortunately, load_cr3 is
		 * serializing and thus acts as a full barrier.
		 */
		load_cr3(next->pgd);

		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

		/* Stop flush ipis for the previous mm */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));

		/* Load per-mm CR4 state */
		load_mm_cr4(next);

#ifdef CONFIG_MODIFY_LDT_SYSCALL
		/*
		 * Load the LDT, if the LDT is different.
		 *
		 * It's possible that prev->context.ldt doesn't match
		 * the LDT register.  This can happen if leave_mm(prev)
		 * was called and then modify_ldt changed
		 * prev->context.ldt but suppressed an IPI to this CPU.
		 * In this case, prev->context.ldt != NULL, because we
		 * never set context.ldt to NULL while the mm still
		 * exists.  That means that next->context.ldt !=
		 * prev->context.ldt, because mms never share an LDT.
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_mm_ldt(next);
#endif
	}
#ifdef CONFIG_SMP
	else {
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
			/*
			 * On established mms, the mm_cpumask is only changed
			 * from irq context, from ptep_clear_flush() while in
			 * lazy tlb mode, and here.  Irqs are blocked during
			 * schedule, protecting us from simultaneous changes.
			 */
			cpumask_set_cpu(cpu, mm_cpumask(next));

			/*
			 * We were in lazy tlb mode and leave_mm disabled
			 * tlb flush IPI delivery.  We must reload CR3
			 * to make sure we aren't still using freed page
			 * tables.
			 *
			 * As above, this is a barrier that forces
			 * TLB repopulation to be ordered after the
			 * store to mm_cpumask.
			 */
			load_cr3(next->pgd);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			load_mm_cr4(next);
			load_mm_ldt(next);
		}
	}
#endif
}
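/*
 * For reference, the remote-flusher side of the pairing described in
 * switch_mm() conceptually looks like this (illustrative pseudo-code;
 * see the flush paths in arch/x86/mm/tlb.c -- on x86 the full barrier
 * is typically implied by the atomic PTE update rather than written as
 * an explicit smp_mb()):
 *
 *	... update the PTE(s) ...
 *	smp_mb();	// pairs with the serializing load_cr3() above
 *	if (cpumask_test_cpu(other_cpu, mm_cpumask(mm)))
 *		... send a flush IPI to other_cpu ...
 *
 * With both barriers in place, either the flusher sees the mm_cpumask
 * bit and sends the IPI, or switch_mm() sees the new PTE when it
 * repopulates the TLB; the stale-TLB outcome cannot occur.
 */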
#define activate_mm(prev, next)			\
do {						\
	paravirt_activate_mm((prev), (next));	\
	switch_mm((prev), (next), NULL);	\
} while (0)

#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm)			\
do {						\
	lazy_load_gs(0);			\
} while (0)
#else
#define deactivate_mm(tsk, mm)			\
do {						\
	load_gs_index(0);			\
	loadsegment(fs, 0);			\
} while (0)
#endif

static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
	paravirt_arch_dup_mmap(oldmm, mm);
}

static inline void arch_exit_mmap(struct mm_struct *mm)
{
	paravirt_arch_exit_mmap(mm);
}

#ifdef CONFIG_X86_64
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	return !config_enabled(CONFIG_IA32_EMULATION) ||
	       !(mm->context.ia32_compat == TIF_IA32);
}
#else
static inline bool is_64bit_mm(struct mm_struct *mm)
{
	return false;
}
#endif
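/*
 * is_64bit_mm() matters to code that parses userspace-format data
 * structures: the layout must follow the target mm's bitness, not the
 * bitness of the current syscall.  An illustrative sketch in the spirit
 * of the MPX code (arch/x86/mm/mpx.c), which sizes bounds-table entries
 * this way:
 *
 *	static inline int bt_entry_size_bytes(struct mm_struct *mm)
 *	{
 *		// a bounds-table entry is four pointer-sized words
 *		return is_64bit_mm(mm) ? 4 * 8 : 4 * 4;
 *	}
 */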
static inline void arch_bprm_mm_init(struct mm_struct *mm,
				     struct vm_area_struct *vma)
{
	mpx_mm_init(mm);
}

static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	/*
	 * mpx_notify_unmap() goes and reads a rarely-hot
	 * cacheline in the mm_struct.  That can be expensive
	 * enough to be seen in profiles.
	 *
	 * The mpx_notify_unmap() call and its contents have been
	 * observed to affect munmap() performance on hardware
	 * where MPX is not present.
	 *
	 * The unlikely() optimizes for the fast case: no MPX
	 * in the CPU, or no MPX use in the process.  Even if
	 * we get this wrong (in the unlikely event that MPX
	 * is widely enabled on some system) the overhead of
	 * MPX itself (reading bounds tables) is expected to
	 * overwhelm the overhead of getting this unlikely()
	 * consistently wrong.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
		mpx_notify_unmap(mm, vma, start, end);
}

#endif /* _ASM_X86_MMU_CONTEXT_H */