11965aae3SH. Peter Anvin #ifndef _ASM_X86_MMU_CONTEXT_H 21965aae3SH. Peter Anvin #define _ASM_X86_MMU_CONTEXT_H 3bb898558SAl Viro 4bb898558SAl Viro #include <asm/desc.h> 560063497SArun Sharma #include <linux/atomic.h> 6d17d8f9dSDave Hansen #include <linux/mm_types.h> 7d17d8f9dSDave Hansen 8d17d8f9dSDave Hansen #include <trace/events/tlb.h> 9d17d8f9dSDave Hansen 10bb898558SAl Viro #include <asm/pgalloc.h> 11bb898558SAl Viro #include <asm/tlbflush.h> 12bb898558SAl Viro #include <asm/paravirt.h> 13fe3d197fSDave Hansen #include <asm/mpx.h> 14bb898558SAl Viro #ifndef CONFIG_PARAVIRT 15bb898558SAl Viro static inline void paravirt_activate_mm(struct mm_struct *prev, 16bb898558SAl Viro struct mm_struct *next) 17bb898558SAl Viro { 18bb898558SAl Viro } 19bb898558SAl Viro #endif /* !CONFIG_PARAVIRT */ 20bb898558SAl Viro 217911d3f7SAndy Lutomirski #ifdef CONFIG_PERF_EVENTS 22a6673429SAndy Lutomirski extern struct static_key rdpmc_always_available; 23a6673429SAndy Lutomirski 247911d3f7SAndy Lutomirski static inline void load_mm_cr4(struct mm_struct *mm) 257911d3f7SAndy Lutomirski { 26a833581eSPeter Zijlstra if (static_key_false(&rdpmc_always_available) || 27a6673429SAndy Lutomirski atomic_read(&mm->context.perf_rdpmc_allowed)) 287911d3f7SAndy Lutomirski cr4_set_bits(X86_CR4_PCE); 297911d3f7SAndy Lutomirski else 307911d3f7SAndy Lutomirski cr4_clear_bits(X86_CR4_PCE); 317911d3f7SAndy Lutomirski } 327911d3f7SAndy Lutomirski #else 337911d3f7SAndy Lutomirski static inline void load_mm_cr4(struct mm_struct *mm) {} 347911d3f7SAndy Lutomirski #endif 357911d3f7SAndy Lutomirski 36a5b9e5a2SAndy Lutomirski #ifdef CONFIG_MODIFY_LDT_SYSCALL 37bb898558SAl Viro /* 3837868fe1SAndy Lutomirski * ldt_structs can be allocated, used, and freed, but they are never 3937868fe1SAndy Lutomirski * modified while live. 4037868fe1SAndy Lutomirski */ 4137868fe1SAndy Lutomirski struct ldt_struct { 4237868fe1SAndy Lutomirski /* 4337868fe1SAndy Lutomirski * Xen requires page-aligned LDTs with special permissions. This is 4437868fe1SAndy Lutomirski * needed to prevent us from installing evil descriptors such as 4537868fe1SAndy Lutomirski * call gates. On native, we could merge the ldt_struct and LDT 4637868fe1SAndy Lutomirski * allocations, but it's not worth trying to optimize. 4737868fe1SAndy Lutomirski */ 4837868fe1SAndy Lutomirski struct desc_struct *entries; 4937868fe1SAndy Lutomirski int size; 5037868fe1SAndy Lutomirski }; 5137868fe1SAndy Lutomirski 52a5b9e5a2SAndy Lutomirski /* 53a5b9e5a2SAndy Lutomirski * Used for LDT copy/destruction. 54a5b9e5a2SAndy Lutomirski */ 55a5b9e5a2SAndy Lutomirski int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 56a5b9e5a2SAndy Lutomirski void destroy_context(struct mm_struct *mm); 57a5b9e5a2SAndy Lutomirski #else /* CONFIG_MODIFY_LDT_SYSCALL */ 58a5b9e5a2SAndy Lutomirski static inline int init_new_context(struct task_struct *tsk, 59a5b9e5a2SAndy Lutomirski struct mm_struct *mm) 60a5b9e5a2SAndy Lutomirski { 61a5b9e5a2SAndy Lutomirski return 0; 62a5b9e5a2SAndy Lutomirski } 63a5b9e5a2SAndy Lutomirski static inline void destroy_context(struct mm_struct *mm) {} 64a5b9e5a2SAndy Lutomirski #endif 65a5b9e5a2SAndy Lutomirski 6637868fe1SAndy Lutomirski static inline void load_mm_ldt(struct mm_struct *mm) 6737868fe1SAndy Lutomirski { 68a5b9e5a2SAndy Lutomirski #ifdef CONFIG_MODIFY_LDT_SYSCALL 6937868fe1SAndy Lutomirski struct ldt_struct *ldt; 7037868fe1SAndy Lutomirski 7137868fe1SAndy Lutomirski /* lockless_dereference synchronizes with smp_store_release */ 7237868fe1SAndy Lutomirski ldt = lockless_dereference(mm->context.ldt); 7337868fe1SAndy Lutomirski 7437868fe1SAndy Lutomirski /* 7537868fe1SAndy Lutomirski * Any change to mm->context.ldt is followed by an IPI to all 7637868fe1SAndy Lutomirski * CPUs with the mm active. The LDT will not be freed until 7737868fe1SAndy Lutomirski * after the IPI is handled by all such CPUs. This means that, 7837868fe1SAndy Lutomirski * if the ldt_struct changes before we return, the values we see 7937868fe1SAndy Lutomirski * will be safe, and the new values will be loaded before we run 8037868fe1SAndy Lutomirski * any user code. 8137868fe1SAndy Lutomirski * 8237868fe1SAndy Lutomirski * NB: don't try to convert this to use RCU without extreme care. 8337868fe1SAndy Lutomirski * We would still need IRQs off, because we don't want to change 8437868fe1SAndy Lutomirski * the local LDT after an IPI loaded a newer value than the one 8537868fe1SAndy Lutomirski * that we can see. 8637868fe1SAndy Lutomirski */ 8737868fe1SAndy Lutomirski 8837868fe1SAndy Lutomirski if (unlikely(ldt)) 8937868fe1SAndy Lutomirski set_ldt(ldt->entries, ldt->size); 9037868fe1SAndy Lutomirski else 9137868fe1SAndy Lutomirski clear_LDT(); 92a5b9e5a2SAndy Lutomirski #else 93a5b9e5a2SAndy Lutomirski clear_LDT(); 94a5b9e5a2SAndy Lutomirski #endif 9537868fe1SAndy Lutomirski 9637868fe1SAndy Lutomirski DEBUG_LOCKS_WARN_ON(preemptible()); 9737868fe1SAndy Lutomirski } 9837868fe1SAndy Lutomirski 996826c8ffSBrian Gerst static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 1006826c8ffSBrian Gerst { 1016826c8ffSBrian Gerst #ifdef CONFIG_SMP 102c6ae41e7SAlex Shi if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 103c6ae41e7SAlex Shi this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); 104bb898558SAl Viro #endif 1056826c8ffSBrian Gerst } 1066826c8ffSBrian Gerst 1076826c8ffSBrian Gerst static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 1086826c8ffSBrian Gerst struct task_struct *tsk) 1096826c8ffSBrian Gerst { 1106826c8ffSBrian Gerst unsigned cpu = smp_processor_id(); 1116826c8ffSBrian Gerst 1126826c8ffSBrian Gerst if (likely(prev != next)) { 1136826c8ffSBrian Gerst #ifdef CONFIG_SMP 114c6ae41e7SAlex Shi this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); 115c6ae41e7SAlex Shi this_cpu_write(cpu_tlbstate.active_mm, next); 1166826c8ffSBrian Gerst #endif 11778f1c4d6SRusty Russell cpumask_set_cpu(cpu, mm_cpumask(next)); 1186826c8ffSBrian Gerst 11971b3c126SAndy Lutomirski /* 12071b3c126SAndy Lutomirski * Re-load page tables. 12171b3c126SAndy Lutomirski * 12271b3c126SAndy Lutomirski * This logic has an ordering constraint: 12371b3c126SAndy Lutomirski * 12471b3c126SAndy Lutomirski * CPU 0: Write to a PTE for 'next' 12571b3c126SAndy Lutomirski * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. 12671b3c126SAndy Lutomirski * CPU 1: set bit 1 in next's mm_cpumask 12771b3c126SAndy Lutomirski * CPU 1: load from the PTE that CPU 0 writes (implicit) 12871b3c126SAndy Lutomirski * 12971b3c126SAndy Lutomirski * We need to prevent an outcome in which CPU 1 observes 13071b3c126SAndy Lutomirski * the new PTE value and CPU 0 observes bit 1 clear in 13171b3c126SAndy Lutomirski * mm_cpumask. (If that occurs, then the IPI will never 13271b3c126SAndy Lutomirski * be sent, and CPU 0's TLB will contain a stale entry.) 13371b3c126SAndy Lutomirski * 13471b3c126SAndy Lutomirski * The bad outcome can occur if either CPU's load is 1354eaffdd5SAndy Lutomirski * reordered before that CPU's store, so both CPUs must 13671b3c126SAndy Lutomirski * execute full barriers to prevent this from happening. 13771b3c126SAndy Lutomirski * 13871b3c126SAndy Lutomirski * Thus, switch_mm needs a full barrier between the 13971b3c126SAndy Lutomirski * store to mm_cpumask and any operation that could load 1404eaffdd5SAndy Lutomirski * from next->pgd. TLB fills are special and can happen 1414eaffdd5SAndy Lutomirski * due to instruction fetches or for no reason at all, 1424eaffdd5SAndy Lutomirski * and neither LOCK nor MFENCE orders them. 1434eaffdd5SAndy Lutomirski * Fortunately, load_cr3() is serializing and gives the 1444eaffdd5SAndy Lutomirski * ordering guarantee we need. 14571b3c126SAndy Lutomirski * 14671b3c126SAndy Lutomirski */ 1476826c8ffSBrian Gerst load_cr3(next->pgd); 14871b3c126SAndy Lutomirski 149d17d8f9dSDave Hansen trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 1506826c8ffSBrian Gerst 1518f898fbbSRik van Riel /* Stop flush ipis for the previous mm */ 152831d52bcSSuresh Siddha cpumask_clear_cpu(cpu, mm_cpumask(prev)); 153831d52bcSSuresh Siddha 1547911d3f7SAndy Lutomirski /* Load per-mm CR4 state */ 1557911d3f7SAndy Lutomirski load_mm_cr4(next); 1567911d3f7SAndy Lutomirski 157a5b9e5a2SAndy Lutomirski #ifdef CONFIG_MODIFY_LDT_SYSCALL 158c4a7bba2SAndy Lutomirski /* 159c4a7bba2SAndy Lutomirski * Load the LDT, if the LDT is different. 160c4a7bba2SAndy Lutomirski * 16122c4bd9fSAndy Lutomirski * It's possible that prev->context.ldt doesn't match 16222c4bd9fSAndy Lutomirski * the LDT register. This can happen if leave_mm(prev) 16322c4bd9fSAndy Lutomirski * was called and then modify_ldt changed 16422c4bd9fSAndy Lutomirski * prev->context.ldt but suppressed an IPI to this CPU. 16522c4bd9fSAndy Lutomirski * In this case, prev->context.ldt != NULL, because we 16637868fe1SAndy Lutomirski * never set context.ldt to NULL while the mm still 16737868fe1SAndy Lutomirski * exists. That means that next->context.ldt != 16837868fe1SAndy Lutomirski * prev->context.ldt, because mms never share an LDT. 169c4a7bba2SAndy Lutomirski */ 1706826c8ffSBrian Gerst if (unlikely(prev->context.ldt != next->context.ldt)) 17137868fe1SAndy Lutomirski load_mm_ldt(next); 172a5b9e5a2SAndy Lutomirski #endif 1736826c8ffSBrian Gerst } 1746826c8ffSBrian Gerst #ifdef CONFIG_SMP 1756826c8ffSBrian Gerst else { 176c6ae41e7SAlex Shi this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); 177c6ae41e7SAlex Shi BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); 1786826c8ffSBrian Gerst 1798f898fbbSRik van Riel if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { 1808f898fbbSRik van Riel /* 1818f898fbbSRik van Riel * On established mms, the mm_cpumask is only changed 1828f898fbbSRik van Riel * from irq context, from ptep_clear_flush() while in 1838f898fbbSRik van Riel * lazy tlb mode, and here. Irqs are blocked during 1848f898fbbSRik van Riel * schedule, protecting us from simultaneous changes. 1858f898fbbSRik van Riel */ 1868f898fbbSRik van Riel cpumask_set_cpu(cpu, mm_cpumask(next)); 18771b3c126SAndy Lutomirski 1888f898fbbSRik van Riel /* 1898f898fbbSRik van Riel * We were in lazy tlb mode and leave_mm disabled 1906826c8ffSBrian Gerst * tlb flush IPI delivery. We must reload CR3 1916826c8ffSBrian Gerst * to make sure to use no freed page tables. 19271b3c126SAndy Lutomirski * 1934eaffdd5SAndy Lutomirski * As above, load_cr3() is serializing and orders TLB 1944eaffdd5SAndy Lutomirski * fills with respect to the mm_cpumask write. 1956826c8ffSBrian Gerst */ 1966826c8ffSBrian Gerst load_cr3(next->pgd); 197d17d8f9dSDave Hansen trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 1987911d3f7SAndy Lutomirski load_mm_cr4(next); 19937868fe1SAndy Lutomirski load_mm_ldt(next); 2006826c8ffSBrian Gerst } 2016826c8ffSBrian Gerst } 2026826c8ffSBrian Gerst #endif 2036826c8ffSBrian Gerst } 204bb898558SAl Viro 205bb898558SAl Viro #define activate_mm(prev, next) \ 206bb898558SAl Viro do { \ 207bb898558SAl Viro paravirt_activate_mm((prev), (next)); \ 208bb898558SAl Viro switch_mm((prev), (next), NULL); \ 209bb898558SAl Viro } while (0); 210bb898558SAl Viro 2116826c8ffSBrian Gerst #ifdef CONFIG_X86_32 2126826c8ffSBrian Gerst #define deactivate_mm(tsk, mm) \ 2136826c8ffSBrian Gerst do { \ 214ccbeed3aSTejun Heo lazy_load_gs(0); \ 2156826c8ffSBrian Gerst } while (0) 2166826c8ffSBrian Gerst #else 2176826c8ffSBrian Gerst #define deactivate_mm(tsk, mm) \ 2186826c8ffSBrian Gerst do { \ 2196826c8ffSBrian Gerst load_gs_index(0); \ 2206826c8ffSBrian Gerst loadsegment(fs, 0); \ 2216826c8ffSBrian Gerst } while (0) 2226826c8ffSBrian Gerst #endif 223bb898558SAl Viro 224a1ea1c03SDave Hansen static inline void arch_dup_mmap(struct mm_struct *oldmm, 225a1ea1c03SDave Hansen struct mm_struct *mm) 226a1ea1c03SDave Hansen { 227a1ea1c03SDave Hansen paravirt_arch_dup_mmap(oldmm, mm); 228a1ea1c03SDave Hansen } 229a1ea1c03SDave Hansen 230a1ea1c03SDave Hansen static inline void arch_exit_mmap(struct mm_struct *mm) 231a1ea1c03SDave Hansen { 232a1ea1c03SDave Hansen paravirt_arch_exit_mmap(mm); 233a1ea1c03SDave Hansen } 234a1ea1c03SDave Hansen 235b0e9b09bSDave Hansen #ifdef CONFIG_X86_64 236b0e9b09bSDave Hansen static inline bool is_64bit_mm(struct mm_struct *mm) 237b0e9b09bSDave Hansen { 238b0e9b09bSDave Hansen return !config_enabled(CONFIG_IA32_EMULATION) || 239b0e9b09bSDave Hansen !(mm->context.ia32_compat == TIF_IA32); 240b0e9b09bSDave Hansen } 241b0e9b09bSDave Hansen #else 242b0e9b09bSDave Hansen static inline bool is_64bit_mm(struct mm_struct *mm) 243b0e9b09bSDave Hansen { 244b0e9b09bSDave Hansen return false; 245b0e9b09bSDave Hansen } 246b0e9b09bSDave Hansen #endif 247b0e9b09bSDave Hansen 248fe3d197fSDave Hansen static inline void arch_bprm_mm_init(struct mm_struct *mm, 249fe3d197fSDave Hansen struct vm_area_struct *vma) 250fe3d197fSDave Hansen { 251fe3d197fSDave Hansen mpx_mm_init(mm); 252fe3d197fSDave Hansen } 253fe3d197fSDave Hansen 2541de4fa14SDave Hansen static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, 2551de4fa14SDave Hansen unsigned long start, unsigned long end) 2561de4fa14SDave Hansen { 257c922228eSDave Hansen /* 258c922228eSDave Hansen * mpx_notify_unmap() goes and reads a rarely-hot 259c922228eSDave Hansen * cacheline in the mm_struct. That can be expensive 260c922228eSDave Hansen * enough to be seen in profiles. 261c922228eSDave Hansen * 262c922228eSDave Hansen * The mpx_notify_unmap() call and its contents have been 263c922228eSDave Hansen * observed to affect munmap() performance on hardware 264c922228eSDave Hansen * where MPX is not present. 265c922228eSDave Hansen * 266c922228eSDave Hansen * The unlikely() optimizes for the fast case: no MPX 267c922228eSDave Hansen * in the CPU, or no MPX use in the process. Even if 268c922228eSDave Hansen * we get this wrong (in the unlikely event that MPX 269c922228eSDave Hansen * is widely enabled on some system) the overhead of 270c922228eSDave Hansen * MPX itself (reading bounds tables) is expected to 271c922228eSDave Hansen * overwhelm the overhead of getting this unlikely() 272c922228eSDave Hansen * consistently wrong. 273c922228eSDave Hansen */ 274c922228eSDave Hansen if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX))) 2751de4fa14SDave Hansen mpx_notify_unmap(mm, vma, start, end); 2761de4fa14SDave Hansen } 2771de4fa14SDave Hansen 2788f62c883SDave Hansen static inline int vma_pkey(struct vm_area_struct *vma) 2798f62c883SDave Hansen { 2808f62c883SDave Hansen u16 pkey = 0; 2818f62c883SDave Hansen #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 2828f62c883SDave Hansen unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | 2838f62c883SDave Hansen VM_PKEY_BIT2 | VM_PKEY_BIT3; 2848f62c883SDave Hansen pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; 2858f62c883SDave Hansen #endif 2868f62c883SDave Hansen return pkey; 2878f62c883SDave Hansen } 2888f62c883SDave Hansen 2891965aae3SH. Peter Anvin #endif /* _ASM_X86_MMU_CONTEXT_H */ 290