xref: /openbmc/linux/arch/x86/mm/tlb.c (revision dbfe2953f63c640463c630746cd5d9de8b2f63ae)
155f4949fSIngo Molnar #include <linux/init.h>
255f4949fSIngo Molnar 
355f4949fSIngo Molnar #include <linux/mm.h>
455f4949fSIngo Molnar #include <linux/spinlock.h>
555f4949fSIngo Molnar #include <linux/smp.h>
655f4949fSIngo Molnar #include <linux/interrupt.h>
74b599fedSPaul Gortmaker #include <linux/export.h>
893296720SShaohua Li #include <linux/cpu.h>
918bf3c3eSTim Chen #include <linux/debugfs.h>
10*dbfe2953SJiri Kosina #include <linux/ptrace.h>
1155f4949fSIngo Molnar 
1255f4949fSIngo Molnar #include <asm/tlbflush.h>
1355f4949fSIngo Molnar #include <asm/mmu_context.h>
1418bf3c3eSTim Chen #include <asm/nospec-branch.h>
15350f8f56SJan Beulich #include <asm/cache.h>
1655f4949fSIngo Molnar #include <asm/apic.h>
1755f4949fSIngo Molnar #include <asm/uv/uv.h>
1855f4949fSIngo Molnar 
1955f4949fSIngo Molnar /*
20ce4a4e56SAndy Lutomirski  *	TLB flushing, formerly SMP-only
2155f4949fSIngo Molnar  *		c/o Linus Torvalds.
2255f4949fSIngo Molnar  *
2355f4949fSIngo Molnar  *	These mean you can really definitely utterly forget about
2455f4949fSIngo Molnar  *	writing to user space from interrupts. (It's not allowed anyway).
2555f4949fSIngo Molnar  *
2655f4949fSIngo Molnar  *	Optimizations Manfred Spraul <manfred@colorfullife.com>
2755f4949fSIngo Molnar  *
2855f4949fSIngo Molnar  *	More scalable flush, from Andi Kleen
2955f4949fSIngo Molnar  *
3052aec330SAlex Shi  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
3155f4949fSIngo Molnar  */
3255f4949fSIngo Molnar 
332ea907c4SDave Hansen /*
342ea907c4SDave Hansen  * We get here when we do something requiring a TLB invalidation
352ea907c4SDave Hansen  * but could not go invalidate all of the contexts.  We do the
362ea907c4SDave Hansen  * necessary invalidation by clearing out the 'ctx_id' which
372ea907c4SDave Hansen  * forces a TLB flush when the context is loaded.
382ea907c4SDave Hansen  */
39387048f5Szhong jiang static void clear_asid_other(void)
402ea907c4SDave Hansen {
412ea907c4SDave Hansen 	u16 asid;
422ea907c4SDave Hansen 
432ea907c4SDave Hansen 	/*
442ea907c4SDave Hansen 	 * This is only expected to be set if we have disabled
452ea907c4SDave Hansen 	 * kernel _PAGE_GLOBAL pages.
462ea907c4SDave Hansen 	 */
472ea907c4SDave Hansen 	if (!static_cpu_has(X86_FEATURE_PTI)) {
482ea907c4SDave Hansen 		WARN_ON_ONCE(1);
492ea907c4SDave Hansen 		return;
502ea907c4SDave Hansen 	}
512ea907c4SDave Hansen 
522ea907c4SDave Hansen 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
532ea907c4SDave Hansen 		/* We do not need to flush the current ASID */
542ea907c4SDave Hansen 		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
552ea907c4SDave Hansen 			continue;
562ea907c4SDave Hansen 		/*
572ea907c4SDave Hansen 		 * Make sure the next time we go to switch to
582ea907c4SDave Hansen 		 * this asid, we do a flush:
592ea907c4SDave Hansen 		 */
602ea907c4SDave Hansen 		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
612ea907c4SDave Hansen 	}
622ea907c4SDave Hansen 	this_cpu_write(cpu_tlbstate.invalidate_other, false);
632ea907c4SDave Hansen }
642ea907c4SDave Hansen 
65f39681edSAndy Lutomirski atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
66f39681edSAndy Lutomirski 
67b956575bSAndy Lutomirski 
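/*
 * Pick an ASID for @next.  Without PCID there is only ASID 0 and every
 * switch needs a flush.  Otherwise, reuse the dynamic ASID slot whose
 * recorded ctx_id matches @next (flushing only if that slot's tlb_gen is
 * stale), or round-robin a new slot out of TLB_NR_DYN_ASIDS and flush.
 */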
6810af6235SAndy Lutomirski static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
6910af6235SAndy Lutomirski 			    u16 *new_asid, bool *need_flush)
7010af6235SAndy Lutomirski {
7110af6235SAndy Lutomirski 	u16 asid;
7210af6235SAndy Lutomirski 
7310af6235SAndy Lutomirski 	if (!static_cpu_has(X86_FEATURE_PCID)) {
7410af6235SAndy Lutomirski 		*new_asid = 0;
7510af6235SAndy Lutomirski 		*need_flush = true;
7610af6235SAndy Lutomirski 		return;
7710af6235SAndy Lutomirski 	}
7810af6235SAndy Lutomirski 
792ea907c4SDave Hansen 	if (this_cpu_read(cpu_tlbstate.invalidate_other))
802ea907c4SDave Hansen 		clear_asid_other();
812ea907c4SDave Hansen 
8210af6235SAndy Lutomirski 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
8310af6235SAndy Lutomirski 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
8410af6235SAndy Lutomirski 		    next->context.ctx_id)
8510af6235SAndy Lutomirski 			continue;
8610af6235SAndy Lutomirski 
8710af6235SAndy Lutomirski 		*new_asid = asid;
8810af6235SAndy Lutomirski 		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
8910af6235SAndy Lutomirski 			       next_tlb_gen);
9010af6235SAndy Lutomirski 		return;
9110af6235SAndy Lutomirski 	}
9210af6235SAndy Lutomirski 
9310af6235SAndy Lutomirski 	/*
9410af6235SAndy Lutomirski 	 * We don't currently own an ASID slot on this CPU.
9510af6235SAndy Lutomirski 	 * Allocate a slot.
9610af6235SAndy Lutomirski 	 */
9710af6235SAndy Lutomirski 	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
9810af6235SAndy Lutomirski 	if (*new_asid >= TLB_NR_DYN_ASIDS) {
9910af6235SAndy Lutomirski 		*new_asid = 0;
10010af6235SAndy Lutomirski 		this_cpu_write(cpu_tlbstate.next_asid, 1);
10110af6235SAndy Lutomirski 	}
10210af6235SAndy Lutomirski 	*need_flush = true;
10310af6235SAndy Lutomirski }
10410af6235SAndy Lutomirski 
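/*
 * Switch CR3 to @pgdir tagged with @new_asid.  When @need_flush is set,
 * also invalidate the matching user ASID (relevant with PTI) and use the
 * flushing form of the CR3 value; otherwise use the non-flushing form so
 * that TLB entries tagged with this ASID are kept.
 */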
10548e11198SDave Hansen static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
10648e11198SDave Hansen {
10748e11198SDave Hansen 	unsigned long new_mm_cr3;
10848e11198SDave Hansen 
10948e11198SDave Hansen 	if (need_flush) {
1106fd166aaSPeter Zijlstra 		invalidate_user_asid(new_asid);
11148e11198SDave Hansen 		new_mm_cr3 = build_cr3(pgdir, new_asid);
11248e11198SDave Hansen 	} else {
11348e11198SDave Hansen 		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
11448e11198SDave Hansen 	}
11548e11198SDave Hansen 
11648e11198SDave Hansen 	/*
11748e11198SDave Hansen 	 * Caution: many callers of this function expect
11848e11198SDave Hansen 	 * that load_cr3() is serializing and orders TLB
11948e11198SDave Hansen 	 * fills with respect to the mm_cpumask writes.
12048e11198SDave Hansen 	 */
12148e11198SDave Hansen 	write_cr3(new_mm_cr3);
12248e11198SDave Hansen }
12348e11198SDave Hansen 
12455f4949fSIngo Molnar void leave_mm(int cpu)
12555f4949fSIngo Molnar {
1263d28ebceSAndy Lutomirski 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1273d28ebceSAndy Lutomirski 
1283d28ebceSAndy Lutomirski 	/*
1293d28ebceSAndy Lutomirski 	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
1303d28ebceSAndy Lutomirski 	 * If so, our callers still expect us to flush the TLB, but there
1313d28ebceSAndy Lutomirski 	 * aren't any user TLB entries in init_mm to worry about.
1323d28ebceSAndy Lutomirski 	 *
1333d28ebceSAndy Lutomirski 	 * This needs to happen before any other sanity checks due to
1343d28ebceSAndy Lutomirski 	 * intel_idle's shenanigans.
1353d28ebceSAndy Lutomirski 	 */
1363d28ebceSAndy Lutomirski 	if (loaded_mm == &init_mm)
1373d28ebceSAndy Lutomirski 		return;
1383d28ebceSAndy Lutomirski 
13994b1b03bSAndy Lutomirski 	/* Warn if we're not lazy. */
140b956575bSAndy Lutomirski 	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
1413d28ebceSAndy Lutomirski 
1423d28ebceSAndy Lutomirski 	switch_mm(NULL, &init_mm, NULL);
143a6fca40fSSuresh Siddha }
14467535736SAndy Lutomirski EXPORT_SYMBOL_GPL(leave_mm);
14555f4949fSIngo Molnar 
14669c0319aSAndy Lutomirski void switch_mm(struct mm_struct *prev, struct mm_struct *next,
14769c0319aSAndy Lutomirski 	       struct task_struct *tsk)
14869c0319aSAndy Lutomirski {
149078194f8SAndy Lutomirski 	unsigned long flags;
150078194f8SAndy Lutomirski 
151078194f8SAndy Lutomirski 	local_irq_save(flags);
152078194f8SAndy Lutomirski 	switch_mm_irqs_off(prev, next, tsk);
153078194f8SAndy Lutomirski 	local_irq_restore(flags);
154078194f8SAndy Lutomirski }
155078194f8SAndy Lutomirski 
1565beda7d5SAndy Lutomirski static void sync_current_stack_to_mm(struct mm_struct *mm)
1575beda7d5SAndy Lutomirski {
1585beda7d5SAndy Lutomirski 	unsigned long sp = current_stack_pointer;
1595beda7d5SAndy Lutomirski 	pgd_t *pgd = pgd_offset(mm, sp);
1605beda7d5SAndy Lutomirski 
161ed7588d5SKirill A. Shutemov 	if (pgtable_l5_enabled()) {
1625beda7d5SAndy Lutomirski 		if (unlikely(pgd_none(*pgd))) {
1635beda7d5SAndy Lutomirski 			pgd_t *pgd_ref = pgd_offset_k(sp);
1645beda7d5SAndy Lutomirski 
1655beda7d5SAndy Lutomirski 			set_pgd(pgd, *pgd_ref);
1665beda7d5SAndy Lutomirski 		}
1675beda7d5SAndy Lutomirski 	} else {
1685beda7d5SAndy Lutomirski 		/*
1695beda7d5SAndy Lutomirski 		 * "pgd" is faked.  The top level entries are "p4d"s, so sync
1705beda7d5SAndy Lutomirski 		 * the p4d.  This compiles to approximately the same code as
1715beda7d5SAndy Lutomirski 		 * the 5-level case.
1725beda7d5SAndy Lutomirski 		 */
1735beda7d5SAndy Lutomirski 		p4d_t *p4d = p4d_offset(pgd, sp);
1745beda7d5SAndy Lutomirski 
1755beda7d5SAndy Lutomirski 		if (unlikely(p4d_none(*p4d))) {
1765beda7d5SAndy Lutomirski 			pgd_t *pgd_ref = pgd_offset_k(sp);
1775beda7d5SAndy Lutomirski 			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
1785beda7d5SAndy Lutomirski 
1795beda7d5SAndy Lutomirski 			set_p4d(p4d, *p4d_ref);
1805beda7d5SAndy Lutomirski 		}
1815beda7d5SAndy Lutomirski 	}
1825beda7d5SAndy Lutomirski }
1835beda7d5SAndy Lutomirski 
184*dbfe2953SJiri Kosina static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
185*dbfe2953SJiri Kosina {
186*dbfe2953SJiri Kosina 	/*
187*dbfe2953SJiri Kosina 	 * Check if the current (previous) task has access to the memory
188*dbfe2953SJiri Kosina 	 * of the @tsk (next) task. If access is denied, make sure to
189*dbfe2953SJiri Kosina 	 * issue an IBPB to stop user->user Spectre-v2 attacks.
190*dbfe2953SJiri Kosina 	 *
191*dbfe2953SJiri Kosina 	 * Note: __ptrace_may_access() returns 0 or -ERRNO, so nonzero means denied.
192*dbfe2953SJiri Kosina 	 */
193*dbfe2953SJiri Kosina 	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
194*dbfe2953SJiri Kosina 		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
195*dbfe2953SJiri Kosina }
196*dbfe2953SJiri Kosina 
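/*
 * switch_mm_irqs_off() does the real work of an mm switch with IRQs off.
 * If @next is already the loaded mm it only repairs mm_cpumask() and
 * returns.  Otherwise it issues an IBPB when ibpb_needed(), syncs the
 * current stack's mapping if it lives in vmalloc space, moves this CPU's
 * mm_cpumask() bit from the old mm to @next, picks an ASID with
 * choose_new_asid(), writes CR3 (flushing or not), publishes
 * loaded_mm/loaded_mm_asid and last_ctx_id, and finally reloads CR4 and
 * the LDT for @next.
 */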
197078194f8SAndy Lutomirski void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
198078194f8SAndy Lutomirski 			struct task_struct *tsk)
199078194f8SAndy Lutomirski {
2003d28ebceSAndy Lutomirski 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
20110af6235SAndy Lutomirski 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
20294b1b03bSAndy Lutomirski 	unsigned cpu = smp_processor_id();
20394b1b03bSAndy Lutomirski 	u64 next_tlb_gen;
20469c0319aSAndy Lutomirski 
2053d28ebceSAndy Lutomirski 	/*
20694b1b03bSAndy Lutomirski 	 * NB: The scheduler will call us with prev == next when switching
20794b1b03bSAndy Lutomirski 	 * from lazy TLB mode to normal mode if active_mm isn't changing.
20894b1b03bSAndy Lutomirski 	 * When this happens, we don't assume that CR3 (and hence
20994b1b03bSAndy Lutomirski 	 * cpu_tlbstate.loaded_mm) matches next.
2103d28ebceSAndy Lutomirski 	 *
2113d28ebceSAndy Lutomirski 	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
2123d28ebceSAndy Lutomirski 	 */
2133d28ebceSAndy Lutomirski 
21494b1b03bSAndy Lutomirski 	/* We don't want flush_tlb_func_* to run concurrently with us. */
21594b1b03bSAndy Lutomirski 	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
21694b1b03bSAndy Lutomirski 		WARN_ON_ONCE(!irqs_disabled());
21794b1b03bSAndy Lutomirski 
21894b1b03bSAndy Lutomirski 	/*
21994b1b03bSAndy Lutomirski 	 * Verify that CR3 is what we think it is.  This will catch
22094b1b03bSAndy Lutomirski 	 * hypothetical buggy code that directly switches to swapper_pg_dir
22110af6235SAndy Lutomirski 	 * without going through leave_mm() / switch_mm_irqs_off() or that
22210af6235SAndy Lutomirski 	 * does something like write_cr3(read_cr3_pa()).
223a376e7f9SAndy Lutomirski 	 *
224a376e7f9SAndy Lutomirski 	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
225a376e7f9SAndy Lutomirski 	 * isn't free.
22694b1b03bSAndy Lutomirski 	 */
227a376e7f9SAndy Lutomirski #ifdef CONFIG_DEBUG_VM
22850fb83a6SDave Hansen 	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
229a376e7f9SAndy Lutomirski 		/*
230a376e7f9SAndy Lutomirski 		 * If we were to BUG here, we'd be very likely to kill
231a376e7f9SAndy Lutomirski 		 * the system so hard that we don't see the call trace.
232a376e7f9SAndy Lutomirski 		 * Try to recover instead by ignoring the error and doing
233a376e7f9SAndy Lutomirski 		 * a global flush to minimize the chance of corruption.
234a376e7f9SAndy Lutomirski 		 *
235a376e7f9SAndy Lutomirski 		 * (This is far from being a fully correct recovery.
236a376e7f9SAndy Lutomirski 		 *  Architecturally, the CPU could prefetch something
237a376e7f9SAndy Lutomirski 		 *  back into an incorrect ASID slot and leave it there
238a376e7f9SAndy Lutomirski 		 *  to cause trouble down the road.  It's better than
239a376e7f9SAndy Lutomirski 		 *  nothing, though.)
240a376e7f9SAndy Lutomirski 		 */
241a376e7f9SAndy Lutomirski 		__flush_tlb_all();
242a376e7f9SAndy Lutomirski 	}
243a376e7f9SAndy Lutomirski #endif
244b956575bSAndy Lutomirski 	this_cpu_write(cpu_tlbstate.is_lazy, false);
2453d28ebceSAndy Lutomirski 
246306e0604SMathieu Desnoyers 	/*
24710bcc80eSMathieu Desnoyers 	 * The membarrier system call requires a full memory barrier and
24810bcc80eSMathieu Desnoyers 	 * core serialization before returning to user-space, after
24910bcc80eSMathieu Desnoyers 	 * storing to rq->curr. Writing to CR3 provides that full
25010bcc80eSMathieu Desnoyers 	 * memory barrier and core serializing instruction.
251306e0604SMathieu Desnoyers 	 */
2523d28ebceSAndy Lutomirski 	if (real_prev == next) {
253e8b9b0ccSAndy Lutomirski 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
25494b1b03bSAndy Lutomirski 			   next->context.ctx_id);
25594b1b03bSAndy Lutomirski 
2563d28ebceSAndy Lutomirski 		/*
25752a288c7SPeter Zijlstra 		 * We don't currently support having a real mm loaded without
25852a288c7SPeter Zijlstra 		 * our cpu set in mm_cpumask().  We have all the bookkeeping
25952a288c7SPeter Zijlstra 		 * in place to figure out whether we would need to flush
26052a288c7SPeter Zijlstra 		 * if our cpu were cleared in mm_cpumask(), but we don't
26152a288c7SPeter Zijlstra 		 * currently use it.
2623d28ebceSAndy Lutomirski 		 */
263b956575bSAndy Lutomirski 		if (WARN_ON_ONCE(real_prev != &init_mm &&
264b956575bSAndy Lutomirski 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
26594b1b03bSAndy Lutomirski 			cpumask_set_cpu(cpu, mm_cpumask(next));
26694b1b03bSAndy Lutomirski 
267b956575bSAndy Lutomirski 		return;
26894b1b03bSAndy Lutomirski 	} else {
26952a288c7SPeter Zijlstra 		u16 new_asid;
27052a288c7SPeter Zijlstra 		bool need_flush;
27118bf3c3eSTim Chen 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
27218bf3c3eSTim Chen 
27318bf3c3eSTim Chen 		/*
27418bf3c3eSTim Chen 		 * Avoid user/user BTB poisoning by flushing the branch
27518bf3c3eSTim Chen 		 * predictor when switching between processes. This stops
27618bf3c3eSTim Chen 		 * one process from doing Spectre-v2 attacks on another.
27718bf3c3eSTim Chen 		 *
27818bf3c3eSTim Chen 		 * As an optimization, flush indirect branches only when
279*dbfe2953SJiri Kosina 		 * switching into a process that can't be ptraced by the
280*dbfe2953SJiri Kosina 		 * current one (in that case the attacker has a much more
281*dbfe2953SJiri Kosina 		 * convenient way to tamper with the next process than
282*dbfe2953SJiri Kosina 		 * branch buffer poisoning).
28318bf3c3eSTim Chen 		 */
284*dbfe2953SJiri Kosina 		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
285*dbfe2953SJiri Kosina 				ibpb_needed(tsk, last_ctx_id))
28618bf3c3eSTim Chen 			indirect_branch_prediction_barrier();
28794b1b03bSAndy Lutomirski 
288e37e43a4SAndy Lutomirski 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
289e37e43a4SAndy Lutomirski 			/*
290e37e43a4SAndy Lutomirski 			 * If our current stack is in vmalloc space and isn't
291e37e43a4SAndy Lutomirski 			 * mapped in the new pgd, we'll double-fault.  Forcibly
292e37e43a4SAndy Lutomirski 			 * map it.
293e37e43a4SAndy Lutomirski 			 */
2945beda7d5SAndy Lutomirski 			sync_current_stack_to_mm(next);
295e37e43a4SAndy Lutomirski 		}
296e37e43a4SAndy Lutomirski 
297e9d8c615SRik van Riel 		/*
298e9d8c615SRik van Riel 		 * Stop remote flushes for the previous mm.
299e9d8c615SRik van Riel 		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
300e9d8c615SRik van Riel 		 * but the bitmap manipulation can cause cache line contention.
301e9d8c615SRik van Riel 		 */
302e9d8c615SRik van Riel 		if (real_prev != &init_mm) {
303e9d8c615SRik van Riel 			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
304e9d8c615SRik van Riel 						mm_cpumask(real_prev)));
30594b1b03bSAndy Lutomirski 			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
306e9d8c615SRik van Riel 		}
307e37e43a4SAndy Lutomirski 
30869c0319aSAndy Lutomirski 		/*
30994b1b03bSAndy Lutomirski 		 * Start remote flushes and then read tlb_gen.
31069c0319aSAndy Lutomirski 		 */
311e9d8c615SRik van Riel 		if (next != &init_mm)
31294b1b03bSAndy Lutomirski 			cpumask_set_cpu(cpu, mm_cpumask(next));
31394b1b03bSAndy Lutomirski 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
31494b1b03bSAndy Lutomirski 
31510af6235SAndy Lutomirski 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
31669c0319aSAndy Lutomirski 
3174012e77aSAndy Lutomirski 		/* Let nmi_uaccess_okay() know that we're changing CR3. */
3184012e77aSAndy Lutomirski 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
3194012e77aSAndy Lutomirski 		barrier();
3204012e77aSAndy Lutomirski 
32110af6235SAndy Lutomirski 		if (need_flush) {
32210af6235SAndy Lutomirski 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
32310af6235SAndy Lutomirski 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
32448e11198SDave Hansen 			load_new_mm_cr3(next->pgd, new_asid, true);
32567535736SAndy Lutomirski 
32667535736SAndy Lutomirski 			/*
32767535736SAndy Lutomirski 			 * NB: This gets called via leave_mm() in the idle path
32867535736SAndy Lutomirski 			 * where RCU functions differently.  Tracing normally
32967535736SAndy Lutomirski 			 * uses RCU, so we need to use the _rcuidle variant.
33067535736SAndy Lutomirski 			 *
33167535736SAndy Lutomirski 			 * (There is no good reason for this.  The idle code should
33267535736SAndy Lutomirski 			 *  be rearranged to call this before rcu_idle_enter().)
33367535736SAndy Lutomirski 			 */
33467535736SAndy Lutomirski 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
33510af6235SAndy Lutomirski 		} else {
33610af6235SAndy Lutomirski 			/* The new ASID is already up to date. */
33748e11198SDave Hansen 			load_new_mm_cr3(next->pgd, new_asid, false);
33867535736SAndy Lutomirski 
33967535736SAndy Lutomirski 			/* See above wrt _rcuidle. */
34067535736SAndy Lutomirski 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
34110af6235SAndy Lutomirski 		}
34210af6235SAndy Lutomirski 
34318bf3c3eSTim Chen 		/*
34418bf3c3eSTim Chen 		 * Record the last user mm's context id, so we can avoid
34518bf3c3eSTim Chen 		 * flushing the branch predictor with IBPB if we switch back
34618bf3c3eSTim Chen 		 * to the same user.
34718bf3c3eSTim Chen 		 */
34818bf3c3eSTim Chen 		if (next != &init_mm)
34918bf3c3eSTim Chen 			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
35018bf3c3eSTim Chen 
3514012e77aSAndy Lutomirski 		/* Make sure we write CR3 before loaded_mm. */
3524012e77aSAndy Lutomirski 		barrier();
3534012e77aSAndy Lutomirski 
35410af6235SAndy Lutomirski 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
35510af6235SAndy Lutomirski 		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
35652a288c7SPeter Zijlstra 	}
35769c0319aSAndy Lutomirski 
35869c0319aSAndy Lutomirski 	load_mm_cr4(next);
35973534258SAndy Lutomirski 	switch_ldt(real_prev, next);
36069c0319aSAndy Lutomirski }
36169c0319aSAndy Lutomirski 
362b0579adeSAndy Lutomirski /*
3634e57b946SAndy Lutomirski  * Please ignore the name of this function.  It should be called
3644e57b946SAndy Lutomirski  * switch_to_kernel_thread().
3654e57b946SAndy Lutomirski  *
366b956575bSAndy Lutomirski  * enter_lazy_tlb() is a hint from the scheduler that we are entering a
367b956575bSAndy Lutomirski  * kernel thread or other context without an mm.  Acceptable implementations
368b956575bSAndy Lutomirski  * include doing nothing whatsoever, switching to init_mm, or various clever
369b956575bSAndy Lutomirski  * lazy tricks to try to minimize TLB flushes.
370b956575bSAndy Lutomirski  *
371b956575bSAndy Lutomirski  * The scheduler reserves the right to call enter_lazy_tlb() several times
372b956575bSAndy Lutomirski  * in a row.  It will notify us that we're going back to a real mm by
373b956575bSAndy Lutomirski  * calling switch_mm_irqs_off().
374b956575bSAndy Lutomirski  */
375b956575bSAndy Lutomirski void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
376b956575bSAndy Lutomirski {
377b956575bSAndy Lutomirski 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
378b956575bSAndy Lutomirski 		return;
379b956575bSAndy Lutomirski 
38052a288c7SPeter Zijlstra 	if (tlb_defer_switch_to_init_mm()) {
38152a288c7SPeter Zijlstra 		/*
38252a288c7SPeter Zijlstra 		 * There's a significant optimization that may be possible
38352a288c7SPeter Zijlstra 		 * here.  We have accurate enough TLB flush tracking that we
38452a288c7SPeter Zijlstra 		 * don't need to maintain coherence of TLB per se when we're
38552a288c7SPeter Zijlstra 		 * lazy.  We do, however, need to maintain coherence of
38652a288c7SPeter Zijlstra 		 * paging-structure caches.  We could, in principle, leave our
38752a288c7SPeter Zijlstra 		 * old mm loaded and only switch to init_mm when
38852a288c7SPeter Zijlstra 		 * tlb_remove_page() happens.
38952a288c7SPeter Zijlstra 		 */
390b956575bSAndy Lutomirski 		this_cpu_write(cpu_tlbstate.is_lazy, true);
39152a288c7SPeter Zijlstra 	} else {
39252a288c7SPeter Zijlstra 		switch_mm(NULL, &init_mm, NULL);
39352a288c7SPeter Zijlstra 	}
394b956575bSAndy Lutomirski }
395b956575bSAndy Lutomirski 
396b956575bSAndy Lutomirski /*
39772c0098dSAndy Lutomirski  * Call this when reinitializing a CPU.  It fixes the following potential
39872c0098dSAndy Lutomirski  * problems:
39972c0098dSAndy Lutomirski  *
40072c0098dSAndy Lutomirski  * - The ASID changed from what cpu_tlbstate thinks it is (most likely
40172c0098dSAndy Lutomirski  *   because the CPU was taken down and came back up with CR3's PCID
40272c0098dSAndy Lutomirski  *   bits clear).  CPU hotplug can do this.
40372c0098dSAndy Lutomirski  *
40472c0098dSAndy Lutomirski  * - The TLB contains junk in slots corresponding to inactive ASIDs.
40572c0098dSAndy Lutomirski  *
40672c0098dSAndy Lutomirski  * - The CPU went so far out to lunch that it may have missed a TLB
40772c0098dSAndy Lutomirski  *   flush.
40872c0098dSAndy Lutomirski  */
40972c0098dSAndy Lutomirski void initialize_tlbstate_and_flush(void)
41072c0098dSAndy Lutomirski {
41172c0098dSAndy Lutomirski 	int i;
41272c0098dSAndy Lutomirski 	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
41372c0098dSAndy Lutomirski 	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
41472c0098dSAndy Lutomirski 	unsigned long cr3 = __read_cr3();
41572c0098dSAndy Lutomirski 
41672c0098dSAndy Lutomirski 	/* Assert that CR3 already references the right mm. */
41772c0098dSAndy Lutomirski 	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
41872c0098dSAndy Lutomirski 
41972c0098dSAndy Lutomirski 	/*
42072c0098dSAndy Lutomirski 	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
42172c0098dSAndy Lutomirski 	 * doesn't work like other CR4 bits because it can only be set from
42272c0098dSAndy Lutomirski 	 * long mode.)
42372c0098dSAndy Lutomirski 	 */
4247898f796SAndy Lutomirski 	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
42572c0098dSAndy Lutomirski 		!(cr4_read_shadow() & X86_CR4_PCIDE));
42672c0098dSAndy Lutomirski 
42772c0098dSAndy Lutomirski 	/* Force ASID 0 and force a TLB flush. */
42850fb83a6SDave Hansen 	write_cr3(build_cr3(mm->pgd, 0));
42972c0098dSAndy Lutomirski 
43072c0098dSAndy Lutomirski 	/* Reinitialize tlbstate. */
43118bf3c3eSTim Chen 	this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
43272c0098dSAndy Lutomirski 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
43372c0098dSAndy Lutomirski 	this_cpu_write(cpu_tlbstate.next_asid, 1);
43472c0098dSAndy Lutomirski 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
43572c0098dSAndy Lutomirski 	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
43672c0098dSAndy Lutomirski 
43772c0098dSAndy Lutomirski 	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
43872c0098dSAndy Lutomirski 		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
43972c0098dSAndy Lutomirski }
44072c0098dSAndy Lutomirski 
44172c0098dSAndy Lutomirski /*
442b0579adeSAndy Lutomirski  * flush_tlb_func_common()'s memory ordering requirement is that any
443b0579adeSAndy Lutomirski  * TLB fills that happen after we flush the TLB are ordered after we
444b0579adeSAndy Lutomirski  * read active_mm's tlb_gen.  We don't need any explicit barriers
445b0579adeSAndy Lutomirski  * because all x86 flush operations are serializing and the
446b0579adeSAndy Lutomirski  * atomic64_read operation won't be reordered by the compiler.
447b0579adeSAndy Lutomirski  */
448454bbad9SAndy Lutomirski static void flush_tlb_func_common(const struct flush_tlb_info *f,
449454bbad9SAndy Lutomirski 				  bool local, enum tlb_flush_reason reason)
45055f4949fSIngo Molnar {
451b0579adeSAndy Lutomirski 	/*
452b0579adeSAndy Lutomirski 	 * We have three different tlb_gen values in here.  They are:
453b0579adeSAndy Lutomirski 	 *
454b0579adeSAndy Lutomirski 	 * - mm_tlb_gen:     the latest generation.
455b0579adeSAndy Lutomirski 	 * - local_tlb_gen:  the generation that this CPU has already caught
456b0579adeSAndy Lutomirski 	 *                   up to.
457b0579adeSAndy Lutomirski 	 * - f->new_tlb_gen: the generation that the requester of the flush
458b0579adeSAndy Lutomirski 	 *                   wants us to catch up to.
459b0579adeSAndy Lutomirski 	 */
460b0579adeSAndy Lutomirski 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
46110af6235SAndy Lutomirski 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
462b0579adeSAndy Lutomirski 	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
46310af6235SAndy Lutomirski 	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
464b0579adeSAndy Lutomirski 
465bc0d5a89SAndy Lutomirski 	/* This code cannot presently handle being reentered. */
466bc0d5a89SAndy Lutomirski 	VM_WARN_ON(!irqs_disabled());
467bc0d5a89SAndy Lutomirski 
468b956575bSAndy Lutomirski 	if (unlikely(loaded_mm == &init_mm))
469b956575bSAndy Lutomirski 		return;
470b956575bSAndy Lutomirski 
47110af6235SAndy Lutomirski 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
472b0579adeSAndy Lutomirski 		   loaded_mm->context.ctx_id);
473b0579adeSAndy Lutomirski 
474b956575bSAndy Lutomirski 	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
475b0579adeSAndy Lutomirski 		/*
476b956575bSAndy Lutomirski 		 * We're in lazy mode.  We need to at least flush our
477b956575bSAndy Lutomirski 		 * paging-structure cache to avoid speculatively reading
478b956575bSAndy Lutomirski 		 * garbage into our TLB.  Since switching to init_mm is barely
479b956575bSAndy Lutomirski 		 * slower than a minimal flush, just switch to init_mm.
480b0579adeSAndy Lutomirski 		 */
481b956575bSAndy Lutomirski 		switch_mm_irqs_off(NULL, &init_mm, NULL);
482b3b90e5aSAndy Lutomirski 		return;
483b3b90e5aSAndy Lutomirski 	}
484b3b90e5aSAndy Lutomirski 
485b0579adeSAndy Lutomirski 	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
486b0579adeSAndy Lutomirski 		/*
487b0579adeSAndy Lutomirski 		 * There's nothing to do: we're already up to date.  This can
488b0579adeSAndy Lutomirski 		 * happen if two concurrent flushes happen -- the first flush to
489b0579adeSAndy Lutomirski 		 * be handled can catch us all the way up, leaving no work for
490b0579adeSAndy Lutomirski 		 * the second flush.
491b0579adeSAndy Lutomirski 		 */
49294b1b03bSAndy Lutomirski 		trace_tlb_flush(reason, 0);
493b0579adeSAndy Lutomirski 		return;
494b0579adeSAndy Lutomirski 	}
495b0579adeSAndy Lutomirski 
496b0579adeSAndy Lutomirski 	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
497b0579adeSAndy Lutomirski 	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
498b0579adeSAndy Lutomirski 
499b0579adeSAndy Lutomirski 	/*
500b0579adeSAndy Lutomirski 	 * If we get to this point, we know that our TLB is out of date.
501b0579adeSAndy Lutomirski 	 * This does not strictly imply that we need to flush (it's
502b0579adeSAndy Lutomirski 	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
503b0579adeSAndy Lutomirski 	 * going to need to flush in the very near future, so we might
504b0579adeSAndy Lutomirski 	 * as well get it over with.
505b0579adeSAndy Lutomirski 	 *
506b0579adeSAndy Lutomirski 	 * The only question is whether to do a full or partial flush.
507b0579adeSAndy Lutomirski 	 *
508b0579adeSAndy Lutomirski 	 * We do a partial flush if requested and two extra conditions
509b0579adeSAndy Lutomirski 	 * are met:
510b0579adeSAndy Lutomirski 	 *
511b0579adeSAndy Lutomirski 	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
512b0579adeSAndy Lutomirski 	 *    we've always done all needed flushes to catch up to
513b0579adeSAndy Lutomirski 	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
514b0579adeSAndy Lutomirski 	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
515b0579adeSAndy Lutomirski 	 *    us up to date for tlb_gen 3 is the partial flush we're
516b0579adeSAndy Lutomirski 	 *    processing.
517b0579adeSAndy Lutomirski 	 *
518b0579adeSAndy Lutomirski 	 *    As an example of why this check is needed, suppose that there
519b0579adeSAndy Lutomirski 	 *    are two concurrent flushes.  The first is a full flush that
520b0579adeSAndy Lutomirski 	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
521b0579adeSAndy Lutomirski 	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
522b0579adeSAndy Lutomirski 	 *    processed on this CPU in reverse order, we'll see
523b0579adeSAndy Lutomirski 	 *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
5241299ef1dSAndy Lutomirski 	 *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
525b0579adeSAndy Lutomirski 	 *    3, we'd break the invariant: we'd update local_tlb_gen above
526b0579adeSAndy Lutomirski 	 *    1 without the full flush that's needed for tlb_gen 2.
527b0579adeSAndy Lutomirski 	 *
528b0579adeSAndy Lutomirski 	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
529b0579adeSAndy Lutomirski 	 *    Partial TLB flushes are not all that much cheaper than full TLB
530b0579adeSAndy Lutomirski 	 *    flushes, so it seems unlikely that it would be a performance win
531b0579adeSAndy Lutomirski 	 *    to do a partial flush if that won't bring our TLB fully up to
532b0579adeSAndy Lutomirski 	 *    date.  By doing a full flush instead, we can increase
533b0579adeSAndy Lutomirski 	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
534b0579adeSAndy Lutomirski 	 *    avoid another flush in the very near future.
535b0579adeSAndy Lutomirski 	 */
536b0579adeSAndy Lutomirski 	if (f->end != TLB_FLUSH_ALL &&
537b0579adeSAndy Lutomirski 	    f->new_tlb_gen == local_tlb_gen + 1 &&
538b0579adeSAndy Lutomirski 	    f->new_tlb_gen == mm_tlb_gen) {
539b0579adeSAndy Lutomirski 		/* Partial flush */
540e7b52ffdSAlex Shi 		unsigned long addr;
541be4ffc0dSAndy Lutomirski 		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
542b0579adeSAndy Lutomirski 
543a2055abeSAndy Lutomirski 		addr = f->start;
544a2055abeSAndy Lutomirski 		while (addr < f->end) {
5451299ef1dSAndy Lutomirski 			__flush_tlb_one_user(addr);
546e7b52ffdSAlex Shi 			addr += PAGE_SIZE;
547e7b52ffdSAlex Shi 		}
548454bbad9SAndy Lutomirski 		if (local)
549454bbad9SAndy Lutomirski 			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
550454bbad9SAndy Lutomirski 		trace_tlb_flush(reason, nr_pages);
551b0579adeSAndy Lutomirski 	} else {
552b0579adeSAndy Lutomirski 		/* Full flush. */
553b0579adeSAndy Lutomirski 		local_flush_tlb();
554b0579adeSAndy Lutomirski 		if (local)
555b0579adeSAndy Lutomirski 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
556b0579adeSAndy Lutomirski 		trace_tlb_flush(reason, TLB_FLUSH_ALL);
557e7b52ffdSAlex Shi 	}
558b0579adeSAndy Lutomirski 
559b0579adeSAndy Lutomirski 	/* Both paths above update our state to mm_tlb_gen. */
56010af6235SAndy Lutomirski 	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
56155f4949fSIngo Molnar }
56255f4949fSIngo Molnar 
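/*
 * The generation logic above can be hard to follow in the abstract, so here
 * is a minimal user-space sketch of just the partial-vs-full decision.  It
 * is an illustration only (hence the #if 0): the struct and function names
 * below are invented for the sketch and are not kernel API.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

struct toy_flush {
	unsigned long long new_tlb_gen;	/* generation the requester wants us to reach */
	bool partial;			/* the requester asked for a range flush */
};

static const char *toy_flush_kind(unsigned long long local_tlb_gen,
				  unsigned long long mm_tlb_gen,
				  const struct toy_flush *f)
{
	if (local_tlb_gen == mm_tlb_gen)
		return "nothing to do";		/* already caught up */

	/* Partial only if this request is the one step that catches us up. */
	if (f->partial &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen)
		return "partial flush";

	return "full flush";
}

int main(void)
{
	struct toy_flush partial3 = { .new_tlb_gen = 3, .partial = true };

	/* The full flush for gen 2 was handled first: gen 3 can be partial. */
	printf("local=2 mm=3: %s\n", toy_flush_kind(2, 3, &partial3));
	/* Flushes raced and arrived out of order: must flush everything. */
	printf("local=1 mm=3: %s\n", toy_flush_kind(1, 3, &partial3));
	return 0;
}
#endif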
563454bbad9SAndy Lutomirski static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
564454bbad9SAndy Lutomirski {
565454bbad9SAndy Lutomirski 	const struct flush_tlb_info *f = info;
566454bbad9SAndy Lutomirski 
567454bbad9SAndy Lutomirski 	flush_tlb_func_common(f, true, reason);
568454bbad9SAndy Lutomirski }
569454bbad9SAndy Lutomirski 
570454bbad9SAndy Lutomirski static void flush_tlb_func_remote(void *info)
571454bbad9SAndy Lutomirski {
572454bbad9SAndy Lutomirski 	const struct flush_tlb_info *f = info;
573454bbad9SAndy Lutomirski 
574454bbad9SAndy Lutomirski 	inc_irq_stat(irq_tlb_count);
575454bbad9SAndy Lutomirski 
5763d28ebceSAndy Lutomirski 	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
577454bbad9SAndy Lutomirski 		return;
578454bbad9SAndy Lutomirski 
579454bbad9SAndy Lutomirski 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
580454bbad9SAndy Lutomirski 	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
581454bbad9SAndy Lutomirski }
582454bbad9SAndy Lutomirski 
58355f4949fSIngo Molnar void native_flush_tlb_others(const struct cpumask *cpumask,
584a2055abeSAndy Lutomirski 			     const struct flush_tlb_info *info)
58555f4949fSIngo Molnar {
586ec659934SMel Gorman 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
587a2055abeSAndy Lutomirski 	if (info->end == TLB_FLUSH_ALL)
58818c98243SNadav Amit 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
58918c98243SNadav Amit 	else
59018c98243SNadav Amit 		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
591a2055abeSAndy Lutomirski 				(info->end - info->start) >> PAGE_SHIFT);
59218c98243SNadav Amit 
59355f4949fSIngo Molnar 	if (is_uv_system()) {
59494b1b03bSAndy Lutomirski 		/*
59594b1b03bSAndy Lutomirski 		 * This whole special case is confused.  UV has a "Broadcast
59694b1b03bSAndy Lutomirski 		 * Assist Unit", which seems to be a fancy way to send IPIs.
59794b1b03bSAndy Lutomirski 		 * Back when x86 used an explicit TLB flush IPI, UV was
59894b1b03bSAndy Lutomirski 		 * optimized to use its own mechanism.  These days, x86 uses
59994b1b03bSAndy Lutomirski 		 * smp_call_function_many(), but UV still uses a manual IPI,
60094b1b03bSAndy Lutomirski 		 * and that IPI's action is out of date -- it does a manual
60194b1b03bSAndy Lutomirski 		 * flush instead of calling flush_tlb_func_remote().  This
60294b1b03bSAndy Lutomirski 		 * means that the percpu tlb_gen variables won't be updated
60394b1b03bSAndy Lutomirski 		 * and we'll do pointless flushes on future context switches.
60494b1b03bSAndy Lutomirski 		 *
60594b1b03bSAndy Lutomirski 		 * Rather than hooking native_flush_tlb_others() here, I think
60694b1b03bSAndy Lutomirski 		 * that UV should be updated so that smp_call_function_many(),
60794b1b03bSAndy Lutomirski 		 * etc, are optimal on UV.
60894b1b03bSAndy Lutomirski 		 */
60952a288c7SPeter Zijlstra 		unsigned int cpu;
61052a288c7SPeter Zijlstra 
61125542c64SXiao Guangrong 		cpu = smp_processor_id();
612a2055abeSAndy Lutomirski 		cpumask = uv_flush_tlb_others(cpumask, info);
61355f4949fSIngo Molnar 		if (cpumask)
614454bbad9SAndy Lutomirski 			smp_call_function_many(cpumask, flush_tlb_func_remote,
615a2055abeSAndy Lutomirski 					       (void *)info, 1);
61655f4949fSIngo Molnar 		return;
61755f4949fSIngo Molnar 	}
618454bbad9SAndy Lutomirski 	smp_call_function_many(cpumask, flush_tlb_func_remote,
619a2055abeSAndy Lutomirski 			       (void *)info, 1);
62055f4949fSIngo Molnar }
62155f4949fSIngo Molnar 
622a5102476SDave Hansen /*
623a5102476SDave Hansen  * See Documentation/x86/tlb.txt for details.  We choose 33
624a5102476SDave Hansen  * because it is large enough to cover the vast majority (at
625a5102476SDave Hansen  * least 95%) of allocations, and is small enough that we are
626a5102476SDave Hansen  * confident it will not cause too much overhead.  Each single
627a5102476SDave Hansen  * flush is about 100 ns, so this caps the maximum overhead at
628a5102476SDave Hansen  * _about_ 3,000 ns.
629a5102476SDave Hansen  *
630a5102476SDave Hansen  * This is in units of pages.
631a5102476SDave Hansen  */
63286426851SJeremiah Mahler static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
633e9f4e0a9SDave Hansen 
634611ae8e3SAlex Shi void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
635611ae8e3SAlex Shi 				unsigned long end, unsigned long vmflag)
636611ae8e3SAlex Shi {
637454bbad9SAndy Lutomirski 	int cpu;
638e7b52ffdSAlex Shi 
639515ab7c4SNadav Amit 	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
640454bbad9SAndy Lutomirski 		.mm = mm,
641454bbad9SAndy Lutomirski 	};
642ce27374fSAndy Lutomirski 
643454bbad9SAndy Lutomirski 	cpu = get_cpu();
64471b3c126SAndy Lutomirski 
645f39681edSAndy Lutomirski 	/* This is also a barrier that synchronizes with switch_mm(). */
646b0579adeSAndy Lutomirski 	info.new_tlb_gen = inc_mm_tlb_gen(mm);
64771b3c126SAndy Lutomirski 
648454bbad9SAndy Lutomirski 	/* Should we flush just the requested range? */
649454bbad9SAndy Lutomirski 	if ((end != TLB_FLUSH_ALL) &&
650454bbad9SAndy Lutomirski 	    !(vmflag & VM_HUGETLB) &&
651454bbad9SAndy Lutomirski 	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
652a2055abeSAndy Lutomirski 		info.start = start;
653a2055abeSAndy Lutomirski 		info.end = end;
654454bbad9SAndy Lutomirski 	} else {
655454bbad9SAndy Lutomirski 		info.start = 0UL;
656454bbad9SAndy Lutomirski 		info.end = TLB_FLUSH_ALL;
6574995ab9cSDave Hansen 	}
658454bbad9SAndy Lutomirski 
659bc0d5a89SAndy Lutomirski 	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
660bc0d5a89SAndy Lutomirski 		VM_WARN_ON(irqs_disabled());
661bc0d5a89SAndy Lutomirski 		local_irq_disable();
662454bbad9SAndy Lutomirski 		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
663bc0d5a89SAndy Lutomirski 		local_irq_enable();
664bc0d5a89SAndy Lutomirski 	}
665bc0d5a89SAndy Lutomirski 
666454bbad9SAndy Lutomirski 	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
667a2055abeSAndy Lutomirski 		flush_tlb_others(mm_cpumask(mm), &info);
66894b1b03bSAndy Lutomirski 
669454bbad9SAndy Lutomirski 	put_cpu();
670e7b52ffdSAlex Shi }
671e7b52ffdSAlex Shi 
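/*
 * A minimal usage sketch for flush_tlb_mm_range(), assuming a caller that
 * has just changed the PTE for a single small page of @vma.  The helper
 * name example_flush_one_page() is invented for illustration only.
 */
#if 0
static void example_flush_one_page(struct vm_area_struct *vma,
				   unsigned long addr)
{
	unsigned long start = addr & PAGE_MASK;

	/* One small page: well under the ceiling, so INVLPG is used. */
	flush_tlb_mm_range(vma->vm_mm, start, start + PAGE_SIZE,
			   vma->vm_flags);
}
#endif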
672a2055abeSAndy Lutomirski 
67355f4949fSIngo Molnar static void do_flush_tlb_all(void *info)
67455f4949fSIngo Molnar {
675ec659934SMel Gorman 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
67655f4949fSIngo Molnar 	__flush_tlb_all();
67755f4949fSIngo Molnar }
67855f4949fSIngo Molnar 
67955f4949fSIngo Molnar void flush_tlb_all(void)
68055f4949fSIngo Molnar {
681ec659934SMel Gorman 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
68255f4949fSIngo Molnar 	on_each_cpu(do_flush_tlb_all, NULL, 1);
68355f4949fSIngo Molnar }
6843df3212fSAlex Shi 
685effee4b9SAlex Shi static void do_kernel_range_flush(void *info)
686effee4b9SAlex Shi {
687effee4b9SAlex Shi 	struct flush_tlb_info *f = info;
688effee4b9SAlex Shi 	unsigned long addr;
689effee4b9SAlex Shi 
690effee4b9SAlex Shi 	/* Flush the range one page at a time with 'invlpg' */
691a2055abeSAndy Lutomirski 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
6921299ef1dSAndy Lutomirski 		__flush_tlb_one_kernel(addr);
693effee4b9SAlex Shi }
694effee4b9SAlex Shi 
695effee4b9SAlex Shi void flush_tlb_kernel_range(unsigned long start, unsigned long end)
696effee4b9SAlex Shi {
697effee4b9SAlex Shi 
698effee4b9SAlex Shi 	/* Balance against a user space task's flush; be a bit conservative */
699e9f4e0a9SDave Hansen 	if (end == TLB_FLUSH_ALL ||
700be4ffc0dSAndy Lutomirski 	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
701effee4b9SAlex Shi 		on_each_cpu(do_flush_tlb_all, NULL, 1);
702e9f4e0a9SDave Hansen 	} else {
703e9f4e0a9SDave Hansen 		struct flush_tlb_info info;
704a2055abeSAndy Lutomirski 		info.start = start;
705a2055abeSAndy Lutomirski 		info.end = end;
706effee4b9SAlex Shi 		on_each_cpu(do_kernel_range_flush, &info, 1);
707effee4b9SAlex Shi 	}
708effee4b9SAlex Shi }
7092d040a1cSDave Hansen 
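/*
 * Perform a batch of deferred TLB flushes: @batch->cpumask names every CPU
 * that may still hold stale entries (it is filled elsewhere, by the unmap
 * batching code).  Everything is flushed -- mm == NULL, TLB_FLUSH_ALL --
 * locally if this CPU is in the mask and via flush_tlb_others() for the
 * rest, after which the mask is cleared for reuse.
 */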
710e73ad5ffSAndy Lutomirski void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
711e73ad5ffSAndy Lutomirski {
712a2055abeSAndy Lutomirski 	struct flush_tlb_info info = {
713a2055abeSAndy Lutomirski 		.mm = NULL,
714a2055abeSAndy Lutomirski 		.start = 0UL,
715a2055abeSAndy Lutomirski 		.end = TLB_FLUSH_ALL,
716a2055abeSAndy Lutomirski 	};
717a2055abeSAndy Lutomirski 
718e73ad5ffSAndy Lutomirski 	int cpu = get_cpu();
719e73ad5ffSAndy Lutomirski 
720bc0d5a89SAndy Lutomirski 	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
721bc0d5a89SAndy Lutomirski 		VM_WARN_ON(irqs_disabled());
722bc0d5a89SAndy Lutomirski 		local_irq_disable();
7233f79e4c7SAndy Lutomirski 		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
724bc0d5a89SAndy Lutomirski 		local_irq_enable();
725bc0d5a89SAndy Lutomirski 	}
726bc0d5a89SAndy Lutomirski 
727e73ad5ffSAndy Lutomirski 	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
728a2055abeSAndy Lutomirski 		flush_tlb_others(&batch->cpumask, &info);
72994b1b03bSAndy Lutomirski 
730e73ad5ffSAndy Lutomirski 	cpumask_clear(&batch->cpumask);
731e73ad5ffSAndy Lutomirski 
732e73ad5ffSAndy Lutomirski 	put_cpu();
733e73ad5ffSAndy Lutomirski }
734e73ad5ffSAndy Lutomirski 
7352d040a1cSDave Hansen static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
7362d040a1cSDave Hansen 			     size_t count, loff_t *ppos)
7372d040a1cSDave Hansen {
7382d040a1cSDave Hansen 	char buf[32];
7392d040a1cSDave Hansen 	unsigned int len;
7402d040a1cSDave Hansen 
7412d040a1cSDave Hansen 	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
7422d040a1cSDave Hansen 	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
7432d040a1cSDave Hansen }
7442d040a1cSDave Hansen 
7452d040a1cSDave Hansen static ssize_t tlbflush_write_file(struct file *file,
7462d040a1cSDave Hansen 		 const char __user *user_buf, size_t count, loff_t *ppos)
7472d040a1cSDave Hansen {
7482d040a1cSDave Hansen 	char buf[32];
7492d040a1cSDave Hansen 	ssize_t len;
7502d040a1cSDave Hansen 	int ceiling;
7512d040a1cSDave Hansen 
7522d040a1cSDave Hansen 	len = min(count, sizeof(buf) - 1);
7532d040a1cSDave Hansen 	if (copy_from_user(buf, user_buf, len))
7542d040a1cSDave Hansen 		return -EFAULT;
7552d040a1cSDave Hansen 
7562d040a1cSDave Hansen 	buf[len] = '\0';
7572d040a1cSDave Hansen 	if (kstrtoint(buf, 0, &ceiling))
7582d040a1cSDave Hansen 		return -EINVAL;
7592d040a1cSDave Hansen 
7602d040a1cSDave Hansen 	if (ceiling < 0)
7612d040a1cSDave Hansen 		return -EINVAL;
7622d040a1cSDave Hansen 
7632d040a1cSDave Hansen 	tlb_single_page_flush_ceiling = ceiling;
7642d040a1cSDave Hansen 	return count;
7652d040a1cSDave Hansen }
7662d040a1cSDave Hansen 
7672d040a1cSDave Hansen static const struct file_operations fops_tlbflush = {
7682d040a1cSDave Hansen 	.read = tlbflush_read_file,
7692d040a1cSDave Hansen 	.write = tlbflush_write_file,
7702d040a1cSDave Hansen 	.llseek = default_llseek,
7712d040a1cSDave Hansen };
7722d040a1cSDave Hansen 
7732d040a1cSDave Hansen static int __init create_tlb_single_page_flush_ceiling(void)
7742d040a1cSDave Hansen {
7752d040a1cSDave Hansen 	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
7762d040a1cSDave Hansen 			    arch_debugfs_dir, NULL, &fops_tlbflush);
7772d040a1cSDave Hansen 	return 0;
7782d040a1cSDave Hansen }
7792d040a1cSDave Hansen late_initcall(create_tlb_single_page_flush_ceiling);
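/*
 * The file created above normally appears as
 * /sys/kernel/debug/x86/tlb_single_page_flush_ceiling (assuming debugfs is
 * mounted at /sys/kernel/debug).  Reading it returns the current ceiling;
 * writing a non-negative integer changes how many pages flush_tlb_mm_range()
 * will still flush one at a time before falling back to a full flush.
 */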
780