xref: /openbmc/linux/kernel/entry/common.c (revision 355f841a)
// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
	arch_check_user_regs(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

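/*
 * Hand the syscall over to the audit subsystem: report the syscall
 * number and the first four arguments, but only when an audit context
 * is active for the current task.
 */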
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

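/*
 * Run the syscall entry work in a fixed order: syscall user dispatch,
 * ptrace reporting, seccomp, the sys_enter tracepoint and finally audit.
 * A return value of -1L tells the caller that the syscall should be
 * skipped; otherwise the (possibly changed) syscall number is returned.
 */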
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch.  This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

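/*
 * Illustrative sketch of how an architecture's C-level syscall path is
 * expected to use this helper (the entry point and dispatch table names
 * below are hypothetical, they are not defined in this file):
 *
 *	arch_syscall_entry(regs, nr)
 *		nr = syscall_enter_from_user_mode(regs, nr);
 *		if (nr >= 0 && nr < NR_syscalls)
 *			invoke sys_call_table[nr]
 *		syscall_exit_to_user_mode(regs);
 */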
noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	__enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
	__exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

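/*
 * Process the pending exit work with interrupts enabled around the
 * individual handlers. Interrupts are disabled again before the flags
 * are re-read, so the loop returns with interrupts disabled and with
 * the final work state for the caller.
 */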
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

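/*
 * Last preparation step before returning to user space: flush pending
 * rcuog wakeups, run the exit work loop if anything is pending, give the
 * architecture its final hook and assert that the address limit is
 * intact, no kmaps are active and no locks are held.
 */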
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work = read_thread_flags();

	lockdep_assert_irqs_disabled();

	/* Flush pending rcuog wakeup before the last need_resched() check */
	tick_nohz_user_enter_prepare();

	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

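/*
 * Syscall exit slow path: audit, the sys_exit tracepoint and the ptrace
 * exit report, in that order. All of it is skipped when the syscall was
 * blocked by syscall user dispatch.
 */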
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

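/*
 * Variant without the noinstr bookkeeping: the caller is responsible for
 * instrumentation_begin()/instrumentation_end() and for the final
 * exit_to_user_mode() transition, as syscall_exit_to_user_mode() does
 * below.
 */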
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

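/*
 * Establish kernel context for an interrupt or exception. The returned
 * state records whether rcu_irq_enter() had to be invoked (exit_rcu), so
 * that irqentry_exit() can undo it on the way out.
 */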
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task, invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

void irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
#endif
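/*
 * With CONFIG_PREEMPT_DYNAMIC the preemption point above is routed
 * through a static call so that the selected preemption model can
 * enable or disable it at runtime.
 */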

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION)) {
#ifdef CONFIG_PREEMPT_DYNAMIC
			static_call(irqentry_exit_cond_resched)();
#else
			irqentry_exit_cond_resched();
#endif
		}
		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			rcu_irq_exit();
	}
}

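/*
 * NMI and NMI-like entries cannot use the regular irqentry state:
 * lockdep's view of the interrupted context is saved in
 * irq_state.lockdep and restored on exit, and RCU is always informed
 * via rcu_nmi_enter()/rcu_nmi_exit().
 */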
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	rcu_nmi_enter();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	}
	instrumentation_end();

	rcu_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}
471