xref: /openbmc/linux/arch/x86/entry/common.c (revision a2cce7a9)
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 * GPL v2
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>

#include <asm/desc.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible void enter_from_user_mode(void)
{
	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit();
}
#endif
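
/*
 * Background note (general context-tracking behavior, not spelled out in
 * this file): the user_exit() call above tells the context-tracking code
 * that the CPU is leaving user mode, which is what allows RCU to treat
 * user execution as an extended quiescent state on nohz_full kernels.
 */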

static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
	} else
#endif
	{
		audit_syscall_entry(regs->orig_ax, regs->bx,
				    regs->cx, regs->dx, regs->si);
	}
}
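
/*
 * Note on the argument registers above: audit_syscall_entry() takes the
 * syscall nr plus only the first four arguments.  The full six-argument
 * register conventions for the two ABIs (the same mapping the seccomp
 * code below spells out in full) are:
 *
 *	arg:		1	2	3	4	5	6
 *	64-bit ABI:	di	si	dx	r10	r8	r9
 *	32-bit ABI:	bx	cx	dx	si	di	bp
 */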

/*
 * We can return 0 to resume the syscall or anything else to go to phase
 * 2.  If we resume the syscall, we need to put something appropriate in
 * regs->orig_ax.
 *
 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
 * are fully functional.
 *
 * For phase 2's benefit, our return value is:
 * 0:			resume the syscall
 * 1:			go to phase 2; no seccomp phase 2 needed
 * anything else:	go to phase 2; pass return value to seccomp
 */
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
	unsigned long ret = 0;
	u32 work;

	BUG_ON(regs != task_pt_regs(current));

	work = ACCESS_ONCE(current_thread_info()->flags) &
		_TIF_WORK_SYSCALL_ENTRY;

#ifdef CONFIG_CONTEXT_TRACKING
	/*
	 * If TIF_NOHZ is set, we are required to call user_exit() before
	 * doing anything that could touch RCU.
	 */
	if (work & _TIF_NOHZ) {
		enter_from_user_mode();
		work &= ~_TIF_NOHZ;
	}
#endif

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp first -- it should minimize exposure of other
	 * code, and keeping seccomp fast is probably more valuable
	 * than the rest of this.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);

		ret = seccomp_phase1(&sd);
		if (ret == SECCOMP_PHASE1_SKIP) {
			regs->orig_ax = -1;
			ret = 0;
		} else if (ret != SECCOMP_PHASE1_OK) {
			return ret;  /* Go directly to phase 2 */
		}

		work &= ~_TIF_SECCOMP;
	}
#endif

	/* Do our best to finish without phase 2. */
	if (work == 0)
		return ret;  /* seccomp and/or nohz only (ret == 0 here) */

#ifdef CONFIG_AUDITSYSCALL
	if (work == _TIF_SYSCALL_AUDIT) {
		/*
		 * If there is no more work to be done except auditing,
		 * then audit in phase 1.  Phase 2 always audits, so, if
		 * we audit here, then we can't go on to phase 2.
		 */
		do_audit_syscall_entry(regs, arch);
		return 0;
	}
#endif

	return 1;  /* Something is enabled that we can't handle in phase 1 */
}
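
/*
 * A minimal usage sketch of the two-phase protocol (this mirrors what
 * syscall_trace_enter() below actually does):
 *
 *	unsigned long p1 = syscall_trace_enter_phase1(regs, arch);
 *	if (p1 == 0)
 *		return regs->orig_ax;	(fast path: resume the syscall)
 *	return syscall_trace_enter_phase2(regs, arch, p1);
 */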

/* Returns the syscall nr to run (which should match regs->orig_ax). */
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
				unsigned long phase1_result)
{
	long ret = 0;
	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
		_TIF_WORK_SYSCALL_ENTRY;

	BUG_ON(regs != task_pt_regs(current));

	/*
	 * If we stepped into a sysenter/syscall insn, it trapped in
	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
	 * If user-mode had set TF itself, then it's still clear from
	 * do_debug() and we need to set it again to restore the user
	 * state.  If we entered on the slow path, TF was already set.
	 */
	if (work & _TIF_SINGLESTEP)
		regs->flags |= X86_EFLAGS_TF;

#ifdef CONFIG_SECCOMP
	/*
	 * Call seccomp_phase2 before running the other hooks so that
	 * they can see any changes made by a seccomp tracer.
	 */
	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
		/* seccomp failures shouldn't expose any additional code. */
		return -1;
	}
#endif

	if (unlikely(work & _TIF_SYSCALL_EMU))
		ret = -1L;

	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
	    tracehook_report_syscall_entry(regs))
		ret = -1L;

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);

	do_audit_syscall_entry(regs, arch);

	return ret ?: regs->orig_ax;
}

long syscall_trace_enter(struct pt_regs *regs)
{
	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);

	if (phase1_result == 0)
		return regs->orig_ax;
	else
		return syscall_trace_enter_phase2(regs, arch, phase1_result);
}

static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
{
	unsigned long top_of_stack =
		(unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
	return (struct thread_info *)(top_of_stack - THREAD_SIZE);
}
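
/*
 * Stack layout that the computation above relies on (highest address at
 * the top; the kernel stack grows downward):
 *
 *	+------------------------------+  <- top_of_stack
 *	| TOP_OF_KERNEL_STACK_PADDING  |
 *	+------------------------------+  <- regs + 1
 *	| struct pt_regs               |
 *	+------------------------------+  <- regs
 *	| kernel stack (grows down)    |
 *	+------------------------------+
 *	| struct thread_info           |
 *	+------------------------------+  <- top_of_stack - THREAD_SIZE
 */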

/* Called with IRQs disabled. */
__visible void prepare_exit_to_usermode(struct pt_regs *regs)
{
	if (WARN_ON(!irqs_disabled()))
		local_irq_disable();

	/*
	 * In order to return to user mode, we need to have IRQs off with
	 * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
	 * _TIF_UPROBE, or _TIF_NEED_RESCHED set.  Several of these flags
	 * can be set at any time on preemptible kernels if we have IRQs on,
	 * so we need to loop.  Disabling preemption wouldn't help: doing the
	 * work to clear some of the flags can sleep.
	 */
	while (true) {
		u32 cached_flags =
			READ_ONCE(pt_regs_to_thread_info(regs)->flags);

		if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
				      _TIF_UPROBE | _TIF_NEED_RESCHED |
				      _TIF_USER_RETURN_NOTIFY)))
			break;

		/* We have work to do. */
		local_irq_enable();

		if (cached_flags & _TIF_NEED_RESCHED)
			schedule();

		if (cached_flags & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		/* deal with pending signal delivery */
		if (cached_flags & _TIF_SIGPENDING)
			do_signal(regs);

		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
		}

		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
			fire_user_return_notifiers();

		/* Disable IRQs and retry */
		local_irq_disable();
	}

	user_enter();
}
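
/*
 * Illustrative sketch (an assumption about the API declared in
 * <linux/user-return-notifier.h>, not code from this file) of how the
 * _TIF_USER_RETURN_NOTIFY work handled above gets armed: a subsystem
 * (KVM is the main user) registers a callback, and
 * fire_user_return_notifiers() then runs it on the way back to user
 * mode.
 *
 *	static void my_on_user_return(struct user_return_notifier *urn)
 *	{
 *		(restore per-cpu state that was borrowed from user mode)
 *	}
 *
 *	static struct user_return_notifier my_urn = {
 *		.on_user_return = my_on_user_return,
 *	};
 *
 *	user_return_notifier_register(&my_urn);
 */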

/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 */
__visible void syscall_return_slowpath(struct pt_regs *regs)
{
	struct thread_info *ti = pt_regs_to_thread_info(regs);
	u32 cached_flags = READ_ONCE(ti->flags);
	bool step;

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
		 regs->orig_ax))
		local_irq_enable();

	/*
	 * First do one-time work.  If these work items are enabled, we
	 * want to run them exactly once per syscall exit with IRQs on.
	 */
	if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
			    _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
		audit_syscall_exit(regs);

		if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
			trace_sys_exit(regs, regs->ax);

		/*
		 * If TIF_SYSCALL_EMU is set, we only get here because of
		 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
		 * We already reported this syscall instruction in
		 * syscall_trace_enter().
		 */
		step = unlikely(
			(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
			== _TIF_SINGLESTEP);
		if (step || cached_flags & _TIF_SYSCALL_TRACE)
			tracehook_report_syscall_exit(regs, step);
	}

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
	 * returning to user mode.
	 */
	ti->status &= ~TS_COMPAT;
#endif

	local_irq_disable();
	prepare_exit_to_usermode(regs);
}
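
/*
 * Big picture (a rough sketch, inferred from how the functions above fit
 * together rather than from the entry asm itself): on a traced or
 * otherwise slow-path syscall, the entry asm calls syscall_trace_enter()
 * before dispatching the syscall, and the exit path calls
 * syscall_return_slowpath(), which does the one-time exit work and then
 * funnels into prepare_exit_to_usermode() for the final IRQs-off return
 * to user mode.
 */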
319