// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static __always_inline void enter_from_user_mode(struct pt_regs *regs)
{
	arch_check_user_regs(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long ti_work)
{
	long ret = 0;

	/* Handle ptrace */
	if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = arch_syscall_enter_tracehook(regs);
		if (ret || (ti_work & _TIF_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (ti_work & _TIF_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	/* The above might have changed the syscall number */
	return ret ? : syscall_get_nr(current, regs);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	unsigned long ti_work;

	enter_from_user_mode(regs);
	instrumentation_begin();

	local_irq_enable();
	ti_work = READ_ONCE(current_thread_info()->flags);
	if (ti_work & SYSCALL_ENTER_WORK)
		syscall = syscall_trace_enter(regs, syscall, ti_work);
	instrumentation_end();

	return syscall;
}

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but the kernel state is
 * interrupts disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }
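
/*
 * Illustrative sketch only, not part of this file: an architecture
 * overriding the weak stub above would typically fetch the pending
 * signal and either deliver it or restore the saved sigmask. The
 * handle_signal() helper named here is hypothetical; each architecture
 * provides its own signal frame setup.
 *
 *	void arch_do_signal(struct pt_regs *regs)
 *	{
 *		struct ksignal ksig;
 *
 *		if (get_signal(&ksig)) {
 *			handle_signal(&ksig, regs);	// hypothetical arch frame setup
 *			return;
 *		}
 *		restore_saved_sigmask();
 *	}
 */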
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & _TIF_SIGPENDING)
			arch_do_signal(regs);

		if (ti_work & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();
		ti_work = READ_ONCE(current_thread_info()->flags);
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

	lockdep_assert_irqs_disabled();

	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

#ifndef _TIF_SINGLESTEP
static inline bool report_single_step(unsigned long ti_work)
{
	return false;
}
#else
/*
 * If TIF_SYSCALL_EMU is set, then the only reason to report is when
 * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). The syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
#define SYSEMU_STEP	(_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)

static inline bool report_single_step(unsigned long ti_work)
{
	return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
}
#endif

static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
{
	bool step;

	audit_syscall_exit(regs);

	if (ti_work & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(ti_work);
	if (step || ti_work & _TIF_SYSCALL_TRACE)
		arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	u32 cached_flags = READ_ONCE(current_thread_info()->flags);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
		syscall_exit_work(regs, cached_flags);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}
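
/*
 * Sketch of how an architecture typically wires the two helpers above
 * into its syscall path. The names do_syscall(), orig_ax, ax and
 * sys_call_table are illustrative assumptions borrowed from x86-like
 * conventions, not part of the generic API:
 *
 *	__visible noinstr void do_syscall(struct pt_regs *regs)
 *	{
 *		// May return -1 if a tracer aborted or emulated the syscall
 *		long nr = syscall_enter_from_user_mode(regs, regs->orig_ax);
 *
 *		if (nr >= 0 && nr < NR_syscalls)
 *			regs->ax = sys_call_table[nr](regs);
 *
 *		syscall_exit_to_user_mode(regs);
 *	}
 */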
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	/* Use the combo lockdep/tracing function */
	trace_hardirqs_off();
	instrumentation_end();

	return ret;
}

void irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}
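
/*
 * Sketch of a hypothetical architecture interrupt handler (naming varies
 * per architecture): the state returned by irqentry_enter() must be
 * handed back unmodified to the matching irqentry_exit() below, so that
 * an entry which had to turn RCU on via rcu_irq_enter() also turns it
 * back off on the way out:
 *
 *	noinstr void arch_handle_irq(struct pt_regs *regs)
 *	{
 *		irqentry_state_t state = irqentry_enter(regs);
 *
 *		instrumentation_begin();
 *		dispatch_irq(regs);	// hypothetical arch dispatch
 *		instrumentation_end();
 *
 *		irqentry_exit(regs, state);
 *	}
 */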
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();
		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			rcu_irq_exit();
	}
}
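
/*
 * Summary of the three return paths in irqentry_exit() above
 * (descriptive only, mirrors the code):
 *
 *	to user mode			- full exit work via
 *					  irqentry_exit_to_user_mode()
 *	to kernel, IRQs enabled		- optional preemption, then the
 *	on entry			  careful tracing/lockdep/RCU
 *					  enable sequence
 *	to kernel, IRQs disabled	- flags state is already correct;
 *	on entry			  only rcu_irq_exit() if this entry
 *					  had to enable RCU
 */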