// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static __always_inline void enter_from_user_mode(struct pt_regs *regs)
{
	arch_check_user_regs(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long ti_work)
{
	long ret = 0;

	/* Handle ptrace */
	if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = arch_syscall_enter_tracehook(regs);
		if (ret || (ti_work & _TIF_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (ti_work & _TIF_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long ti_work;

	ti_work = READ_ONCE(current_thread_info()->flags);
	if (ti_work & SYSCALL_ENTER_WORK)
		syscall = syscall_trace_enter(regs, syscall, ti_work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}
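
/*
 * Illustrative sketch, not part of this file: architecture syscall entry
 * glue is expected to wire the enter/exit helpers together roughly like
 * this, where sys_call_table and regs->ax stand in for the architecture's
 * own dispatch table and return value register:
 *
 *	nr = syscall_enter_from_user_mode(regs, nr);
 *	if (likely(nr < NR_syscalls))
 *		regs->ax = sys_call_table[nr](regs);
 *	syscall_exit_to_user_mode(regs);
 */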

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but the kernel state is
 * interrupts disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & _TIF_SIGPENDING)
			arch_do_signal(regs);

		if (ti_work & _TIF_NOTIFY_RESUME) {
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();
		ti_work = READ_ONCE(current_thread_info()->flags);
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

	lockdep_assert_irqs_disabled();

	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

#ifndef _TIF_SINGLESTEP
static inline bool report_single_step(unsigned long ti_work)
{
	return false;
}
#else
/*
 * If TIF_SYSCALL_EMU is set, then the only reason to report is when
 * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
#define SYSEMU_STEP	(_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)

static inline bool report_single_step(unsigned long ti_work)
{
	return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
}
#endif

static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
{
	bool step;

	audit_syscall_exit(regs);

	if (ti_work & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(ti_work);
	if (step || ti_work & _TIF_SYSCALL_TRACE)
		arch_syscall_exit_tracehook(regs, step);
}
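
/*
 * Illustrative note (assumption, not taken from this file): the
 * SYSCALL_EXIT_WORK mask tested below is composed in
 * <linux/entry-common.h> from the TIF bits that syscall_exit_work()
 * acts on, along the lines of:
 *
 *	#define SYSCALL_EXIT_WORK					\
 *		(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |		\
 *		 _TIF_SYSCALL_TRACEPOINT | _TIF_SINGLESTEP)
 *
 * so a single flags test covers audit, tracepoints, ptrace and
 * single-step reporting in one branch.
 */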

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	u32 cached_flags = READ_ONCE(current_thread_info()->flags);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
		syscall_exit_work(regs, cached_flags);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}
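
/*
 * Illustrative sketch, not part of this file: an architecture interrupt
 * handler brackets its body with the entry/exit pair defined below and
 * threads the returned state cookie through. handle_the_irq() is a
 * hypothetical handler:
 *
 *	irqentry_state_t state = irqentry_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_the_irq(regs);
 *	instrumentation_end();
 *
 *	irqentry_exit(regs, state);
 */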

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task, invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	/* Use the combo lockdep/tracing function */
	trace_hardirqs_off();
	instrumentation_end();

	return ret;
}

void irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();
		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			rcu_irq_exit();
	}
}
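
/*
 * Illustrative summary, derived from the code above: the exit_rcu cookie
 * pairs the RCU transitions across a kernel-mode entry which interrupted
 * an idle task while RCU was not watching:
 *
 *	irqentry_enter():	rcu_irq_enter();  state.exit_rcu = true;
 *	irqentry_exit():	if (state.exit_rcu) rcu_irq_exit();
 *
 * Entries where RCU was already watching never set the cookie, so the
 * exit path leaves the RCU state untouched.
 */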