// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
        arch_check_user_regs(regs);
        lockdep_hardirqs_off(CALLER_ADDR0);

        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                                unsigned long work)
{
        long ret = 0;

        /*
         * Handle Syscall User Dispatch. This must come first, since
         * the ABI here can be something that doesn't make sense for
         * other syscall_work features.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (syscall_user_dispatch(regs))
                        return -1L;
        }

        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = arch_syscall_enter_tracehook(regs);
                if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (work & SYSCALL_WORK_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

        return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

        if (work & SYSCALL_WORK_ENTER)
                syscall = syscall_trace_enter(regs, syscall, work);

        return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        __enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = __syscall_enter_from_user_work(regs, syscall);
        instrumentation_end();

        return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}
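
/*
 * Illustrative sketch only, not taken from a real architecture: the combined
 * syscall_enter_from_user_mode() helper covers the common case, while the
 * syscall_enter_from_user_mode_prepare() / syscall_enter_from_user_mode_work()
 * split lets an architecture do its own work with interrupts enabled (e.g.
 * fetch syscall arguments from user memory) before the entry work runs.
 * The function and field names below are placeholders:
 *
 *      noinstr void arch_do_fast_syscall(struct pt_regs *regs)
 *      {
 *              long nr;
 *
 *              syscall_enter_from_user_mode_prepare(regs);
 *
 *              instrumentation_begin();
 *              arch_fetch_user_args(regs);
 *              nr = syscall_enter_from_user_mode_work(regs, regs->syscall_nr);
 *              if (nr >= 0)
 *                      arch_invoke_syscall(regs, nr);
 *              instrumentation_end();
 *
 *              // Exit side pairs via syscall_exit_to_user_mode(); see the
 *              // sketch after that function further down.
 *      }
 */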

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        instrumentation_end();

        user_enter_irqoff();
        arch_exit_to_user_mode();
        lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
        __exit_to_user_mode();
}
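
/*
 * Illustrative note, not architecture code: enter_from_user_mode() and
 * exit_to_user_mode() are the raw pair for architecture code that still
 * runs its own exit work loop during gradual conversion to the generic
 * entry code. Both must be called with interrupts disabled; a hypothetical
 * arch path would bracket its own work like this:
 *
 *      enter_from_user_mode(regs);
 *      // arch specific entry handling and exit work, IRQs off on return
 *      exit_to_user_mode();
 */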

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }

static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
        if (ti_work & _TIF_NOTIFY_SIGNAL)
                tracehook_notify_signal();

        arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                            unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
                        handle_signal_work(regs, ti_work);

                if (ti_work & _TIF_NOTIFY_RESUME) {
                        tracehook_notify_resume(regs);
                        rseq_handle_notify_resume(NULL, regs);
                }

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();

                /* Check if any of the above work has queued a deferred wakeup */
                rcu_nocb_flush_deferred_wakeup();

                ti_work = READ_ONCE(current_thread_info()->flags);
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

        lockdep_assert_irqs_disabled();

        /* Flush pending rcuog wakeup before the last need_resched() check */
        rcu_nocb_flush_deferred_wakeup();

        if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                ti_work = exit_to_user_mode_loop(regs, ti_work);

        arch_exit_to_user_mode_prepare(regs, ti_work);

        /* Ensure that the address limit is intact and no locks are held */
        addr_limit_user_check();
        kmap_assert_nomap();
        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
        if (work & SYSCALL_WORK_SYSCALL_EMU)
                return false;

        return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
        bool step;

        /*
         * If the syscall was rolled back due to syscall user dispatching,
         * then the tracers below are not invoked, for the same reason the
         * entry side was not invoked in syscall_trace_enter(): the ABI
         * of these syscalls is unknown.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (unlikely(current->syscall_dispatch.on_dispatch)) {
                        current->syscall_dispatch.on_dispatch = false;
                        return;
                }
        }

        audit_syscall_exit(regs);

        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(work);
        if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
                arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(work & SYSCALL_WORK_EXIT))
                syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        __syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        __syscall_exit_to_user_mode_work(regs);
        instrumentation_end();
        __exit_to_user_mode();
}
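
/*
 * Illustrative sketch only (placeholder names, not a real architecture):
 * the common syscall round trip pairs syscall_enter_from_user_mode() with
 * syscall_exit_to_user_mode(), with the actual dispatch running inside an
 * instrumentation_begin()/instrumentation_end() section:
 *
 *      __visible noinstr void arch_do_syscall(struct pt_regs *regs)
 *      {
 *              long nr = syscall_enter_from_user_mode(regs, regs->syscall_nr);
 *
 *              instrumentation_begin();
 *              if (nr >= 0 && nr < NR_syscalls)
 *                      regs->retval = arch_sys_call_table[nr](regs);
 *              instrumentation_end();
 *
 *              syscall_exit_to_user_mode(regs);
 *      }
 */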

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        __exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task, invoke rcu_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for rcu_is_watching() here would prevent the nesting
         * interrupt from invoking rcu_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke rcu_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when TINY_RCU is enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                rcu_irq_enter();
                instrumentation_begin();
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        instrumentation_begin();
        rcu_irq_enter_check_tick();
        trace_hardirqs_off_finish();
        instrumentation_end();

        return ret;
}

void irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}
#ifdef CONFIG_PREEMPT_DYNAMIC
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
                        instrumentation_end();
                        rcu_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION)) {
#ifdef CONFIG_PREEMPT_DYNAMIC
                        static_call(irqentry_exit_cond_resched)();
#else
                        irqentry_exit_cond_resched();
#endif
                }
                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        rcu_irq_exit();
        }
}
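
/*
 * Illustrative sketch only: an architecture's interrupt/exception entry
 * point typically brackets its handler with irqentry_enter()/irqentry_exit()
 * and runs the instrumentable part in between. Names are placeholders:
 *
 *      __visible noinstr void arch_handle_device_irq(struct pt_regs *regs)
 *      {
 *              irqentry_state_t state = irqentry_enter(regs);
 *
 *              instrumentation_begin();
 *              arch_dispatch_irq(regs);
 *              instrumentation_end();
 *
 *              irqentry_exit(regs, state);
 *      }
 */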

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
        irqentry_state_t irq_state;

        irq_state.lockdep = lockdep_hardirqs_enabled();

        __nmi_enter();
        lockdep_hardirqs_off(CALLER_ADDR0);
        lockdep_hardirq_enter();
        rcu_nmi_enter();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        ftrace_nmi_enter();
        instrumentation_end();

        return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
        instrumentation_begin();
        ftrace_nmi_exit();
        if (irq_state.lockdep) {
                trace_hardirqs_on_prepare();
                lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        }
        instrumentation_end();

        rcu_nmi_exit();
        lockdep_hardirq_exit();
        if (irq_state.lockdep)
                lockdep_hardirqs_on(CALLER_ADDR0);
        __nmi_exit();
}
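
/*
 * Illustrative sketch only: NMI-like entries use the dedicated pair above,
 * since they can interrupt almost any context. Names are placeholders:
 *
 *      __visible noinstr void arch_handle_nmi(struct pt_regs *regs)
 *      {
 *              irqentry_state_t state = irqentry_nmi_enter(regs);
 *
 *              instrumentation_begin();
 *              arch_do_nmi(regs);
 *              instrumentation_end();
 *
 *              irqentry_nmi_exit(regs, state);
 *      }
 */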