// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
	arch_check_user_regs(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

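/*
 * Illustrative sketch, not part of this file: an architecture's
 * syscall entry path is expected to drive the helpers below roughly
 * like this, loosely modeled on the x86 variant. invoke_syscall() is
 * a placeholder for the arch specific dispatch through the syscall
 * table:
 *
 *	nr = syscall_enter_from_user_mode(regs, nr);
 *	if (nr >= 0 && nr < NR_syscalls)
 *		invoke_syscall(regs, nr);
 *	syscall_exit_to_user_mode(regs);
 */
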
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_syscall_enter_tracehook(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	__enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
	__exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }

static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
	if (ti_work & _TIF_NOTIFY_SIGNAL)
		tracehook_notify_signal();

	arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}

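/*
 * Conceptually the work loop below is:
 *
 *	while (ti_work & EXIT_TO_USER_MODE_WORK) {
 *		enable interrupts, handle each pending item,
 *		disable interrupts, reread ->flags;
 *	}
 *
 * The flags are reread with interrupts disabled, so work raised by an
 * interrupt while the handlers ran is either observed by the next loop
 * iteration or, if it arrives after the final check, handled on the
 * next entry from user space.
 */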
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			handle_signal_work(regs, ti_work);

		if (ti_work & _TIF_NOTIFY_RESUME) {
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();
		ti_work = READ_ONCE(current_thread_info()->flags);
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

	lockdep_assert_irqs_disabled();

	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_syscall_exit_tracehook(regs, step);
}

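/*
 * The SYSCALL_WORK_* bits evaluated above live in
 * current_thread_info()->syscall_work and are armed by the individual
 * subsystems. Illustrative sketch (see ptrace_resume() for the
 * authoritative version): ptrace flips the trace bit of a tracee
 * roughly like this:
 *
 *	if (request == PTRACE_SYSCALL)
 *		set_task_syscall_work(child, SYSCALL_TRACE);
 *	else
 *		clear_task_syscall_work(child, SYSCALL_TRACE);
 */
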
/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

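/*
 * Illustrative sketch, not part of this file: architecture interrupt
 * and exception handlers are expected to bracket their work with the
 * irqentry pair below, roughly as the x86 idtentry code does:
 *
 *	irqentry_state_t state = irqentry_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_the_exception(regs);
 *	instrumentation_end();
 *
 *	irqentry_exit(regs, state);
 *
 * handle_the_exception() is a placeholder for the actual handler.
 */
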
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task, invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

void irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();
		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			rcu_irq_exit();
	}
}

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	rcu_nmi_enter();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	}
	instrumentation_end();

	rcu_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}

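/*
 * Illustrative sketch, not part of this file: the NMI pair above is
 * used the same way, e.g. by the x86 NMI entry:
 *
 *	irqentry_state_t state = irqentry_nmi_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_the_nmi(regs);
 *	instrumentation_end();
 *
 *	irqentry_nmi_exit(regs, state);
 *
 * handle_the_nmi() is a placeholder for the actual handler.
 */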