// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* Check that the stack and regs on entry from user mode are sane. */
static void check_user_regs(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
		/*
		 * Make sure that the entry code gave us a sensible EFLAGS
		 * register.  Native because we want to check the actual CPU
		 * state, not the interrupt state as imagined by Xen.
		 */
		unsigned long flags = native_save_fl();
		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
				      X86_EFLAGS_NT));

		/* We think we came from user mode. Make sure pt_regs agrees. */
		WARN_ON_ONCE(!user_mode(regs));

		/*
		 * All entries from user mode (except #DF) should be on the
		 * normal thread stack and should have user pt_regs in the
		 * correct location.
		 */
		WARN_ON_ONCE(!on_thread_stack());
		WARN_ON_ONCE(regs != task_pt_regs(current));
	}
}

#ifdef CONFIG_CONTEXT_TRACKING
/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall entry disables interrupts, but user mode is traced as interrupts
 * enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static noinstr void enter_from_user_mode(void)
{
	enum ctx_state state = ct_state();

	lockdep_hardirqs_off(CALLER_ADDR0);
	user_exit_irqoff();

	instrumentation_begin();
	CT_WARN_ON(state != CONTEXT_USER);
	trace_hardirqs_off_finish();
	instrumentation_end();
}
#else
static __always_inline void enter_from_user_mode(void)
{
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}
#endif

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall exit enables interrupts, but the kernel state is interrupts
 * disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Clear CPU buffers if CPU is affected by MDS and the mitigation is on.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	mds_user_clear_cpu_buffers();
	lockdep_hardirqs_on(CALLER_ADDR0);
}
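
/*
 * Rough sketch (not a real entry point) of how the two helpers above are
 * paired by the entry paths in this file, e.g. do_syscall_64() below:
 *
 *	enter_from_user_mode();
 *	instrumentation_begin();
 *	... instrumentable work, IRQs may be enabled here ...
 *	instrumentation_end();
 *	exit_to_user_mode();
 */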

static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
	} else
#endif
	{
		audit_syscall_entry(regs->orig_ax, regs->bx,
				    regs->cx, regs->dx, regs->si);
	}
}

/*
 * Returns the syscall nr to run (which should match regs->orig_ax) or -1
 * to skip the syscall.
 */
static long syscall_trace_enter(struct pt_regs *regs)
{
	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;

	struct thread_info *ti = current_thread_info();
	unsigned long ret = 0;
	u32 work;

	work = READ_ONCE(ti->flags);

	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = tracehook_report_syscall_entry(regs);
		if (ret || (work & _TIF_SYSCALL_EMU))
			return -1L;
	}

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp after ptrace, to catch any tracer changes.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		ret = __secure_computing(&sd);
		if (ret == -1)
			return ret;
	}
#endif

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);

	do_audit_syscall_entry(regs, arch);

	return ret ?: regs->orig_ax;
}

#define EXIT_TO_USERMODE_LOOP_FLAGS				\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
	/*
	 * In order to return to user mode, we need to have IRQs off with
	 * none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
	 * can be set at any time on preemptible kernels if we have IRQs on,
	 * so we need to loop.  Disabling preemption wouldn't help: doing the
	 * work to clear some of the flags can sleep.
	 */
	while (true) {
		/* We have work to do. */
		local_irq_enable();

		if (cached_flags & _TIF_NEED_RESCHED)
			schedule();

		if (cached_flags & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (cached_flags & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		/* deal with pending signal delivery */
		if (cached_flags & _TIF_SIGPENDING)
			do_signal(regs);

		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
			fire_user_return_notifiers();

		/* Disable IRQs and retry */
		local_irq_disable();

		cached_flags = READ_ONCE(current_thread_info()->flags);

		if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
			break;
	}
}

static void __prepare_exit_to_usermode(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;

	addr_limit_user_check();

	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();

	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

	/* Reload ti->flags; we may have rescheduled above. */
	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & _TIF_IO_BITMAP))
		tss_update_io_bitmap();

	fpregs_assert_state_consistent();
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
	 * returning to user mode.  We need to clear it *after* signal
	 * handling, because syscall restart has a fixup for compat
	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
	 * selftest.
	 *
	 * We also need to clear TS_I386_REGS_POKED: the 32-bit tracer
	 * special case only applies after poking regs and before the
	 * very next return to user mode.
	 */
	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
}

__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
{
	instrumentation_begin();
	__prepare_exit_to_usermode(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#define SYSCALL_EXIT_WORK_FLAGS				\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
	bool step;

	audit_syscall_exit(regs);

	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, regs->ax);

	/*
	 * If TIF_SYSCALL_EMU is set, we only get here because of
	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
	 * We already reported this syscall instruction in
	 * syscall_trace_enter().
	 */
	step = unlikely(
		(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
		== _TIF_SINGLESTEP);
	if (step || cached_flags & _TIF_SYSCALL_TRACE)
		tracehook_report_syscall_exit(regs, step);
}

static void __syscall_return_slowpath(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags = READ_ONCE(ti->flags);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
		local_irq_enable();

	rseq_syscall(regs);

	/*
	 * First do one-time work.  If these work items are enabled, we
	 * want to run them exactly once per syscall exit with IRQs on.
	 */
	if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
		syscall_slow_exit_work(regs, cached_flags);

	local_irq_disable();
	__prepare_exit_to_usermode(regs);
}

/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 */
__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_return_slowpath(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;

	check_user_regs(regs);

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	ti = current_thread_info();
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
	} else if (likely((nr & __X32_SYSCALL_BIT) &&
			  (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
		nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
					X32_NR_syscalls);
		regs->ax = x32_sys_call_table[nr](regs);
#endif
	}
	__syscall_return_slowpath(regs);

	instrumentation_end();
	exit_to_user_mode();
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
 * all entry and exit work and returns with IRQs off.  This function is
 * extremely hot in workloads that use it, and it's usually called from
 * do_fast_syscall_32, so forcibly inline it to improve performance.
 */
static void do_syscall_32_irqs_on(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
	ti->status |= TS_COMPAT;
#endif

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
		/*
		 * Subtlety here: if ptrace pokes something larger than
		 * 2^32-1 into orig_ax, this truncates it.  This may or
		 * may not be necessary, but it matches the old asm
		 * behavior.
		 */
		nr = syscall_trace_enter(regs);
	}

	if (likely(nr < IA32_NR_syscalls)) {
		nr = array_index_nospec(nr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[nr](regs);
	}

	__syscall_return_slowpath(regs);
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	check_user_regs(regs);

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	do_syscall_32_irqs_on(regs);

	instrumentation_end();
	exit_to_user_mode();
}

static bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int res;

	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;
		local_irq_disable();
		__prepare_exit_to_usermode(regs);
		return false;
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;
	bool success;

	check_user_regs(regs);

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	success = __do_fast_syscall_32(regs);

	instrumentation_end();
	exit_to_user_mode();

	/* If it failed, keep it simple: use IRET. */
	if (!success)
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif

SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

/**
 * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
 *			     RCU handling
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes:
 *  - lockdep irqflag state tracking as low level ASM entry disabled
 *    interrupts.
 *
 *  - Context tracking if the exception hit user mode.
 *
 *  - The hardirq tracer to keep the state consistent as low level ASM
 *    entry disabled interrupts.
 *
 * For kernel mode entries RCU handling is done conditionally. If RCU is
 * watching then the only RCU requirement is to check whether the tick has
 * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
 * invoked on entry and rcu_irq_exit() on exit.
 *
 * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
 * solves the problem of kernel mode pagefaults which can schedule, which
 * is not possible after invoking rcu_irq_enter() without undoing it.
 *
 * For user mode entries enter_from_user_mode() must be invoked to
 * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
 * would not be possible.
 *
 * Returns: True if RCU has been adjusted on a kernel entry
 *	    False otherwise
 *
 * The return value must be fed into the rcu_exit argument of
 * idtentry_exit_cond_rcu().
 */
bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
{
	if (user_mode(regs)) {
		check_user_regs(regs);
		enter_from_user_mode();
		return false;
	}

	/*
	 * If this entry hit the idle task invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for __rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		return true;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	/* Use the combo lockdep/tracing function */
	trace_hardirqs_off();
	instrumentation_end();

	return false;
}

static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
{
	if (may_sched && !preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
	/* Covers both tracing and lockdep */
	trace_hardirqs_on();
}

/**
 * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
 *			    handling
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @rcu_exit:	Invoke rcu_irq_exit() if true
 *
 * Depending on the return target (kernel/user) this runs the necessary
 * preemption and work checks if possible and required and returns to
 * the caller with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
 * function must be fed into the @rcu_exit argument.
 */
void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		prepare_exit_to_usermode(regs);
	} else if (regs->flags & X86_EFLAGS_IF) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (rcu_exit) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION));
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (rcu_exit)
			rcu_irq_exit();
	}
}
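
/*
 * Illustrative sketch (not a real handler) of how the two functions above
 * pair up in an idtentry handler; xen_pv_evtchn_do_upcall() at the end of
 * this file follows the same pattern:
 *
 *	bool rcu_exit = idtentry_enter_cond_rcu(regs);
 *
 *	instrumentation_begin();
 *	... handle the exception or interrupt ...
 *	instrumentation_end();
 *
 *	idtentry_exit_cond_rcu(regs, rcu_exit);
 */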

/**
 * idtentry_enter_user - Handle state tracking on idtentry from user mode
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes enter_from_user_mode() to establish the proper context for
 * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
 */
void noinstr idtentry_enter_user(struct pt_regs *regs)
{
	check_user_regs(regs);
	enter_from_user_mode();
}

/**
 * idtentry_exit_user - Handle return from exception to user mode
 * @regs:	Pointer to pt_regs (exception entry regs)
 *
 * Runs the necessary preemption and work checks and returns to the caller
 * with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to idtentry_enter_user().
 */
void noinstr idtentry_exit_user(struct pt_regs *regs)
{
	lockdep_assert_irqs_disabled();

	prepare_exit_to_usermode(regs);
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(void)
{
	irq_enter_rcu();
	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	irq_exit_rcu();
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs;
	bool inhcall, rcu_exit;

	rcu_exit = idtentry_enter_cond_rcu(regs);
	old_regs = set_irq_regs(regs);

	instrumentation_begin();
	run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs);
	instrumentation_end();

	set_irq_regs(old_regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
		instrumentation_begin();
		idtentry_exit_cond_resched(regs, true);
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		idtentry_exit_cond_rcu(regs, rcu_exit);
	}
}
#endif /* CONFIG_XEN_PV */