// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

#ifdef CONFIG_CONTEXT_TRACKING
/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall entry disables interrupts, but user mode is traced as interrupts
 * enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
__visible noinstr void enter_from_user_mode(void)
{
	enum ctx_state state = ct_state();

	lockdep_hardirqs_off(CALLER_ADDR0);
	user_exit_irqoff();

	instrumentation_begin();
	CT_WARN_ON(state != CONTEXT_USER);
	trace_hardirqs_off_prepare();
	instrumentation_end();
}
#else
static __always_inline void enter_from_user_mode(void)
{
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	trace_hardirqs_off_prepare();
	instrumentation_end();
}
#endif

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall exit enables interrupts, but the kernel state is interrupts
 * disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Clear CPU buffers if CPU is affected by MDS and the mitigation is on.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	mds_user_clear_cpu_buffers();
	lockdep_hardirqs_on(CALLER_ADDR0);
}
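
/*
 * Illustrative sketch only (some_entry_point() and do_work() below are made
 * up, not part of this file): the two helpers above are meant to bracket a
 * noinstr entry point, with all instrumentable work placed between
 * instrumentation_begin() and instrumentation_end():
 *
 *	__visible noinstr void some_entry_point(struct pt_regs *regs)
 *	{
 *		enter_from_user_mode();
 *		instrumentation_begin();
 *
 *		do_work(regs);
 *
 *		instrumentation_end();
 *		exit_to_user_mode();
 *	}
 *
 * do_syscall_64() and do_int80_syscall_32() below follow exactly this
 * pattern.
 */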

static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
	} else
#endif
	{
		audit_syscall_entry(regs->orig_ax, regs->bx,
				    regs->cx, regs->dx, regs->si);
	}
}

/*
 * Returns the syscall nr to run (which should match regs->orig_ax) or -1
 * to skip the syscall.
 */
static long syscall_trace_enter(struct pt_regs *regs)
{
	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;

	struct thread_info *ti = current_thread_info();
	unsigned long ret = 0;
	u32 work;

	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
		BUG_ON(regs != task_pt_regs(current));

	work = READ_ONCE(ti->flags);

	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
		ret = tracehook_report_syscall_entry(regs);
		if (ret || (work & _TIF_SYSCALL_EMU))
			return -1L;
	}

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp after ptrace, to catch any tracer changes.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		ret = __secure_computing(&sd);
		if (ret == -1)
			return ret;
	}
#endif

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);

	do_audit_syscall_entry(regs, arch);

	return ret ?: regs->orig_ax;
}

#define EXIT_TO_USERMODE_LOOP_FLAGS				\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
	/*
	 * In order to return to user mode, we need to have IRQs off with
	 * none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
	 * can be set at any time on preemptible kernels if we have IRQs on,
	 * so we need to loop.  Disabling preemption wouldn't help: doing the
	 * work to clear some of the flags can sleep.
	 */
	while (true) {
		/* We have work to do. */
		local_irq_enable();

		if (cached_flags & _TIF_NEED_RESCHED)
			schedule();

		if (cached_flags & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (cached_flags & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		/* deal with pending signal delivery */
		if (cached_flags & _TIF_SIGPENDING)
			do_signal(regs);

		if (cached_flags & _TIF_NOTIFY_RESUME) {
			clear_thread_flag(TIF_NOTIFY_RESUME);
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
			fire_user_return_notifiers();

		/* Disable IRQs and retry */
		local_irq_disable();

		cached_flags = READ_ONCE(current_thread_info()->flags);

		if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
			break;
	}
}

static void __prepare_exit_to_usermode(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;

	addr_limit_user_check();

	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();

	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
		exit_to_usermode_loop(regs, cached_flags);

	/* Reload ti->flags; we may have rescheduled above. */
	cached_flags = READ_ONCE(ti->flags);

	if (unlikely(cached_flags & _TIF_IO_BITMAP))
		tss_update_io_bitmap();

	fpregs_assert_state_consistent();
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();

#ifdef CONFIG_COMPAT
	/*
	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
	 * returning to user mode.  We need to clear it *after* signal
	 * handling, because syscall restart has a fixup for compat
	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
	 * selftest.
	 *
	 * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
	 * special case only applies after poking regs and before the
	 * very next return to user mode.
	 */
	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
}

__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
{
	instrumentation_begin();
	__prepare_exit_to_usermode(regs);
	instrumentation_end();
	exit_to_user_mode();
}

#define SYSCALL_EXIT_WORK_FLAGS				\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |	\
	 _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
	bool step;

	audit_syscall_exit(regs);

	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, regs->ax);

	/*
	 * If TIF_SYSCALL_EMU is set, we only get here because of
	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
	 * We already reported this syscall instruction in
	 * syscall_trace_enter().
	 */
	step = unlikely(
		(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
		== _TIF_SINGLESTEP);
	if (step || cached_flags & _TIF_SYSCALL_TRACE)
		tracehook_report_syscall_exit(regs, step);
}

static void __syscall_return_slowpath(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	u32 cached_flags = READ_ONCE(ti->flags);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
		local_irq_enable();

	rseq_syscall(regs);

	/*
	 * First do one-time work.  If these work items are enabled, we
	 * want to run them exactly once per syscall exit with IRQs on.
	 */
	if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
		syscall_slow_exit_work(regs, cached_flags);

	local_irq_disable();
	__prepare_exit_to_usermode(regs);
}

/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 */
__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_return_slowpath(regs);
	instrumentation_end();
	exit_to_user_mode();
}
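
/*
 * For orientation only (illustrative values, not used by the code): on
 * x86-64 the syscall number arrives in regs->orig_ax and the six arguments
 * in di, si, dx, r10, r8 and r9, matching the seccomp_data filling in
 * syscall_trace_enter() above.  A plain write(fd, buf, count) thus
 * dispatches as
 *
 *	regs->ax = sys_call_table[__NR_write](regs);
 *
 * with regs->di = fd, regs->si = buf and regs->dx = count.
 */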

#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	ti = current_thread_info();
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
	} else if (likely((nr & __X32_SYSCALL_BIT) &&
			  (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
		nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
					X32_NR_syscalls);
		regs->ax = x32_sys_call_table[nr](regs);
#endif
	}
	__syscall_return_slowpath(regs);

	instrumentation_end();
	exit_to_user_mode();
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
 * all entry and exit work and returns with IRQs off.  This function is
 * extremely hot in workloads that use it, and it's usually called from
 * do_fast_syscall_32, so forcibly inline it to improve performance.
 */
static void do_syscall_32_irqs_on(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
	ti->status |= TS_COMPAT;
#endif

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
		/*
		 * Subtlety here: if ptrace pokes something larger than
		 * 2^32-1 into orig_ax, this truncates it.  This may or
		 * may not be necessary, but it matches the old asm
		 * behavior.
		 */
		nr = syscall_trace_enter(regs);
	}

	if (likely(nr < IA32_NR_syscalls)) {
		nr = array_index_nospec(nr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[nr](regs);
	}

	__syscall_return_slowpath(regs);
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	do_syscall_32_irqs_on(regs);

	instrumentation_end();
	exit_to_user_mode();
}

static bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int res;

	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
				 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
			       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;
		local_irq_disable();
		__prepare_exit_to_usermode(regs);
		return false;
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
		vdso_image_32.sym_int80_landing_pad;
	bool success;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	enter_from_user_mode();
	instrumentation_begin();

	local_irq_enable();
	success = __do_fast_syscall_32(regs);

	instrumentation_end();
	exit_to_user_mode();

	/* If it failed, keep it simple: use IRET. */
	if (!success)
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
#endif

SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

/**
 * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
 *			     RCU handling
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes:
 *  - lockdep irqflag state tracking as low level ASM entry disabled
 *    interrupts.
 *
 *  - Context tracking if the exception hit user mode.
 *
 *  - The hardirq tracer to keep the state consistent as low level ASM
 *    entry disabled interrupts.
 *
 * For kernel mode entries RCU handling is done conditionally. If RCU is
 * watching then the only RCU requirement is to check whether the tick has
 * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
 * invoked on entry and rcu_irq_exit() on exit.
 *
 * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
 * solves the problem of kernel mode pagefaults which can schedule, which
 * is not possible after invoking rcu_irq_enter() without undoing it.
 *
 * For user mode entries enter_from_user_mode() must be invoked to
 * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
 * would not be possible.
 *
 * Returns: True if RCU has been adjusted on a kernel entry
 *	    False otherwise
 *
 * The return value must be fed into the rcu_exit argument of
 * idtentry_exit_cond_rcu().
 */
bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
{
	if (user_mode(regs)) {
		enter_from_user_mode();
		return false;
	}

	if (!__rcu_is_watching()) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in enter_from_user_mode().
		 *
		 * This only happens for IRQs that hit the idle
		 * loop, i.e. if idle is not using MWAIT.
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_prepare();
		instrumentation_end();

		return true;
	}

	/*
	 * If RCU is watching then RCU only wants to check
	 * whether it needs to restart the tick in NOHZ
	 * mode.
	 */
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	/* Use the combo lockdep/tracing function */
	trace_hardirqs_off();
	instrumentation_end();

	return false;
}

static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
{
	if (may_sched && !preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
	/* Covers both tracing and lockdep */
	trace_hardirqs_on();
}

/**
 * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
 *			    handling
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @rcu_exit:	Invoke rcu_irq_exit() if true
 *
 * Depending on the return target (kernel/user) this runs the necessary
 * preemption and work checks if possible and required and returns to
 * the caller with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
 * function must be fed into the @rcu_exit argument.
 */
void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		prepare_exit_to_usermode(regs);
	} else if (regs->flags & X86_EFLAGS_IF) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (rcu_exit) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION));
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (rcu_exit)
			rcu_irq_exit();
	}
}
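
/*
 * Usage sketch only (some_idt_entry() and handle_it() are made up): the two
 * helpers above are meant to be paired, with the return value of the entry
 * side fed back into the exit side:
 *
 *	__visible noinstr void some_idt_entry(struct pt_regs *regs)
 *	{
 *		bool rcu_exit = idtentry_enter_cond_rcu(regs);
 *
 *		instrumentation_begin();
 *		handle_it(regs);
 *		instrumentation_end();
 *
 *		idtentry_exit_cond_rcu(regs, rcu_exit);
 *	}
 *
 * xen_pv_evtchn_do_upcall() below follows this pattern, with an extra
 * twist for preemptible hypercalls.
 */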

/**
 * idtentry_enter_user - Handle state tracking on idtentry from user mode
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes enter_from_user_mode() to establish the proper context for
 * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
 */
void noinstr idtentry_enter_user(struct pt_regs *regs)
{
	enter_from_user_mode();
}

/**
 * idtentry_exit_user - Handle return from exception to user mode
 * @regs:	Pointer to pt_regs (exception entry regs)
 *
 * Runs the necessary preemption and work checks and returns to the caller
 * with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to idtentry_enter_user().
 */
void noinstr idtentry_exit_user(struct pt_regs *regs)
{
	lockdep_assert_irqs_disabled();

	prepare_exit_to_usermode(regs);
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(void)
{
	irq_enter_rcu();
	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	irq_exit_rcu();
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs;
	bool inhcall, rcu_exit;

	rcu_exit = idtentry_enter_cond_rcu(regs);
	old_regs = set_irq_regs(regs);

	instrumentation_begin();
	run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs);
	instrumentation_end();

	set_irq_regs(old_regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
		instrumentation_begin();
		idtentry_exit_cond_resched(regs, true);
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		idtentry_exit_cond_rcu(regs, rcu_exit);
	}
}
#endif /* CONFIG_XEN_PV */