/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1991,1992  Linus Torvalds
 *
 * entry_32.S contains the system-call and low-level fault and trap handling routines.
 *
 * Stack layout while running C code:
 *	ptrace needs to have all registers on the stack.
 *	If the order here is changed, it needs to be
 *	updated in fork.c:copy_process(), signal.c:do_signal(),
 *	ptrace.c and ptrace.h
 *
 *	 0(%esp) - %ebx
 *	 4(%esp) - %ecx
 *	 8(%esp) - %edx
 *	 C(%esp) - %esi
 *	10(%esp) - %edi
 *	14(%esp) - %ebp
 *	18(%esp) - %eax
 *	1C(%esp) - %ds
 *	20(%esp) - %es
 *	24(%esp) - %fs
 *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
 *	2C(%esp) - orig_eax
 *	30(%esp) - %eip
 *	34(%esp) - %cs
 *	38(%esp) - %eflags
 *	3C(%esp) - %oldesp
 *	40(%esp) - %oldss
 */

#include <linux/linkage.h>
#include <linux/err.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#include "calling.h"

	.section .entry.text, "ax"

/*
 * We use macros for low-level operations which need to be overridden
 * for paravirtualization. The following will never clobber any registers:
 *   INTERRUPT_RETURN (aka. "iret")
 *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
 *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
 *
 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
 * Allowing a register to be clobbered can shrink the paravirt replacement
 * enough to patch inline, increasing performance.
 */

#ifdef CONFIG_PREEMPTION
# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
# define preempt_stop(clobbers)
#endif

.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)		# interrupts off?
	jz	1f
	TRACE_IRQS_ON
1:
#endif
.endm

#define PTI_SWITCH_MASK		(1 << PAGE_SHIFT)
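
/*
 * Illustrative sketch (not built): with PTI the kernel and user page
 * directories are expected to sit in one adjacent two-page allocation,
 * so the SWITCH_TO_*_CR3 macros below only toggle bit PAGE_SHIFT of the
 * CR3 value.  In C this would look roughly like:
 *
 *	unsigned long user_cr3(unsigned long kernel_cr3)
 *	{
 *		return kernel_cr3 | PTI_SWITCH_MASK;	// user PGD one page up
 *	}
 *
 *	unsigned long kernel_cr3(unsigned long any_cr3)
 *	{
 *		return any_cr3 & ~PTI_SWITCH_MASK;	// back to the kernel PGD
 *	}
 */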

/*
 * User gs save/restore
 *
 * %gs is used for userland TLS and kernel only uses it for stack
 * canary which is required to be at %gs:20 by gcc.  Read the comment
 * at the top of stackprotector.h for more info.
 *
 * Local labels 98 and 99 are used.
 */
#ifdef CONFIG_X86_32_LAZY_GS

 /* unfortunately push/pop can't be no-op */
.macro PUSH_GS
	pushl	$0
.endm
.macro POP_GS pop=0
	addl	$(4 + \pop), %esp
.endm
.macro POP_GS_EX
.endm

 /* all the rest are no-op */
.macro PTGS_TO_GS
.endm
.macro PTGS_TO_GS_EX
.endm
.macro GS_TO_REG reg
.endm
.macro REG_TO_PTGS reg
.endm
.macro SET_KERNEL_GS reg
.endm

#else	/* CONFIG_X86_32_LAZY_GS */

.macro PUSH_GS
	pushl	%gs
.endm

.macro POP_GS pop=0
98:	popl	%gs
  .if \pop <> 0
	add	$\pop, %esp
  .endif
.endm
.macro POP_GS_EX
.pushsection .fixup, "ax"
99:	movl	$0, (%esp)
	jmp	98b
.popsection
	_ASM_EXTABLE(98b, 99b)
.endm

.macro PTGS_TO_GS
98:	mov	PT_GS(%esp), %gs
.endm
.macro PTGS_TO_GS_EX
.pushsection .fixup, "ax"
99:	movl	$0, PT_GS(%esp)
	jmp	98b
.popsection
	_ASM_EXTABLE(98b, 99b)
.endm

.macro GS_TO_REG reg
	movl	%gs, \reg
.endm
.macro REG_TO_PTGS reg
	movl	\reg, PT_GS(%esp)
.endm
.macro SET_KERNEL_GS reg
	movl	$(__KERNEL_STACK_CANARY), \reg
	movl	\reg, %gs
.endm

#endif /* CONFIG_X86_32_LAZY_GS */

/* Unconditionally switch to user cr3 */
.macro SWITCH_TO_USER_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI

	movl	%cr3, \scratch_reg
	orl	$PTI_SWITCH_MASK, \scratch_reg
	movl	\scratch_reg, %cr3
.Lend_\@:
.endm

.macro BUG_IF_WRONG_CR3 no_user_check=0
#ifdef CONFIG_DEBUG_ENTRY
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	.if \no_user_check == 0
	/* coming from usermode? */
	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
	jz	.Lend_\@
	.endif
	/* On user-cr3? */
	movl	%cr3, %eax
	testl	$PTI_SWITCH_MASK, %eax
	jnz	.Lend_\@
	/* From userspace with kernel cr3 - BUG */
	ud2
.Lend_\@:
#endif
.endm

/*
 * Switch to kernel cr3 if not already loaded and return current cr3 in
 * \scratch_reg
 */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	movl	%cr3, \scratch_reg
	/* Test if we are already on kernel CR3 */
	testl	$PTI_SWITCH_MASK, \scratch_reg
	jz	.Lend_\@
	andl	$(~PTI_SWITCH_MASK), \scratch_reg
	movl	\scratch_reg, %cr3
	/* Return original CR3 in \scratch_reg */
	orl	$PTI_SWITCH_MASK, \scratch_reg
.Lend_\@:
.endm

#define CS_FROM_ENTRY_STACK	(1 << 31)
#define CS_FROM_USER_CR3	(1 << 30)
#define CS_FROM_KERNEL		(1 << 29)
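
/*
 * CS is only a 16-bit selector, so the upper bits of the saved CS slot in
 * pt_regs are free for the entry code to use as the flags above.  A rough,
 * illustrative-only C view of how such a marker is tested later:
 *
 *	u32 csword = *(u32 *)&regs->cs;		// cs + __csh read as one dword
 *	if (csword & CS_FROM_KERNEL)
 *		...				// frame was fixed up by FIXUP_FRAME
 *
 * FIXUP_FRAME clears the high bits first, so stale hardware values can
 * never satisfy these tests by accident.
 */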

.macro FIXUP_FRAME
	/*
	 * The high bits of the CS dword (__csh) are used for CS_FROM_*.
	 * Clear them in case hardware didn't do this for us.
	 */
	andl	$0x0000ffff, 3*4(%esp)

#ifdef CONFIG_VM86
	testl	$X86_EFLAGS_VM, 4*4(%esp)
	jnz	.Lfrom_usermode_no_fixup_\@
#endif
	testl	$SEGMENT_RPL_MASK, 3*4(%esp)
	jnz	.Lfrom_usermode_no_fixup_\@

	orl	$CS_FROM_KERNEL, 3*4(%esp)

	/*
	 * When we're here from kernel mode, the (exception) stack looks like:
	 *
	 *  5*4(%esp) - <previous context>
	 *  4*4(%esp) - flags
	 *  3*4(%esp) - cs
	 *  2*4(%esp) - ip
	 *  1*4(%esp) - orig_eax
	 *  0*4(%esp) - gs / function
	 *
	 * Let's build a 5 entry IRET frame after that, such that struct pt_regs
	 * is complete and in particular regs->sp is correct. This gives us
	 * the original 5 entries as gap:
	 *
	 * 12*4(%esp) - <previous context>
	 * 11*4(%esp) - gap / flags
	 * 10*4(%esp) - gap / cs
	 *  9*4(%esp) - gap / ip
	 *  8*4(%esp) - gap / orig_eax
	 *  7*4(%esp) - gap / gs / function
	 *  6*4(%esp) - ss
	 *  5*4(%esp) - sp
	 *  4*4(%esp) - flags
	 *  3*4(%esp) - cs
	 *  2*4(%esp) - ip
	 *  1*4(%esp) - orig_eax
	 *  0*4(%esp) - gs / function
	 */

	pushl	%ss		# ss
	pushl	%esp		# sp (points at ss)
	addl	$6*4, (%esp)	# point sp back at the previous context
	pushl	6*4(%esp)	# flags
	pushl	6*4(%esp)	# cs
	pushl	6*4(%esp)	# ip
	pushl	6*4(%esp)	# orig_eax
	pushl	6*4(%esp)	# gs / function
.Lfrom_usermode_no_fixup_\@:
.endm

.macro IRET_FRAME
	testl	$CS_FROM_KERNEL, 1*4(%esp)
	jz	.Lfinished_frame_\@

	/*
	 * Reconstruct the 3 entry IRET frame right after the (modified)
	 * regs->sp without lowering %esp in between, such that an NMI in the
	 * middle doesn't scribble our stack.
	 */
	pushl	%eax
	pushl	%ecx
	movl	5*4(%esp), %eax		# (modified) regs->sp

	movl	4*4(%esp), %ecx		# flags
	movl	%ecx, -4(%eax)

	movl	3*4(%esp), %ecx		# cs
	andl	$0x0000ffff, %ecx
	movl	%ecx, -8(%eax)

	movl	2*4(%esp), %ecx		# ip
	movl	%ecx, -12(%eax)

	movl	1*4(%esp), %ecx		# eax
	movl	%ecx, -16(%eax)

	popl	%ecx
	lea	-16(%eax), %esp
	popl	%eax
.Lfinished_frame_\@:
.endm

.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
	cld
.if \skip_gs == 0
	PUSH_GS
.endif
	FIXUP_FRAME
	pushl	%fs
	pushl	%es
	pushl	%ds
	pushl	\pt_regs_ax
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%edx
	pushl	%ecx
	pushl	%ebx
	movl	$(__USER_DS), %edx
	movl	%edx, %ds
	movl	%edx, %es
	movl	$(__KERNEL_PERCPU), %edx
	movl	%edx, %fs
.if \skip_gs == 0
	SET_KERNEL_GS %edx
.endif
	/* Switch to kernel stack if necessary */
.if \switch_stacks > 0
	SWITCH_TO_KERNEL_STACK
.endif
.endm

.macro SAVE_ALL_NMI cr3_reg:req
	SAVE_ALL

	BUG_IF_WRONG_CR3

	/*
	 * Now switch the CR3 when PTI is enabled.
	 *
	 * We can enter with either user or kernel cr3; the code will
	 * store the old cr3 in \cr3_reg and switch to the kernel cr3
	 * if necessary.
	 */
	SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg

.Lend_\@:
.endm

.macro RESTORE_INT_REGS
	popl	%ebx
	popl	%ecx
	popl	%edx
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%eax
.endm

.macro RESTORE_REGS pop=0
	RESTORE_INT_REGS
1:	popl	%ds
2:	popl	%es
3:	popl	%fs
	POP_GS \pop
.pushsection .fixup, "ax"
4:	movl	$0, (%esp)
	jmp	1b
5:	movl	$0, (%esp)
	jmp	2b
6:	movl	$0, (%esp)
	jmp	3b
.popsection
	_ASM_EXTABLE(1b, 4b)
	_ASM_EXTABLE(2b, 5b)
	_ASM_EXTABLE(3b, 6b)
	POP_GS_EX
.endm
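
/*
 * For reference, the push order in SAVE_ALL is what gives struct pt_regs
 * its layout.  A simplified (illustrative, not the authoritative header)
 * view of the resulting frame, from low to high address:
 *
 *	struct pt_regs_32_sketch {
 *		unsigned long bx, cx, dx, si, di, bp, ax;
 *		unsigned long ds, es, fs, gs;	// really 16-bit + padding
 *		unsigned long orig_ax, ip, cs, flags, sp, ss;
 *	};
 *
 * which is why ptrace, signal delivery and the PT_* offsets used throughout
 * this file all depend on the push order never changing.
 */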

.macro RESTORE_ALL_NMI cr3_reg:req pop=0
	/*
	 * Now switch the CR3 when PTI is enabled.
	 *
	 * We enter with kernel cr3 and switch the cr3 to the value
	 * stored on \cr3_reg, which is either a user or a kernel cr3.
	 */
	ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI

	testl	$PTI_SWITCH_MASK, \cr3_reg
	jz	.Lswitched_\@

	/* User cr3 in \cr3_reg - write it to hardware cr3 */
	movl	\cr3_reg, %cr3

.Lswitched_\@:

	BUG_IF_WRONG_CR3

	RESTORE_REGS pop=\pop
.endm

.macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)

	ALTERNATIVE	"jmp .Lend_\@", "", X86_BUG_ESPFIX

	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
	/*
	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
	 * are returning to the kernel.
	 * See comments in process.c:copy_thread() for details.
	 */
	movb	PT_OLDSS(%esp), %ah
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
	jne	.Lend_\@	# returning to user-space with LDT SS

	/*
	 * Setup and switch to ESPFIX stack
	 *
	 * We're returning to userspace with a 16 bit stack. The CPU will not
	 * restore the high word of ESP for us on executing iret... This is an
	 * "official" bug of all the x86-compatible CPUs, which we can work
	 * around to make dosemu and wine happy. We do this by preloading the
	 * high word of ESP with the high word of the userspace ESP while
	 * compensating for the offset by changing to the ESPFIX segment with
	 * a base address that accounts for the difference.
	 */
	mov	%esp, %edx			/* load kernel esp */
	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
	mov	%dx, %ax			/* eax: new kernel esp */
	sub	%eax, %edx			/* offset (low word is 0) */
	shr	$16, %edx
	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
	pushl	$__ESPFIX_SS
	pushl	%eax				/* new kernel esp */
	/*
	 * Disable interrupts, but do not irqtrace this section: we
	 * will soon execute iret and the tracer was already set to
	 * the irqstate after the IRET:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	lss	(%esp), %esp			/* switch to espfix segment */
.Lend_\@:
#endif /* CONFIG_X86_ESPFIX32 */
.endm
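
/*
 * Worked example of the ESPFIX arithmetic above (illustrative numbers):
 *
 *	kernel ESP = 0xc1234f00, user ESP = 0x0badf00d
 *	new ESP    = (user & 0xffff0000) | (kernel & 0xffff) = 0x0bad4f00
 *	GDT base   = kernel - new = 0xc1234f00 - 0x0bad4f00  = 0xb5760000
 *
 * After the "lss (%esp), %esp" the linear stack address is base + new ESP,
 * i.e. the original kernel ESP, while the ESP register already carries the
 * user's high word, so the 16-bit IRET can no longer corrupt it.
 */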

/*
 * Called with pt_regs fully populated and kernel segments loaded,
 * so we can access PER_CPU and use the integer registers.
 *
 * We need to be very careful here with the %esp switch, because an NMI
 * can happen everywhere. If the NMI handler finds itself on the
 * entry-stack, it will overwrite the task-stack and everything we
 * copied there. So allocate the stack-frame on the task-stack and
 * switch to it before we do any copying.
 */

.macro SWITCH_TO_KERNEL_STACK

	ALTERNATIVE	"", "jmp .Lend_\@", X86_FEATURE_XENPV

	BUG_IF_WRONG_CR3

	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax

	/*
	 * %eax now contains the entry cr3 and we carry it forward in
	 * that register for the time this macro runs
	 */

	/* Are we on the entry stack? Bail out if not! */
	movl	PER_CPU_VAR(cpu_entry_area), %ecx
	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
	subl	%esp, %ecx	/* ecx = (end of entry_stack) - esp */
	cmpl	$SIZEOF_entry_stack, %ecx
	jae	.Lend_\@

	/* Load stack pointer into %esi and %edi */
	movl	%esp, %esi
	movl	%esi, %edi

	/* Move %edi to the top of the entry stack */
	andl	$(MASK_entry_stack), %edi
	addl	$(SIZEOF_entry_stack), %edi

	/* Load top of task-stack into %edi */
	movl	TSS_entry2task_stack(%edi), %edi

	/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
	movl	PT_EFLAGS(%esp), %ecx		# mix EFLAGS and CS
	movb	PT_CS(%esp), %cl
	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
#else
	movl	PT_CS(%esp), %ecx
	andl	$SEGMENT_RPL_MASK, %ecx
#endif
	cmpl	$USER_RPL, %ecx
	jb	.Lentry_from_kernel_\@

	/* Bytes to copy */
	movl	$PTREGS_SIZE, %ecx

#ifdef CONFIG_VM86
	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esi)
	jz	.Lcopy_pt_regs_\@

	/*
	 * Stack-frame contains 4 additional segment registers when
	 * coming from VM86 mode
	 */
	addl	$(4 * 4), %ecx

#endif
.Lcopy_pt_regs_\@:

	/* Allocate frame on task-stack */
	subl	%ecx, %edi

	/* Switch to task-stack */
	movl	%edi, %esp

	/*
	 * We are now on the task-stack and can safely copy over the
	 * stack-frame
	 */
	shrl	$2, %ecx
	cld
	rep movsl

	jmp	.Lend_\@

.Lentry_from_kernel_\@:

	/*
	 * This handles the case when we enter the kernel from
	 * kernel-mode and %esp points to the entry-stack. When this
	 * happens we need to switch to the task-stack to run C code,
	 * but switch back to the entry-stack again when we approach
	 * iret and return to the interrupted code-path. This usually
	 * happens when we hit an exception while restoring user-space
	 * segment registers on the way back to user-space or when the
	 * sysenter handler runs with eflags.tf set.
	 *
	 * When we switch to the task-stack here, we can't trust the
	 * contents of the entry-stack anymore, as the exception handler
	 * might be scheduled out or moved to another CPU. Therefore we
	 * copy the complete entry-stack to the task-stack and set a
	 * marker in the iret-frame (bit 31 of the CS dword) to detect
	 * what we've done on the iret path.
	 *
	 * On the iret path we copy everything back and switch to the
	 * entry-stack, so that the interrupted kernel code-path
	 * continues on the same stack it was interrupted with.
	 *
	 * Be aware that an NMI can happen anytime in this code.
	 *
	 * %esi: Entry-Stack pointer (same as %esp)
	 * %edi: Top of the task stack
	 * %eax: CR3 on kernel entry
	 */

	/* Calculate number of bytes on the entry stack in %ecx */
	movl	%esi, %ecx

	/* %ecx to the top of entry-stack */
	andl	$(MASK_entry_stack), %ecx
	addl	$(SIZEOF_entry_stack), %ecx

	/* Number of bytes on the entry stack to %ecx */
	sub	%esi, %ecx

	/* Mark stackframe as coming from entry stack */
	orl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)

	/*
	 * Test the cr3 used to enter the kernel and add a marker
	 * so that we can switch back to it before iret.
	 */
	testl	$PTI_SWITCH_MASK, %eax
	jz	.Lcopy_pt_regs_\@
	orl	$CS_FROM_USER_CR3, PT_CS(%esp)

	/*
	 * %esi and %edi are unchanged, %ecx contains the number of
	 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
	 * the stack-frame on task-stack and copy everything over
	 */
	jmp	.Lcopy_pt_regs_\@

.Lend_\@:
.endm
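
/*
 * Rough C model of the copy SWITCH_TO_KERNEL_STACK performs for the common
 * from-user case (illustrative only, descriptive names rather than real
 * symbols):
 *
 *	size_t bytes = PTREGS_SIZE;		   // + 4*4 when coming from vm86
 *	void *dst    = top_of_task_stack - bytes;  // allocate on the task stack
 *	esp          = dst;			   // switch first, so an NMI lands
 *	memcpy(dst, entry_stack_ptr, bytes);	   // on the task stack, not here
 */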

/*
 * Switch back from the kernel stack to the entry stack.
 *
 * The %esp register must point to pt_regs on the task stack. It will
 * first calculate the size of the stack-frame to copy, depending on
 * whether we return to VM86 mode or not. With that it uses 'rep movsl'
 * to copy the contents of the stack over to the entry stack.
 *
 * We must be very careful here, as we can't trust the contents of the
 * task-stack once we switched to the entry-stack. When an NMI happens
 * while on the entry-stack, the NMI handler will switch back to the top
 * of the task stack, overwriting our stack-frame we are about to copy.
 * Therefore we switch the stack only after everything is copied over.
 */
.macro SWITCH_TO_ENTRY_STACK

	ALTERNATIVE	"", "jmp .Lend_\@", X86_FEATURE_XENPV

	/* Bytes to copy */
	movl	$PTREGS_SIZE, %ecx

#ifdef CONFIG_VM86
	testl	$(X86_EFLAGS_VM), PT_EFLAGS(%esp)
	jz	.Lcopy_pt_regs_\@

	/* Additional 4 registers to copy when returning to VM86 mode */
	addl	$(4 * 4), %ecx

.Lcopy_pt_regs_\@:
#endif

	/* Initialize source and destination for movsl */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
	subl	%ecx, %edi
	movl	%esp, %esi

	/* Save future stack pointer in %ebx */
	movl	%edi, %ebx

	/* Copy over the stack-frame */
	shrl	$2, %ecx
	cld
	rep movsl

	/*
	 * Switch to entry-stack - needs to happen after everything is
	 * copied because the NMI handler will overwrite the task-stack
	 * when on entry-stack
	 */
	movl	%ebx, %esp

.Lend_\@:
.endm

/*
 * This macro handles the case when we return to kernel-mode on the iret
 * path and have to switch back to the entry stack and/or user-cr3
 *
 * See the comments below the .Lentry_from_kernel_\@ label in the
 * SWITCH_TO_KERNEL_STACK macro for more details.
 */
.macro PARANOID_EXIT_TO_KERNEL_MODE

	/*
	 * Test if we entered the kernel with the entry-stack. Most
	 * likely we did not, because this code only runs on the
	 * return-to-kernel path.
	 */
	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
	jz	.Lend_\@

	/* Unlikely slow-path */

	/* Clear marker from stack-frame */
	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)

	/* Copy the remaining task-stack contents to entry-stack */
	movl	%esp, %esi
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi

	/* Bytes on the task-stack to ecx */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
	subl	%esi, %ecx

	/* Allocate stack-frame on entry-stack */
	subl	%ecx, %edi

	/*
	 * Save future stack-pointer, we must not switch until the
	 * copy is done, otherwise the NMI handler could destroy the
	 * contents of the task-stack we are about to copy.
	 */
	movl	%edi, %ebx

	/* Do the copy */
	shrl	$2, %ecx
	cld
	rep movsl

	/* Safe to switch to entry-stack now */
	movl	%ebx, %esp

	/*
	 * We came from entry-stack and need to check if we also need to
	 * switch back to user cr3.
	 */
	testl	$CS_FROM_USER_CR3, PT_CS(%esp)
	jz	.Lend_\@

	/* Clear marker from stack-frame */
	andl	$(~CS_FROM_USER_CR3), PT_CS(%esp)

	SWITCH_TO_USER_CR3 scratch_reg=%eax

.Lend_\@:
.endm
/*
 * %eax: prev task
 * %edx: next task
 */
ENTRY(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in struct inactive_task_frame
	 */
	pushl	%ebp
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	pushfl

	/* switch stack */
	movl	%esp, TASK_threadsp(%eax)
	movl	TASK_threadsp(%edx), %esp

#ifdef CONFIG_STACKPROTECTOR
	movl	TASK_stack_canary(%edx), %ebx
	movl	%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
#endif

#ifdef CONFIG_RETPOLINE
	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

	/* restore callee-saved registers */
	popfl
	popl	%esi
	popl	%edi
	popl	%ebx
	popl	%ebp

	jmp	__switch_to
END(__switch_to_asm)

/*
 * The unwinder expects the last frame on the stack to always be at the same
 * offset from the end of the page, which allows it to validate the stack.
 * Calling schedule_tail() directly would break that convention because it's an
 * asmlinkage function so its argument has to be pushed on the stack. This
 * wrapper creates a proper "end of stack" frame header before the call.
 */
ENTRY(schedule_tail_wrapper)
	FRAME_BEGIN

	pushl	%eax
	call	schedule_tail
	popl	%eax

	FRAME_END
	ret
ENDPROC(schedule_tail_wrapper)
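
/*
 * The pushes in __switch_to_asm are assumed to mirror struct
 * inactive_task_frame (switch_to.h); an illustrative 32-bit view:
 *
 *	struct inactive_task_frame_sketch {
 *		unsigned long flags;	// pushfl (last push, lowest address)
 *		unsigned long si;
 *		unsigned long di;
 *		unsigned long bx;
 *		unsigned long bp;	// first push
 *		unsigned long ret_addr;	// pushed by the call into this code
 *	};
 *
 * so the unwinder and copy_thread() can find these values on a sleeping
 * task's stack.
 */
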
/*
 * A newly forked process directly context switches into this address.
 *
 * eax: prev task we switched from
 * ebx: kernel thread func (NULL for user thread)
 * edi: kernel thread arg
 */
ENTRY(ret_from_fork)
	call	schedule_tail_wrapper

	testl	%ebx, %ebx
	jnz	1f		/* kernel threads are uncommon */

2:
	/* When we fork, we trace the syscall return in the child, too. */
	movl	%esp, %eax
	call	syscall_return_slowpath
	STACKLEAK_ERASE
	jmp	restore_all

	/* kernel thread */
1:	movl	%edi, %eax
	CALL_NOSPEC %ebx
	/*
	 * A kernel thread is allowed to return here after successfully
	 * calling do_execve().  Exit to userspace to complete the execve()
	 * syscall.
	 */
	movl	$0, PT_EAX(%esp)
	jmp	2b
END(ret_from_fork)
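
/*
 * Rough C equivalent of ret_from_fork above (illustrative only):
 *
 *	schedule_tail(prev);
 *	if (fn) {				// kernel thread
 *		fn(arg);			// returns only after a successful
 *		regs->ax = 0;			// do_execve(), see comment above
 *	}
 *	syscall_return_slowpath(regs);		// trace syscall return in the child
 *	// then the restore_all exit path below
 */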

/*
 * Return to user mode is not as complex as all this looks,
 * but we want the default path for a system call return to
 * go as quickly as possible which is why some of this is
 * less clear than it otherwise should be.
 */

	# userspace resumption stub bypassing syscall exit tracing
	ALIGN
ret_from_exception:
	preempt_stop(CLBR_ANY)
ret_from_intr:
#ifdef CONFIG_VM86
	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
	/*
	 * We can be coming here from child spawned by kernel_thread().
	 */
	movl	PT_CS(%esp), %eax
	andl	$SEGMENT_RPL_MASK, %eax
#endif
	cmpl	$USER_RPL, %eax
	jb	restore_all_kernel		# not returning to v8086 or userspace

ENTRY(resume_userspace)
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF
	movl	%esp, %eax
	call	prepare_exit_to_usermode
	jmp	restore_all
END(ret_from_exception)

GLOBAL(__begin_SYSENTER_singlestep_region)
/*
 * All code from here through __end_SYSENTER_singlestep_region is subject
 * to being single-stepped if a user program sets TF and executes SYSENTER.
 * There is absolutely nothing that we can do to prevent this from happening
 * (thanks Intel!).  To keep our handling of this situation as simple as
 * possible, we handle TF just like AC and NT, except that our #DB handler
 * will ignore all of the single-step traps generated in this range.
 */

#ifdef CONFIG_XEN_PV
/*
 * Xen doesn't set %esp to be precisely what the normal SYSENTER
 * entry point expects, so fix it up before using the normal path.
 */
ENTRY(xen_sysenter_target)
	addl	$5*4, %esp			/* remove xen-provided frame */
	jmp	.Lsysenter_past_esp
#endif

/*
 * 32-bit SYSENTER entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * if X86_FEATURE_SEP is available.  This is the preferred system call
 * entry on 32-bit systems.
 *
 * The SYSENTER instruction, in principle, should *only* occur in the
 * vDSO.  In practice, a small number of Android devices were shipped
 * with a copy of Bionic that inlined a SYSENTER instruction.  This
 * never happened in any of Google's Bionic versions -- it only happened
 * in a narrow range of Intel-provided versions.
 *
 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old EIP (!!!), ESP, or EFLAGS.
 *
 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
 * user and/or vm86 state), we explicitly disable the SYSENTER
 * instruction in vm86 mode by reprogramming the MSRs.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 */
ENTRY(entry_SYSENTER_32)
	/*
	 * On entry-stack with all userspace-regs live - save and
	 * restore eflags and %eax to use it as scratch-reg for the cr3
	 * switch.
	 */
	pushfl
	pushl	%eax
	BUG_IF_WRONG_CR3 no_user_check=1
	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
	popl	%eax
	popfl

	/* Stack empty again, switch to task stack */
	movl	TSS_entry2task_stack(%esp), %esp

.Lsysenter_past_esp:
	pushl	$__USER_DS		/* pt_regs->ss */
	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
	pushfl				/* pt_regs->flags (except IF = 0) */
	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
	pushl	$__USER_CS		/* pt_regs->cs */
	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
	pushl	%eax			/* pt_regs->orig_ax */
	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */

	/*
	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
	 * and TF ourselves.  To save a few cycles, we can check whether
	 * either was set instead of doing an unconditional popfl.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 *
	 * If TF is set, we will single-step all the way to here -- do_debug
	 * will ignore all the traps.  (Yes, this is slow, but so is
	 * single-stepping in general.  This allows us to avoid having
	 * more complicated code to handle the case where a user program
	 * forces us to single-step through the SYSENTER entry code.)
	 *
	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
	 * out-of-line as an optimization: NT is unlikely to be set in the
	 * majority of the cases and instead of polluting the I$ unnecessarily,
	 * we're keeping that code behind a branch which will predict as
	 * not-taken and therefore its instructions won't be fetched.
	 */
	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
	jnz	.Lsysenter_fix_flags
.Lsysenter_flags_fixed:

	/*
	 * User mode is traced as though IRQs are on, and SYSENTER
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_fast_syscall_32
	/* XEN PV guests always use IRET path */
	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV

	STACKLEAK_ERASE

/* Opportunistic SYSEXIT */
	TRACE_IRQS_ON			/* User mode traces as IRQs on. */

	/*
	 * Setup entry stack - we keep the pointer in %eax and do the
	 * switch after almost all user-state is restored.
	 */

	/* Load entry stack pointer and allocate frame for eflags/eax */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
	subl	$(2*4), %eax

	/* Copy eflags and eax to entry stack */
	movl	PT_EFLAGS(%esp), %edi
	movl	PT_EAX(%esp), %esi
	movl	%edi, (%eax)
	movl	%esi, 4(%eax)

	/* Restore user registers and segments */
	movl	PT_EIP(%esp), %edx	/* pt_regs->ip */
	movl	PT_OLDESP(%esp), %ecx	/* pt_regs->sp */
1:	mov	PT_FS(%esp), %fs
	PTGS_TO_GS

	popl	%ebx			/* pt_regs->bx */
	addl	$2*4, %esp		/* skip pt_regs->cx and pt_regs->dx */
	popl	%esi			/* pt_regs->si */
	popl	%edi			/* pt_regs->di */
	popl	%ebp			/* pt_regs->bp */

	/* Switch to entry stack */
	movl	%eax, %esp

	/* Now ready to switch the cr3 */
	SWITCH_TO_USER_CR3 scratch_reg=%eax

	/*
	 * Restore all flags except IF. (We restore IF separately because
	 * STI gives a one-instruction window in which we won't be interrupted,
	 * whereas POPF does not.)
	 */
	btrl	$X86_EFLAGS_IF_BIT, (%esp)
	BUG_IF_WRONG_CR3 no_user_check=1
	popfl
	popl	%eax

	/*
	 * Return back to the vDSO, which will pop ecx and edx.
	 * Don't bother with DS and ES (they already contain __USER_DS).
	 */
	sti
	sysexit

.pushsection .fixup, "ax"
2:	movl	$0, PT_FS(%esp)
	jmp	1b
.popsection
	_ASM_EXTABLE(1b, 2b)
	PTGS_TO_GS_EX

.Lsysenter_fix_flags:
	pushl	$X86_EFLAGS_FIXED
	popfl
	jmp	.Lsysenter_flags_fixed
GLOBAL(__end_SYSENTER_singlestep_region)
ENDPROC(entry_SYSENTER_32)
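
/*
 * For reference (illustrative summary, the SDM is authoritative): the
 * 32-bit SYSEXIT used above consumes
 *
 *	EDX -> user EIP		(loaded from pt_regs->ip in the code above)
 *	ECX -> user ESP		(loaded from pt_regs->sp)
 *	CS/SS derived from MSR_IA32_SYSENTER_CS
 *
 * which is why %ecx/%edx cannot be restored here and the vDSO's
 * __kernel_vsyscall pops them itself after SYSEXIT returns to it.
 */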

/*
 * 32-bit legacy system call entry.
 *
 * 32-bit x86 Linux system calls traditionally used the INT $0x80
 * instruction.  INT $0x80 lands here.
 *
 * This entry point can be used by any 32-bit program to perform
 * system calls.  Instances of INT $0x80 can be found inline in various
 * programs and libraries.  It is also used by the vDSO's
 * __kernel_vsyscall fallback for hardware that doesn't support a
 * faster entry method.  Restarted 32-bit system calls also fall back to
 * INT $0x80 regardless of what instruction was originally used to do
 * the system call.  (64-bit programs can use INT $0x80 as well, but
 * they can only run on 64-bit kernels and therefore land in
 * entry_INT80_compat.)
 *
 * This is considered a slow path.  It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6
 */
ENTRY(entry_INT80_32)
	ASM_CLAC
	pushl	%eax			/* pt_regs->orig_ax */

	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */

	/*
	 * User mode is traced as though IRQs are on, and the interrupt gate
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_int80_syscall_32
.Lsyscall_32_done:

	STACKLEAK_ERASE

restore_all:
	TRACE_IRQS_IRET
	SWITCH_TO_ENTRY_STACK
.Lrestore_all_notrace:
	CHECK_AND_APPLY_ESPFIX
.Lrestore_nocheck:
	/* Switch back to user CR3 */
	SWITCH_TO_USER_CR3 scratch_reg=%eax

	BUG_IF_WRONG_CR3

	/* Restore user state */
	RESTORE_REGS pop=4			# skip orig_eax/error_code
.Lirq_return:
	IRET_FRAME
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
	 * when returning from IPI handler and when returning from
	 * scheduler to user-space.
	 */
	INTERRUPT_RETURN

restore_all_kernel:
#ifdef CONFIG_PREEMPTION
	DISABLE_INTERRUPTS(CLBR_ANY)
	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	.Lno_preempt
	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
	jz	.Lno_preempt
	call	preempt_schedule_irq
.Lno_preempt:
#endif
	TRACE_IRQS_IRET
	PARANOID_EXIT_TO_KERNEL_MODE
	BUG_IF_WRONG_CR3
	RESTORE_REGS 4
	jmp	.Lirq_return

.section .fixup, "ax"
ENTRY(iret_exc	)
	pushl	$0				# no error code
	pushl	$do_iret_error

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * The stack-frame here is the one that iret faulted on, so it's a
	 * return-to-user frame.  We are on kernel-cr3 because we come here from
	 * the fixup code.  This confuses the CR3 checker, so switch to user-cr3
	 * as the checker expects it.
	 */
	pushl	%eax
	SWITCH_TO_USER_CR3 scratch_reg=%eax
	popl	%eax
#endif

	jmp	common_exception
.previous
	_ASM_EXTABLE(.Lirq_return, iret_exc)
ENDPROC(entry_INT80_32)
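
/*
 * Illustrative only - how a 32-bit process typically reaches
 * entry_INT80_32 directly (old libcs and the vDSO fallback):
 *
 *	long ret;
 *	asm volatile ("int $0x80"
 *		      : "=a" (ret)
 *		      : "a" (__NR_getpid));	// eax = nr, ebx..ebp = args
 *	// a return value in (-4096, 0) encodes -errno
 */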

.macro FIXUP_ESPFIX_STACK
/*
 * Switch back from the ESPFIX stack to the normal zero-based stack
 *
 * We can't call C functions using the ESPFIX stack. This code reads
 * the high word of the segment base from the GDT and switches to the
 * normal stack and adjusts ESP with the matching offset.
 */
#ifdef CONFIG_X86_ESPFIX32
	/* fixup the stack */
	mov	GDT_ESPFIX_SS + 4, %al	/* bits 16..23 */
	mov	GDT_ESPFIX_SS + 7, %ah	/* bits 24..31 */
	shl	$16, %eax
	addl	%esp, %eax		/* the adjusted stack pointer */
	pushl	$__KERNEL_DS
	pushl	%eax
	lss	(%esp), %esp		/* switch to the normal stack segment */
#endif
.endm
.macro UNWIND_ESPFIX_STACK
#ifdef CONFIG_X86_ESPFIX32
	movl	%ss, %eax
	/* see if on espfix stack */
	cmpw	$__ESPFIX_SS, %ax
	jne	27f
	movl	$__KERNEL_DS, %eax
	movl	%eax, %ds
	movl	%eax, %es
	/* switch to normal stack */
	FIXUP_ESPFIX_STACK
27:
#endif
.endm

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
	.align 8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)

#ifdef CONFIG_X86_LOCAL_APIC
	.align 8
ENTRY(spurious_entries_start)
    vector=FIRST_SYSTEM_VECTOR
    .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_spurious
	.align	8
    .endr
END(spurious_entries_start)

common_spurious:
	ASM_CLAC
	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
	SAVE_ALL switch_stacks=1
	ENCODE_FRAME_POINTER
	TRACE_IRQS_OFF
	movl	%esp, %eax
	call	smp_spurious_interrupt
	jmp	ret_from_intr
ENDPROC(common_spurious)
#endif
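
/*
 * Illustrative arithmetic for the stub encoding above: each stub pushes
 * ~vector + 0x80, which always fits in a sign-extended byte and keeps the
 * stub inside its 8-byte slot.  common_interrupt/common_spurious subtract
 * 0x80 again, so the handler sees orig_ax == ~vector and can recover the
 * vector number with something like:
 *
 *	unsigned int vector = ~regs->orig_ax;	// e.g. in do_IRQ()
 */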

/*
 * the CPU automatically disables interrupts when executing an IRQ vector,
 * so IRQ-flags tracing has to follow that:
 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */

	SAVE_ALL switch_stacks=1
	ENCODE_FRAME_POINTER
	TRACE_IRQS_OFF
	movl	%esp, %eax
	call	do_IRQ
	jmp	ret_from_intr
ENDPROC(common_interrupt)

#define BUILD_INTERRUPT3(name, nr, fn)			\
ENTRY(name)						\
	ASM_CLAC;					\
	pushl	$~(nr);					\
	SAVE_ALL switch_stacks=1;			\
	ENCODE_FRAME_POINTER;				\
	TRACE_IRQS_OFF					\
	movl	%esp, %eax;				\
	call	fn;					\
	jmp	ret_from_intr;				\
ENDPROC(name)

#define BUILD_INTERRUPT(name, nr)		\
	BUILD_INTERRUPT3(name, nr, smp_##name);	\

/* The include is where all of the SMP etc. interrupts come from */
#include <asm/entry_arch.h>

ENTRY(coprocessor_error)
	ASM_CLAC
	pushl	$0
	pushl	$do_coprocessor_error
	jmp	common_exception
END(coprocessor_error)

ENTRY(simd_coprocessor_error)
	ASM_CLAC
	pushl	$0
#ifdef CONFIG_X86_INVD_BUG
	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
	ALTERNATIVE "pushl	$do_general_protection",	\
		    "pushl	$do_simd_coprocessor_error",	\
		    X86_FEATURE_XMM
#else
	pushl	$do_simd_coprocessor_error
#endif
	jmp	common_exception
END(simd_coprocessor_error)

ENTRY(device_not_available)
	ASM_CLAC
	pushl	$-1				# mark this as an int
	pushl	$do_device_not_available
	jmp	common_exception
END(device_not_available)

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
	iret
	_ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
#endif

ENTRY(overflow)
	ASM_CLAC
	pushl	$0
	pushl	$do_overflow
	jmp	common_exception
END(overflow)

ENTRY(bounds)
	ASM_CLAC
	pushl	$0
	pushl	$do_bounds
	jmp	common_exception
END(bounds)

ENTRY(invalid_op)
	ASM_CLAC
	pushl	$0
	pushl	$do_invalid_op
	jmp	common_exception
END(invalid_op)

ENTRY(coprocessor_segment_overrun)
	ASM_CLAC
	pushl	$0
	pushl	$do_coprocessor_segment_overrun
	jmp	common_exception
END(coprocessor_segment_overrun)

ENTRY(invalid_TSS)
	ASM_CLAC
	pushl	$do_invalid_TSS
	jmp	common_exception
END(invalid_TSS)

ENTRY(segment_not_present)
	ASM_CLAC
	pushl	$do_segment_not_present
	jmp	common_exception
END(segment_not_present)

ENTRY(stack_segment)
	ASM_CLAC
	pushl	$do_stack_segment
	jmp	common_exception
END(stack_segment)

ENTRY(alignment_check)
	ASM_CLAC
	pushl	$do_alignment_check
	jmp	common_exception
END(alignment_check)

ENTRY(divide_error)
	ASM_CLAC
	pushl	$0				# no error code
	pushl	$do_divide_error
	jmp	common_exception
END(divide_error)

#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
	ASM_CLAC
	pushl	$0
	pushl	machine_check_vector
	jmp	common_exception
END(machine_check)
#endif

ENTRY(spurious_interrupt_bug)
	ASM_CLAC
	pushl	$0
	pushl	$do_spurious_interrupt_bug
	jmp	common_exception
END(spurious_interrupt_bug)

#ifdef CONFIG_XEN_PV
ENTRY(xen_hypervisor_callback)
	pushl	$-1				/* orig_ax = -1 => not a system call */
	SAVE_ALL
	ENCODE_FRAME_POINTER
	TRACE_IRQS_OFF

	/*
	 * Check to see if we got the event in the critical
	 * region in xen_iret_direct, after we've reenabled
	 * events and checked for pending events. This simulates
	 * iret instruction's behaviour where it delivers a
	 * pending interrupt when enabling interrupts:
	 */
	movl	PT_EIP(%esp), %eax
	cmpl	$xen_iret_start_crit, %eax
	jb	1f
	cmpl	$xen_iret_end_crit, %eax
	jae	1f

	jmp	xen_iret_crit_fixup

ENTRY(xen_do_upcall)
1:	mov	%esp, %eax
	call	xen_evtchn_do_upcall
#ifndef CONFIG_PREEMPTION
	call	xen_maybe_preempt_hcall
#endif
	jmp	ret_from_intr
ENDPROC(xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we fix up by reattempting the load, and zeroing the segment
 * register if the load fails.
 * Category 2 we fix up by jumping to do_iret_error. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by maintaining a status value in EAX.
 */
ENTRY(xen_failsafe_callback)
	pushl	%eax
	movl	$1, %eax
1:	mov	4(%esp), %ds
2:	mov	8(%esp), %es
3:	mov	12(%esp), %fs
4:	mov	16(%esp), %gs
	/* EAX == 0 => Category 1 (Bad segment)
	   EAX != 0 => Category 2 (Bad IRET) */
	testl	%eax, %eax
	popl	%eax
	lea	16(%esp), %esp
	jz	5f
	jmp	iret_exc
5:	pushl	$-1				/* orig_ax = -1 => not a system call */
	SAVE_ALL
	ENCODE_FRAME_POINTER
	jmp	ret_from_exception

.section .fixup, "ax"
6:	xorl	%eax, %eax
	movl	%eax, 4(%esp)
	jmp	1b
7:	xorl	%eax, %eax
	movl	%eax, 8(%esp)
	jmp	2b
8:	xorl	%eax, %eax
	movl	%eax, 12(%esp)
	jmp	3b
9:	xorl	%eax, %eax
	movl	%eax, 16(%esp)
	jmp	4b
.previous
	_ASM_EXTABLE(1b, 6b)
	_ASM_EXTABLE(2b, 7b)
	_ASM_EXTABLE(3b, 8b)
	_ASM_EXTABLE(4b, 9b)
ENDPROC(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */

#ifdef CONFIG_XEN_PVHVM
BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
		 xen_evtchn_do_upcall)
#endif


#if IS_ENABLED(CONFIG_HYPERV)

BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
		 hyperv_vector_handler)

BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
		 hyperv_reenlightenment_intr)

BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
		 hv_stimer0_vector_handler)

#endif /* CONFIG_HYPERV */

ENTRY(page_fault)
	ASM_CLAC
	pushl	$do_page_fault
	jmp	common_exception_read_cr2
END(page_fault)

common_exception_read_cr2:
	/* the function address is in %gs's slot on the stack */
	SAVE_ALL switch_stacks=1 skip_gs=1

	ENCODE_FRAME_POINTER
	UNWIND_ESPFIX_STACK

	/* fixup %gs */
	GS_TO_REG %ecx
	movl	PT_GS(%esp), %edi
	REG_TO_PTGS %ecx
	SET_KERNEL_GS %ecx

	GET_CR2_INTO(%ecx)			# might clobber %eax

	/* fixup orig %eax */
	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart

	TRACE_IRQS_OFF
	movl	%esp, %eax			# pt_regs pointer
	CALL_NOSPEC %edi
	jmp	ret_from_exception
END(common_exception_read_cr2)

common_exception:
	/* the function address is in %gs's slot on the stack */
	SAVE_ALL switch_stacks=1 skip_gs=1
	ENCODE_FRAME_POINTER
	UNWIND_ESPFIX_STACK

	/* fixup %gs */
	GS_TO_REG %ecx
	movl	PT_GS(%esp), %edi		# get the function address
	REG_TO_PTGS %ecx
	SET_KERNEL_GS %ecx

	/* fixup orig %eax */
	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart

	TRACE_IRQS_OFF
	movl	%esp, %eax			# pt_regs pointer
	CALL_NOSPEC %edi
	jmp	ret_from_exception
END(common_exception)

ENTRY(debug)
	/*
	 * Entry from sysenter is now handled in common_exception
	 */
	ASM_CLAC
	pushl	$-1				# mark this as an int
	pushl	$do_debug
	jmp	common_exception
END(debug)
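
/*
 * Rough C model of the "are we on the entry stack?" range check used below
 * and in SWITCH_TO_KERNEL_STACK (illustrative only, approximate names):
 *
 *	unsigned long end = this_cpu_entry_area + CPU_ENTRY_AREA_entry_stack
 *			    + SIZEOF_entry_stack;
 *	bool on_entry_stack = (end - esp) < SIZEOF_entry_stack;
 *
 * i.e. a single unsigned compare of %esp's distance from the end of the
 * per-CPU entry stack.
 */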

/*
 * NMI is doubly nasty. It can happen on the first instruction of
 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
 * switched stacks. We handle both conditions by simply checking whether we
 * interrupted kernel code running on the SYSENTER stack.
 */
ENTRY(nmi)
	ASM_CLAC

#ifdef CONFIG_X86_ESPFIX32
	pushl	%eax
	movl	%ss, %eax
	cmpw	$__ESPFIX_SS, %ax
	popl	%eax
	je	.Lnmi_espfix_stack
#endif

	pushl	%eax				# pt_regs->orig_ax
	SAVE_ALL_NMI cr3_reg=%edi
	ENCODE_FRAME_POINTER
	xorl	%edx, %edx			# zero error code
	movl	%esp, %eax			# pt_regs pointer

	/* Are we currently on the SYSENTER stack? */
	movl	PER_CPU_VAR(cpu_entry_area), %ecx
	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
	cmpl	$SIZEOF_entry_stack, %ecx
	jb	.Lnmi_from_sysenter_stack

	/* Not on SYSENTER stack. */
	call	do_nmi
	jmp	.Lnmi_return

.Lnmi_from_sysenter_stack:
	/*
	 * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
	 * is using the thread stack right now, so it's safe for us to use it.
	 */
	movl	%esp, %ebx
	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esp
	call	do_nmi
	movl	%ebx, %esp

.Lnmi_return:
	CHECK_AND_APPLY_ESPFIX
	RESTORE_ALL_NMI cr3_reg=%edi pop=4
	jmp	.Lirq_return

#ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack:
	/*
	 * create the pointer to lss back
	 */
	pushl	%ss
	pushl	%esp
	addl	$4, (%esp)
	/* copy the iret frame of 12 bytes */
	.rept 3
	pushl	16(%esp)
	.endr
	pushl	%eax
	SAVE_ALL_NMI cr3_reg=%edi
	ENCODE_FRAME_POINTER
	FIXUP_ESPFIX_STACK			# %eax == %esp
	xorl	%edx, %edx			# zero error code
	call	do_nmi
	RESTORE_ALL_NMI cr3_reg=%edi
	lss	12+4(%esp), %esp		# back to espfix stack
	jmp	.Lirq_return
#endif
END(nmi)

ENTRY(int3)
	ASM_CLAC
	pushl	$-1				# mark this as an int

	SAVE_ALL switch_stacks=1
	ENCODE_FRAME_POINTER
	TRACE_IRQS_OFF
	xorl	%edx, %edx			# zero error code
	movl	%esp, %eax			# pt_regs pointer
	call	do_int3
	jmp	ret_from_exception
END(int3)

ENTRY(general_protection)
	pushl	$do_general_protection
	jmp	common_exception
END(general_protection)

#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
	ASM_CLAC
	pushl	$do_async_page_fault
	jmp	common_exception_read_cr2
END(async_page_fault)
#endif

ENTRY(rewind_stack_do_exit)
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esi
	leal	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp

	call	do_exit
1:	jmp	1b
END(rewind_stack_do_exit)