voidsyscall_init(void){wrmsr(MSR_STAR,0, (__USER32_CS <<16) | __KERNEL_CS);wrmsrl(MSR_LSTAR, (unsignedlong)entry_SYSCALL_64);#ifdefCONFIG_IA32_EMULATIONwrmsrl(MSR_CSTAR, (unsignedlong)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);wrmsrl_safe(MSR_IA32_SYSENTER_ESP,0ULL);wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);#elsewrmsrl(MSR_CSTAR, (unsignedlong)ignore_sysret);wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);wrmsrl_safe(MSR_IA32_SYSENTER_ESP,0ULL);wrmsrl_safe(MSR_IA32_SYSENTER_EIP,0ULL);#endif /* Flags to clear on syscall */wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);}
/*
 * entry_SYSCALL_64 - 64-bit SYSCALL entry point (the address that
 * syscall_init() writes into MSR_LSTAR).
 *
 * Syntax: GAS / AT&T, preprocessed by cpp.
 *
 * Register state consumed by this excerpt:
 *   %rax - system call number (index into sys_call_table)
 *   %rcx - saved as pt_regs->ip
 *   %r11 - saved as pt_regs->flags
 *   %rdi, %rsi, %rdx, %r10, %r8, %r9 - system call arguments
 *   %rsp - user stack pointer (parked in rsp_scratch)
 *
 * The code builds a struct pt_regs on the kernel stack, dispatches through
 * sys_call_table on the fast path, and returns with SYSRET64 when no exit
 * work is pending.  The slow path and the second "1:" target are not part
 * of this excerpt.
 */
ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/* KPTI: entering kernel mode requires switching to the kernel page tables. */
	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(entry_SYSCALL_64_after_swapgs)

	/* Save the user stack pointer in the per-CPU variable rsp_scratch. */
	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	/* Load the kernel stack pointer. */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	TRACE_IRQS_OFF

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	/* Reserve (but do not fill) the slots for r12-r15, rbp and rbx. */
	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	/*
	 * If we need to do entry work or if we guess we'll need to do
	 * exit work, go straight to the slow path.
	 */
	movq	PER_CPU_VAR(current_task), %r11
	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
	jnz	entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
	/*
	 * Easy case: enable interrupts and issue the syscall. If the syscall
	 * needs pt_regs, we'll call a stub that disables interrupts again
	 * and jumps to the slow path.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	/*
	 * Make sure the system call number does not exceed the maximum;
	 * if it does, jump forward to local label 1 and return.
	 */
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	/*
	 * Ordinary (non-syscall) calls pass their fourth argument in rcx,
	 * so copy r10 into rcx before calling the handler.
	 */
	movq	%r10, %rcx

	/*
	 * This call instruction is handled specially in stub_ptregs_64.
	 * It might end up jumping to the slow path. If it jumps, RAX
	 * and all argument registers are clobbered.
	 */
	/* Call the matching handler from the system call table. */
	call	*sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:

	/*
	 * Store the handler's return value into the saved RAX slot of
	 * pt_regs, from which it is restored on the way out.
	 */
	movq	%rax, RAX(%rsp)
1:

	/*
	 * If we get here, then we know that pt_regs is clean for SYSRET64.
	 * If we see that no exit work is required (which we are required
	 * to check with IRQs off), then we can go straight to SYSRET64.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movq	PER_CPU_VAR(current_task), %r11
	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
	jnz	1f				/* target lies beyond this excerpt */

	/*
	 * The implementation of this macro depends on the
	 * CONFIG_DEBUG_LOCK_ALLOC kernel option, which allows lock
	 * debugging when leaving a system call.
	 */
	LOCKDEP_SYS_EXIT
	TRACE_IRQS_ON				/* user mode is traced as IRQs on */
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	/*
	 * Restore the saved registers other than rcx and r11: rcx now holds
	 * the return address of the application that issued the system call
	 * and r11 holds its old flags register (both loaded just above).
	 */
	RESTORE_C_REGS_EXCEPT_RCX_R11

	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel. This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs. Normal interrupts are OK because
	 * they are off here.
	 */
	/* KPTI: returning to user mode requires switching back to the user page tables. */
	SWITCH_USER_CR3

	/* Restore rsp to the user stack top from the value saved in pt_regs. */
	movq	RSP(%rsp), %rsp

	/*
	 * USERGS_SYSRET64 expands to a swapgs (swapping the user and kernel
	 * GS) followed by sysret, which leaves the system call handler.
	 */
	USERGS_SYSRET64

	/* ................ (the remainder of the entry code is not shown in this excerpt) */
/* arch/x86/entry/entry_64.S */

/*
 * swapgs_restore_regs_and_return_to_usermode - final leg of the return
 * to user mode.
 *
 * Syntax: GAS / AT&T, preprocessed by cpp.
 *
 * On entry %rsp points at a saved register frame.  The code:
 *   1. pops every saved register except RDI (POP_REGS pop_rdi=0),
 *   2. keeps the old stack pointer in %rdi and switches %rsp to the
 *      trampoline stack taken from cpu_tss_rw + TSS_sp0,
 *   3. copies the five-word IRET frame and the user RDI onto that stack,
 *   4. switches to the user CR3 using %rdi as the scratch register,
 *   5. restores RDI, executes SWAPGS and returns with INTERRUPT_RETURN.
 */
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp

	/*
	 * Copy the IRET frame to the trampoline stack.
	 * Offsets are relative to the old stack: slot 0 is user RDI,
	 * slot 1 is orig_ax (not copied), slots 2-6 are the IRET frame.
	 */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack. All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
	SWAPGS
	INTERRUPT_RETURN
纯汇编代码如下:
swapgs_restore_regs_and_return_to_usermode
.text:FFFFFFFF81600A34 41 5F             pop     r15
.text:FFFFFFFF81600A36 41 5E             pop     r14
.text:FFFFFFFF81600A38 41 5D             pop     r13
.text:FFFFFFFF81600A3A 41 5C             pop     r12
.text:FFFFFFFF81600A3C 5D                pop     rbp
.text:FFFFFFFF81600A3D 5B                pop     rbx
.text:FFFFFFFF81600A3E 41 5B             pop     r11
.text:FFFFFFFF81600A40 41 5A             pop     r10
.text:FFFFFFFF81600A42 41 59             pop     r9
.text:FFFFFFFF81600A44 41 58             pop     r8
.text:FFFFFFFF81600A46 58                pop     rax
.text:FFFFFFFF81600A47 59                pop     rcx
.text:FFFFFFFF81600A48 5A                pop     rdx
.text:FFFFFFFF81600A49 5E                pop     rsi
.text:FFFFFFFF81600A4A 48 89 E7          mov     rdi, rsp <<<<<<<<<<<<<<<<<<
.text:FFFFFFFF81600A4D 65 48 8B 24 25+   mov     rsp, gs:0x5004
.text:FFFFFFFF81600A56 FF 77 30          push    qword ptr [rdi+30h]
.text:FFFFFFFF81600A59 FF 77 28          push    qword ptr [rdi+28h]
.text:FFFFFFFF81600A5C FF 77 20          push    qword ptr [rdi+20h]
.text:FFFFFFFF81600A5F FF 77 18          push    qword ptr [rdi+18h]
.text:FFFFFFFF81600A62 FF 77 10          push    qword ptr [rdi+10h]
.text:FFFFFFFF81600A65 FF 37             push    qword ptr [rdi]
.text:FFFFFFFF81600A67 50                push    rax
.text:FFFFFFFF81600A68 EB 43             nop
.text:FFFFFFFF81600A6A 0F 20 DF          mov     rdi, cr3
.text:FFFFFFFF81600A6D EB 34             jmp     0xFFFFFFFF81600AA3
.text:FFFFFFFF81600AA3 48 81 CF 00 10+   or      rdi, 1000h
.text:FFFFFFFF81600AAA 0F 22 DF          mov     cr3, rdi
.text:FFFFFFFF81600AAD 58                pop     rax
.text:FFFFFFFF81600AAE 5F                pop     rdi
.text:FFFFFFFF81600AAF FF 15 23 65 62+   call    cs: SWAPGS
.text:FFFFFFFF81600AB5 FF 25 15 65 62+   jmp     cs: INTERRUPT_RETURN
_SWAPGS
.text:FFFFFFFF8103EFC0 55                push    rbp
.text:FFFFFFFF8103EFC1 48 89 E5          mov     rbp, rsp
.text:FFFFFFFF8103EFC4 0F 01 F8          swapgs
.text:FFFFFFFF8103EFC7 5D                pop     rbp
.text:FFFFFFFF8103EFC8 C3                retn
_INTERRUPT_RETURN
.text:FFFFFFFF81600AE0 F6 44 24 20 04    test    byte ptr [rsp+0x20],4
.text:FFFFFFFF81600AE5 75 02             jnz     native_irq_return_ldt
.text:FFFFFFFF81600AE7 48 CF             iretq