void syscall_init(void)
{
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
        wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

#ifdef CONFIG_IA32_EMULATION
        wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
        /*
         * This only works on Intel CPUs.
         * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
         * This does not cause SYSENTER to jump to the wrong location, because
         * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
         */
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
        wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
        wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
        wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif

        /* Flags to clear on syscall */
        wrmsrl(MSR_SYSCALL_MASK,
               X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
               X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
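For reference, the convention that this MSR setup establishes can be seen from the user side: MSR_LSTAR makes the syscall instruction jump to entry_SYSCALL_64, with rax carrying the syscall number, rdi/rsi/rdx the first three arguments, and rcx/r11 clobbered by the CPU to hold the return RIP and RFLAGS. Below is a minimal userspace sketch of my own (raw_write is a hypothetical helper, not kernel or libc code), assuming GCC/Clang-style inline assembly on x86-64 Linux:

/* Issue write(1, buf, len) through the raw SYSCALL instruction. rax holds
 * the syscall number, rdi/rsi/rdx the arguments; the CPU saves the return
 * RIP in rcx and RFLAGS in r11, which is exactly what entry_SYSCALL_64
 * later pushes as pt_regs->ip and pt_regs->flags. */
static long raw_write(int fd, const void *buf, unsigned long len)
{
        long ret;
        long nr = 1;                    /* __NR_write on x86-64 */

        __asm__ __volatile__("syscall"
                             : "=a"(ret)
                             : "a"(nr), "D"(fd), "S"(buf), "d"(len)
                             : "rcx", "r11", "memory");
        return ret;
}

int main(void)
{
        raw_write(1, "hello\n", 6);
        return 0;
}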
ENTRY(entry_SYSCALL_64)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
// KPTI: entering the kernel requires switching to the kernel page tables
SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
GLOBAL(entry_SYSCALL_64_after_swapgs)
// Save the user stack pointer into the per-cpu variable rsp_scratch
movq %rsp, PER_CPU_VAR(rsp_scratch)
// Load the top of the kernel stack into rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
TRACE_IRQS_OFF
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
// Reserve space for r12-r15, rbp and rbx
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
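/*
 * At this point the frame on the kernel stack is a partial struct pt_regs.
 * From the highest address down: ss, sp, flags, cs, ip, orig_ax, di, si,
 * dx, cx, ax (preset to -ENOSYS), r8, r9, r10, r11, followed by the 6*8
 * bytes just reserved for bx, bp and r12-r15, which the fast path does
 * not save.
 */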
/*
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
*/
movq PER_CPU_VAR(current_task), %r11
testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz entry_SYSCALL64_slow_path
entry_SYSCALL_64_fastpath:
/*
* Easy case: enable interrupts and issue the syscall. If the syscall
* needs pt_regs, we'll call a stub that disables interrupts again
* and jumps to the slow path.
*/
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
// Make sure the syscall number does not exceed the maximum; if it does, jump to label 1 below to return
cmpq $__NR_syscall_max, %rax
#else
andl $__SYSCALL_MASK, %eax
cmpl $__NR_syscall_max, %eax
#endif
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
// The C calling convention passes the fourth argument in rcx, but SYSCALL clobbers rcx, so user space passes it in r10; move it back into rcx before the call
movq %r10, %rcx
/*
* This call instruction is handled specially in stub_ptregs_64.
* It might end up jumping to the slow path. If it jumps, RAX
* and all argument registers are clobbered.
*/
// Call the corresponding handler in the system call table
call *sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:
// Save the handler's return value into the pt_regs->ax slot on the stack; it is restored when returning to user space
movq %rax, RAX(%rsp)
1:
/*
* If we get here, then we know that pt_regs is clean for SYSRET64.
* If we see that no exit work is required (which we are required
* to check with IRQs off), then we can go straight to SYSRET64.
*/
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movq PER_CPU_VAR(current_task), %r11
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
LOCKDEP_SYS_EXIT // The implementation of this macro depends on the CONFIG_DEBUG_LOCK_ALLOC kernel option, which enables lock debugging on system call exit.
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
// Restore all general-purpose registers except rcx and r11: rcx holds the return address of the application that issued the system call, and r11 holds the old flags register (both were just loaded above for SYSRET64)
/*
* This opens a window where we have a user CR3, but are
* running in the kernel. This makes using the CS
* register useless for telling whether or not we need to
* switch CR3 in NMIs. Normal interrupts are OK because
* they are off here.
*/
SWITCH_USER_CR3 // KPTI: returning to user mode requires switching back to the user page tables
/* Restore rsp to the user-mode stack top from the values pushed earlier */
movq RSP(%rsp), %rsp
USERGS_SYSRET64
/* The USERGS_SYSRET64 macro expands to a swapgs instruction, swapping the user and kernel GS bases, followed by sysret, which completes the exit from system call handling */
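Put together, the fast path above is roughly equivalent to the following C sketch. This is my own illustration, not kernel source: the struct and table stand-ins are simplified (the real pt_regs lives in arch/x86/include/asm/ptrace.h), but the bound check against the maximum syscall number and the r10-to-rcx remapping mirror the assembly.

#include <errno.h>      /* ENOSYS */

/* Simplified stand-ins for pt_regs and the kernel's sys_call_table. */
struct pt_regs_sketch {
        unsigned long di, si, dx, r10, r8, r9;
        unsigned long orig_ax;          /* syscall number, saved from rax */
};

typedef long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long,
                               unsigned long, unsigned long, unsigned long);

extern const sys_call_ptr_t sys_call_table_sketch[];
extern const unsigned long nr_syscall_max_sketch;

long fastpath_dispatch(struct pt_regs_sketch *regs)
{
        unsigned long nr = regs->orig_ax;

        /* "cmpq $__NR_syscall_max, %rax; ja 1f": out-of-range numbers keep
         * the -ENOSYS that was already pushed into pt_regs->ax. */
        if (nr > nr_syscall_max_sketch)
                return -ENOSYS;

        /* "movq %r10, %rcx": the syscall ABI carries argument 4 in r10
         * because SYSCALL clobbers rcx; the handlers use the normal C
         * calling convention and expect it as the fourth argument. */
        return sys_call_table_sketch[nr](regs->di, regs->si, regs->dx,
                                         regs->r10, regs->r8, regs->r9);
}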
........
........
arch/x86/entry/entry_64.S
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
POP_REGS pop_rdi=0
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
pushq 5*8(%rdi) /* RSP */
pushq 4*8(%rdi) /* EFLAGS */
pushq 3*8(%rdi) /* CS */
pushq 2*8(%rdi) /* RIP */
/* Push user RDI on the trampoline stack. */
pushq (%rdi)
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
/* Restore RDI. */
popq %rdi
SWAPGS
INTERRUPT_RETURN
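The offsets used by the pushq N*8(%rdi) copies correspond to the old-stack layout described in the comment above ("user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS"). As a sketch (the struct name is mine, for illustration only):

/* What %rdi points at while the IRET frame is copied to the trampoline
 * stack; offsets match the pushq N*8(%rdi) sequence above. */
struct exit_frame_sketch {
        unsigned long rdi;      /* 0*8: user RDI, left on the stack by POP_REGS pop_rdi=0 */
        unsigned long orig_ax;  /* 1*8 */
        unsigned long rip;      /* 2*8: "pushq 2*8(%rdi)" */
        unsigned long cs;       /* 3*8 */
        unsigned long eflags;   /* 4*8 */
        unsigned long rsp;      /* 5*8 */
        unsigned long ss;       /* 6*8: copied first, so it ends up highest on the new stack */
};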
In raw disassembly, the same return path looks like this:
swapgs_restore_regs_and_return_to_usermode
.text:FFFFFFFF81600A34 41 5F              pop     r15
.text:FFFFFFFF81600A36 41 5E              pop     r14
.text:FFFFFFFF81600A38 41 5D              pop     r13
.text:FFFFFFFF81600A3A 41 5C              pop     r12
.text:FFFFFFFF81600A3C 5D                 pop     rbp
.text:FFFFFFFF81600A3D 5B                 pop     rbx
.text:FFFFFFFF81600A3E 41 5B              pop     r11
.text:FFFFFFFF81600A40 41 5A              pop     r10
.text:FFFFFFFF81600A42 41 59              pop     r9
.text:FFFFFFFF81600A44 41 58              pop     r8
.text:FFFFFFFF81600A46 58                 pop     rax
.text:FFFFFFFF81600A47 59                 pop     rcx
.text:FFFFFFFF81600A48 5A                 pop     rdx
.text:FFFFFFFF81600A49 5E                 pop     rsi
.text:FFFFFFFF81600A4A 48 89 E7           mov     rdi, rsp        <<<<<<<<<<<<<<<<<<
.text:FFFFFFFF81600A4D 65 48 8B 24 25+    mov     rsp, gs:0x5004
.text:FFFFFFFF81600A56 FF 77 30           push    qword ptr [rdi+30h]
.text:FFFFFFFF81600A59 FF 77 28           push    qword ptr [rdi+28h]
.text:FFFFFFFF81600A5C FF 77 20           push    qword ptr [rdi+20h]
.text:FFFFFFFF81600A5F FF 77 18           push    qword ptr [rdi+18h]
.text:FFFFFFFF81600A62 FF 77 10           push    qword ptr [rdi+10h]
.text:FFFFFFFF81600A65 FF 37              push    qword ptr [rdi]
.text:FFFFFFFF81600A67 50                 push    rax
.text:FFFFFFFF81600A68 EB 43              nop
.text:FFFFFFFF81600A6A 0F 20 DF           mov     rdi, cr3
.text:FFFFFFFF81600A6D EB 34              jmp     0xFFFFFFFF81600AA3
.text:FFFFFFFF81600AA3 48 81 CF 00 10+    or      rdi, 1000h
.text:FFFFFFFF81600AAA 0F 22 DF           mov     cr3, rdi
.text:FFFFFFFF81600AAD 58                 pop     rax
.text:FFFFFFFF81600AAE 5F                 pop     rdi
.text:FFFFFFFF81600AAF FF 15 23 65 62+    call    cs:SWAPGS
.text:FFFFFFFF81600AB5 FF 25 15 65 62+    jmp     cs:INTERRUPT_RETURN

_SWAPGS
.text:FFFFFFFF8103EFC0 55                 push    rbp
.text:FFFFFFFF8103EFC1 48 89 E5           mov     rbp, rsp
.text:FFFFFFFF8103EFC4 0F 01 F8           swapgs
.text:FFFFFFFF8103EFC7 5D                 pop     rbp
.text:FFFFFFFF8103EFC8 C3                 retn

_INTERRUPT_RETURN
.text:FFFFFFFF81600AE0 F6 44 24 20 04     test    byte ptr [rsp+0x20], 4
.text:FFFFFFFF81600AE5 75 02              jnz     native_irq_return_ldt
.text:FFFFFFFF81600AE7 48 CF              iretq
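The "or rdi, 1000h" in the dump is the KPTI page-table switch: the kernel and user page-table roots are allocated as an adjacent pair, and bit 12 of CR3 selects the user half. A minimal sketch of that bit manipulation, ignoring the PCID/ASID bits that kernels with PCID support also flip (the constant mirrors the kernel's PTI_USER_PGTABLE_BIT; the helper names are mine):

#include <stdint.h>

#define PTI_USER_PGTABLE_BIT    12                      /* PAGE_SHIFT */
#define PTI_USER_PGTABLE_MASK   (1UL << PTI_USER_PGTABLE_BIT)

/* Exit path: "mov rdi, cr3; or rdi, 1000h; mov cr3, rdi" */
static inline uint64_t cr3_to_user(uint64_t kernel_cr3)
{
        return kernel_cr3 | PTI_USER_PGTABLE_MASK;
}

/* Entry path (SWITCH_KERNEL_CR3) clears the same bit again */
static inline uint64_t cr3_to_kernel(uint64_t user_cr3)
{
        return user_cr3 & ~PTI_USER_PGTABLE_MASK;
}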