PERFORCE change 30984 for review
Peter Wemm
peter at FreeBSD.org
Sun May 11 18:18:32 PDT 2003
http://perforce.freebsd.org/chv.cgi?CH=30984
Change 30984 by peter at peter_hammer on 2003/05/11 18:17:36
Use swapgs. Ouch; this is hairy. We have to avoid doing
it a second time when trapping from kernel context, so
check the frame's TF_CS to see if we're coming from kernel
context. This means converting *all* the trap gates to interrupt
gates so that we can do the swapgs without the risk of an
intermediate interrupt firing after entering supervisor mode
but before swapgs. That in turn means we have to undo the
effects of the interrupt gate (re-enable interrupts by hand) when
we really wanted trap gate behaviour. Ugh.
The other option is to have the regular entry points use
rdmsr/wrmsr to save/restore the %GS.base etc. *in the trap
handlers* (!), load the kernel %gs values there, and leave swapgs
for the fast syscall path only. I'll do a timing comparison later
to see if this is in fact faster.
Update comments.
Affected files ...
.. //depot/projects/hammer/sys/amd64/amd64/exception.S#6 edit
.. //depot/projects/hammer/sys/amd64/amd64/machdep.c#22 edit
.. //depot/projects/hammer/sys/amd64/isa/icu_vector.S#2 edit
Differences ...
==== //depot/projects/hammer/sys/amd64/amd64/exception.S#6 (text+ko) ====
@@ -51,16 +51,16 @@
/*
* Trap and fault vector routines.
*
- * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on
- * the stack that mostly looks like an interrupt, but does not disable
- * interrupts. A few of the traps we are use are interrupt gates,
- * SDT_SYS386IGT, which are nearly the same thing except interrupts are
- * disabled on entry.
+ * All traps are 'interrupt gates', SDT_SYSIGT. An interrupt gate pushes
+ * state on the stack but also disables interrupts. This is important for
+ * us for the use of the swapgs instruction. We cannot be interrupted
+ * until the GS.base value is correct. For most traps, we automatically
+ * then enable interrupts if the interrupted context had them enabled.
+ * This is equivalent to the i386 port's use of SDT_SYS386TGT.
*
* The cpu will push a certain amount of state onto the kernel stack for
- * the current process. The amount of state depends on the type of trap
- * and whether the trap crossed rings or not. See i386/include/frame.h.
- * At the very least the current EFLAGS (status register, which includes
+ * the current process. See amd64/include/frame.h.
+ * This includes the current RFLAGS (status register, which includes
* the interrupt disable state prior to the trap), the code segment register,
* and the return instruction pointer are pushed by the cpu. The cpu
* will also push an 'error' code for certain traps. We push a dummy
@@ -75,6 +75,7 @@
#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \
.type __CONCAT(X,name),@function; __CONCAT(X,name):
#define TRAP(a) pushq $(a) ; jmp alltraps
+#define TRAP_NOEN(a) pushq $(a) ; jmp alltraps_noen
MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)
@@ -82,11 +83,11 @@
IDTVEC(div)
pushq $0; TRAP(T_DIVIDE)
IDTVEC(dbg)
- pushq $0; TRAP(T_TRCTRAP)
+ pushq $0; TRAP_NOEN(T_TRCTRAP)
IDTVEC(nmi)
pushq $0; TRAP(T_NMI)
IDTVEC(bpt)
- pushq $0; TRAP(T_BPTFLT)
+ pushq $0; TRAP_NOEN(T_BPTFLT)
IDTVEC(ofl)
pushq $0; TRAP(T_OFLOW)
IDTVEC(bnd)
@@ -106,7 +107,7 @@
IDTVEC(prot)
TRAP(T_PROTFLT)
IDTVEC(page)
- TRAP(T_PAGEFLT)
+ TRAP_NOEN(T_PAGEFLT)
IDTVEC(mchk)
pushq $0; TRAP(T_MCHK)
IDTVEC(rsvd)
@@ -119,10 +120,9 @@
pushq $0; TRAP(T_XMMFLT)
/*
- * alltraps entry point. Interrupts are enabled if this was a trap
- * gate (TGT), else disabled if this was an interrupt gate (IGT).
- * Note that int0x80_syscall is a trap gate. Only page faults
- * use an interrupt gate.
+ * alltraps entry point. Use swapgs if this is the first time in the
+ * kernel from userland. Reenable interrupts if they were enabled
+ * before the trap. This approximates SDT_SYS386TGT on the i386 port.
*/
SUPERALIGN_TEXT
@@ -130,6 +130,14 @@
.type alltraps,@function
alltraps:
subq $TF_TRAPNO,%rsp /* tf_err and tf_trapno already pushed */
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz alltraps_testi /* already running with kernel GS.base */
+ swapgs
+alltraps_testi:
+ testl $PSL_I,TF_RFLAGS(%rsp)
+ jz alltraps_pushregs
+ sti
+alltraps_pushregs:
movq %rdi,TF_RDI(%rsp)
movq %rsi,TF_RSI(%rsp)
movq %rdx,TF_RDX(%rsp)
@@ -153,22 +161,43 @@
MEXITCOUNT
jmp doreti /* Handle any pending ASTs */
+ /*
+ * alltraps_noen entry point. Unlike alltraps above, we want to
+ * leave the interrupts disabled. This corresponds to
+ * SDT_SYS386IGT on the i386 port.
+ */
+ SUPERALIGN_TEXT
+ .globl alltraps_noen
+ .type alltraps_noen,@function
+alltraps_noen:
+ subq $TF_TRAPNO,%rsp /* tf_err and tf_trapno already pushed */
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz alltraps_pushregs /* already running with kernel GS.base */
+ swapgs
+ jmp alltraps_pushregs
+
+IDTVEC(dblfault)
+ pushq $T_DOUBLEFLT
+ subq $TF_TRAPNO,%rsp /* tf_err and tf_trapno already pushed */
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 1f /* already running with kernel GS.base */
+ swapgs
+1: call dblfault_handler
+2: hlt
+ jmp 2b
+
/*
* Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80)
*
- * Even though the name says 'int0x80', this is actually a TGT (trap gate)
- * rather then an IGT (interrupt gate). Thus interrupts are enabled on
- * entry just as they are for a normal syscall.
- *
- * This leaves a place to put eflags so that the call frame can be
- * converted to a trap frame. Note that the eflags is (semi-)bogusly
- * pushed into (what will be) tf_err and then copied later into the
- * final spot. It has to be done this way because esp can't be just
- * temporarily altered for the pushfl - an interrupt might come in
- * and clobber the saved cs/eip.
+ * This is a SDT_SYSIDT entry point (unlike the i386 port) so that we
+ * can do a swapgs before enabling interrupts. This is critical because
+ * if we took an interrupt before swapgs, the interrupt code would see
+ * that it originated in supervisor mode and skip the swapgs.
*/
SUPERALIGN_TEXT
IDTVEC(int0x80_syscall)
+ swapgs
+ sti
pushq $2 /* sizeof "int 0x80" */
subq $TF_ERR,%rsp /* skip over tf_trapno */
movq %rdi,TF_RDI(%rsp)
@@ -196,19 +225,21 @@
* and the new privilige level. We are still running on the old user stack
* pointer. We have to juggle a few things around to find our stack etc.
* swapgs gives us access to our PCPU space only.
- * XXX The PCPU stuff is stubbed out right now...
*/
IDTVEC(fast_syscall)
- /* XXX swapgs */
+ swapgs
movq %rsp,PCPU(SCRATCH_RSP)
movq common_tss+COMMON_TSS_RSP0,%rsp
/* Now emulate a trapframe. Ugh. */
subq $TF_SIZE,%rsp
- movq $KUDSEL,TF_SS(%rsp)
/* defer TF_RSP till we have a spare register */
movq %r11,TF_RFLAGS(%rsp)
+ movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */
+ movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */
+ movq %r11,TF_RSP(%rsp) /* user stack pointer */
+ sti
+ movq $KUDSEL,TF_SS(%rsp)
movq $KUCSEL,TF_CS(%rsp)
- movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */
movq $2,TF_ERR(%rsp)
movq %rdi,TF_RDI(%rsp) /* arg 1 */
movq %rsi,TF_RSI(%rsp) /* arg 2 */
@@ -223,14 +254,10 @@
movq %r13,TF_R13(%rsp) /* C preserved */
movq %r14,TF_R14(%rsp) /* C preserved */
movq %r15,TF_R15(%rsp) /* C preserved */
- movq PCPU(SCRATCH_RSP),%r12 /* %r12 already saved */
- movq %r12,TF_RSP(%rsp) /* user stack pointer */
- sti
call syscall
movq PCPU(CURPCB),%rax
testq $PCB_FULLCTX,PCB_FLAGS(%rax)
jne 3f
- /* simplified from doreti */
1: /* Check for and handle AST's on return to userland */
cli
movq PCPU(CURTHREAD),%rax
@@ -255,7 +282,7 @@
movq TF_RIP(%rsp),%rcx /* original %rip */
movq TF_RSP(%rsp),%r9 /* user stack pointer */
movq %r9,%rsp /* original %rsp */
- /* XXX swapgs */
+ swapgs
sysretq
3: /* Requested full context restore, use doreti for that */
andq $~PCB_FULLCTX,PCB_FLAGS(%rax)
@@ -344,12 +371,16 @@
movq TF_R13(%rsp),%r13
movq TF_R14(%rsp),%r14
movq TF_R15(%rsp),%r15
- addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 1f /* keep running with kernel GS.base */
+ cli
+ swapgs
+1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
.globl doreti_iret
doreti_iret:
iretq
- /*
+ /*
* doreti_iret_fault and friends. Alternative return code for
* the case where we get a fault in the doreti_exit code
* above. trap() (i386/i386/trap.c) catches this specific
@@ -360,7 +391,13 @@
.globl doreti_iret_fault
doreti_iret_fault:
subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */
- movq %rdi,TF_RDI(%rsp)
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jz 1f /* already running with kernel GS.base */
+ swapgs
+1: testl $PSL_I,TF_RFLAGS(%rsp)
+ jz 2f
+ sti
+2: movq %rdi,TF_RDI(%rsp)
movq %rsi,TF_RSI(%rsp)
movq %rdx,TF_RDX(%rsp)
movq %rcx,TF_RCX(%rsp)
==== //depot/projects/hammer/sys/amd64/amd64/machdep.c#22 (text+ko) ====
@@ -654,7 +654,7 @@
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
- IDTVEC(xmm), IDTVEC(int0x80_syscall),
+ IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(int0x80_syscall),
IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
void
@@ -1182,9 +1182,9 @@
lgdt(&r_gdt);
pc = &__pcpu;
- wrmsr(MSR_FSBASE, (u_int64_t)pc);
+ wrmsr(MSR_FSBASE, 0); /* User value */
wrmsr(MSR_GSBASE, (u_int64_t)pc);
- wrmsr(MSR_KGSBASE, (u_int64_t)pc);
+ wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
pcpu_init(pc, 0, sizeof(struct pcpu));
PCPU_SET(prvspace, pc);
@@ -1204,28 +1204,28 @@
/* exceptions */
for (x = 0; x < NIDT; x++)
- setidt(x, &IDTVEC(rsvd), SDT_SYSTGT, SEL_KPL, 0);
- setidt(0, &IDTVEC(div), SDT_SYSTGT, SEL_KPL, 0);
+ setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(0, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
setidt(1, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
- setidt(2, &IDTVEC(nmi), SDT_SYSTGT, SEL_KPL, 0);
+ setidt(2, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 0);
setidt(3, &IDTVEC(bpt), SDT_SYSIGT, SEL_KPL, 0);
- setidt(4, &IDTVEC(ofl), SDT_SYSTGT, SEL_KPL, 0);
- setidt(5, &IDTVEC(bnd), SDT_SYSTGT, SEL_KPL, 0);
- setidt(6, &IDTVEC(ill), SDT_SYSTGT, SEL_KPL, 0);
- setidt(7, &IDTVEC(dna), SDT_SYSTGT, SEL_KPL, 0);
- setidt(8, (inthand_t *)dblfault_handler, SDT_SYSIGT, SEL_KPL, 1);
- setidt(9, &IDTVEC(fpusegm), SDT_SYSTGT, SEL_KPL, 0);
- setidt(10, &IDTVEC(tss), SDT_SYSTGT, SEL_KPL, 0);
- setidt(11, &IDTVEC(missing), SDT_SYSTGT, SEL_KPL, 0);
- setidt(12, &IDTVEC(stk), SDT_SYSTGT, SEL_KPL, 0);
- setidt(13, &IDTVEC(prot), SDT_SYSTGT, SEL_KPL, 0);
+ setidt(4, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(5, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(6, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(7, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(8, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
+ setidt(9, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(10, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(11, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(12, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(13, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
setidt(14, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
- setidt(15, &IDTVEC(rsvd), SDT_SYSTGT, SEL_KPL, 0);
- setidt(16, &IDTVEC(fpu), SDT_SYSTGT, SEL_KPL, 0);
- setidt(17, &IDTVEC(align), SDT_SYSTGT, SEL_KPL, 0);
- setidt(18, &IDTVEC(mchk), SDT_SYSTGT, SEL_KPL, 0);
- setidt(19, &IDTVEC(xmm), SDT_SYSTGT, SEL_KPL, 0);
- setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYSTGT, SEL_UPL, 0);
+ setidt(15, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(16, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(17, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(18, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(19, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (long) idt;
@@ -1251,8 +1251,6 @@
/* make an initial tss so cpu can get interrupt stack on syscall! */
common_tss.tss_rsp0 = thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb);
- /* XXX we need to update tss_rsp0 in cpu_switch */
- /* XXX maybe not yet, everything is still running in supervisor mode */
/* doublefault stack space, runs on ist1 */
common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
==== //depot/projects/hammer/sys/amd64/isa/icu_vector.S#2 (text+ko) ====
@@ -25,7 +25,10 @@
SUPERALIGN_TEXT ; \
IDTVEC(vec_name) ; \
subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \
- movq %rdi,TF_RDI(%rsp) ; \
+ testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
+ jz 1f ; /* Yes, dont swapgs again */ \
+ swapgs ; \
+1: movq %rdi,TF_RDI(%rsp) ; \
movq %rsi,TF_RSI(%rsp) ; \
movq %rdx,TF_RDX(%rsp) ; \
movq %rcx,TF_RCX(%rsp) ; \
@@ -69,7 +72,10 @@
SUPERALIGN_TEXT ; \
IDTVEC(vec_name) ; \
subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \
- movq %rdi,TF_RDI(%rsp) ; \
+ testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
+ jz 1f ; /* Yes, dont swapgs again */ \
+ swapgs ; \
+1: movq %rdi,TF_RDI(%rsp) ; \
movq %rsi,TF_RSI(%rsp) ; \
movq %rdx,TF_RDX(%rsp) ; \
movq %rcx,TF_RCX(%rsp) ; \
More information about the p4-projects
mailing list