PERFORCE change 30984 for review

Peter Wemm peter at FreeBSD.org
Sun May 11 18:18:32 PDT 2003


http://perforce.freebsd.org/chv.cgi?CH=30984

Change 30984 by peter at peter_hammer on 2003/05/11 18:17:36

	Use swapgs.  Ouch; this is hairy.  We have to avoid doing
	it a second time when trapping from kernel context, so
	check the frame's TF_CS to see if we're coming from kernel
	context.  This means converting *all* the trap gates to interrupt
	gates so that we can do the swapgs without the risk of an
	intermediate interrupt firing after entering supervisor mode
	but before swapgs.  This means that we have to undo the
	effects of the interrupt gate when we really want the
	trap gate.  Ugh.
	
	The other option is to have the regular entry points use the
	rdmsr/wrmsr stuff to save/restore the %GS.base etc *in the trap
	handlers*! and load the kernel %gs values and leave swapgs for
	the fast syscall stuff.  I'll do a time comparison later to see
	if this is in fact faster.
	
	Update comments.

Affected files ...

.. //depot/projects/hammer/sys/amd64/amd64/exception.S#6 edit
.. //depot/projects/hammer/sys/amd64/amd64/machdep.c#22 edit
.. //depot/projects/hammer/sys/amd64/isa/icu_vector.S#2 edit

Differences ...

==== //depot/projects/hammer/sys/amd64/amd64/exception.S#6 (text+ko) ====

@@ -51,16 +51,16 @@
 /*
  * Trap and fault vector routines.
  *
- * Most traps are 'trap gates', SDT_SYS386TGT.  A trap gate pushes state on
- * the stack that mostly looks like an interrupt, but does not disable 
- * interrupts.  A few of the traps we are use are interrupt gates, 
- * SDT_SYS386IGT, which are nearly the same thing except interrupts are
- * disabled on entry.
+ * All traps are 'interrupt gates', SDT_SYSIGT.  An interrupt gate pushes
+ * state on the stack but also disables interrupts.  This is important for
+ * us for the use of the swapgs instruction.  We cannot be interrupted
+ * until the GS.base value is correct.  For most traps, we automatically
+ * then enable interrupts if the interrupted context had them enabled.
+ * This is equivalent to the i386 port's use of SDT_SYS386TGT.
  *
  * The cpu will push a certain amount of state onto the kernel stack for
- * the current process.  The amount of state depends on the type of trap 
- * and whether the trap crossed rings or not.  See i386/include/frame.h.  
- * At the very least the current EFLAGS (status register, which includes 
+ * the current process.  See amd64/include/frame.h.  
+ * This includes the current RFLAGS (status register, which includes 
  * the interrupt disable state prior to the trap), the code segment register,
  * and the return instruction pointer are pushed by the cpu.  The cpu 
  * will also push an 'error' code for certain traps.  We push a dummy 
@@ -75,6 +75,7 @@
 #define	IDTVEC(name)	ALIGN_TEXT; .globl __CONCAT(X,name); \
 			.type __CONCAT(X,name),@function; __CONCAT(X,name):
 #define	TRAP(a)		pushq $(a) ; jmp alltraps
+#define	TRAP_NOEN(a)	pushq $(a) ; jmp alltraps_noen
 
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
@@ -82,11 +83,11 @@
 IDTVEC(div)
 	pushq $0; TRAP(T_DIVIDE)
 IDTVEC(dbg)
-	pushq $0; TRAP(T_TRCTRAP)
+	pushq $0; TRAP_NOEN(T_TRCTRAP)
 IDTVEC(nmi)
 	pushq $0; TRAP(T_NMI)
 IDTVEC(bpt)
-	pushq $0; TRAP(T_BPTFLT)
+	pushq $0; TRAP_NOEN(T_BPTFLT)
 IDTVEC(ofl)
 	pushq $0; TRAP(T_OFLOW)
 IDTVEC(bnd)
@@ -106,7 +107,7 @@
 IDTVEC(prot)
 	TRAP(T_PROTFLT)
 IDTVEC(page)
-	TRAP(T_PAGEFLT)
+	TRAP_NOEN(T_PAGEFLT)
 IDTVEC(mchk)
 	pushq $0; TRAP(T_MCHK)
 IDTVEC(rsvd)
@@ -119,10 +120,9 @@
 	pushq $0; TRAP(T_XMMFLT)
 	
 	/*
-	 * alltraps entry point.  Interrupts are enabled if this was a trap
-	 * gate (TGT), else disabled if this was an interrupt gate (IGT).
-	 * Note that int0x80_syscall is a trap gate.  Only page faults
-	 * use an interrupt gate.
+	 * alltraps entry point.  Use swapgs if this is the first time in the
+	 * kernel from userland.  Reenable interrupts if they were enabled
+	 * before the trap.  This approximates SDT_SYS386TGT on the i386 port.
 	 */
 
 	SUPERALIGN_TEXT
@@ -130,6 +130,14 @@
 	.type	alltraps,@function
 alltraps:
 	subq	$TF_TRAPNO,%rsp		/* tf_err and tf_trapno already pushed */
+	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+	jz	alltraps_testi		/* already running with kernel GS.base */
+	swapgs
+alltraps_testi:
+	testl	$PSL_I,TF_RFLAGS(%rsp)
+	jz	alltraps_pushregs
+	sti
+alltraps_pushregs:
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
@@ -153,22 +161,43 @@
 	MEXITCOUNT
 	jmp	doreti			/* Handle any pending ASTs */
 
+	/*
+	 * alltraps_noen entry point.  Unlike alltraps above, we want to
+	 * leave the interrupts disabled.  This corresponds to
+	 * SDT_SYS386IGT on the i386 port.
+	 */
+	SUPERALIGN_TEXT
+	.globl	alltraps_noen
+	.type	alltraps_noen,@function
+alltraps_noen:
+	subq	$TF_TRAPNO,%rsp		/* tf_err and tf_trapno already pushed */
+	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+	jz	alltraps_pushregs	/* already running with kernel GS.base */
+	swapgs
+	jmp	alltraps_pushregs
+
+IDTVEC(dblfault)
+	pushq	$T_DOUBLEFLT
+	subq	$TF_TRAPNO,%rsp		/* tf_err and tf_trapno already pushed */
+	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+	jz	1f			/* already running with kernel GS.base */
+	swapgs
+1:	call	dblfault_handler
+2:	hlt
+	jmp	2b
+
 /*
  * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80)
  *
- * Even though the name says 'int0x80', this is actually a TGT (trap gate)
- * rather then an IGT (interrupt gate).  Thus interrupts are enabled on
- * entry just as they are for a normal syscall.
- *
- * This leaves a place to put eflags so that the call frame can be
- * converted to a trap frame. Note that the eflags is (semi-)bogusly
- * pushed into (what will be) tf_err and then copied later into the
- * final spot. It has to be done this way because esp can't be just
- * temporarily altered for the pushfl - an interrupt might come in
- * and clobber the saved cs/eip.
+ * This is a SDT_SYSIGT entry point (unlike the i386 port) so that we
+ * can do a swapgs before enabling interrupts.  This is critical because
+ * if we took an interrupt before swapgs, the interrupt code would see
+ * that it originated in supervisor mode and skip the swapgs.
  */
 	SUPERALIGN_TEXT
 IDTVEC(int0x80_syscall)
+	swapgs
+	sti
 	pushq	$2			/* sizeof "int 0x80" */
 	subq	$TF_ERR,%rsp		/* skip over tf_trapno */
 	movq	%rdi,TF_RDI(%rsp)
@@ -196,19 +225,21 @@
  * and the new privilige level.  We are still running on the old user stack
  * pointer.  We have to juggle a few things around to find our stack etc.
  * swapgs gives us access to our PCPU space only.
- * XXX The PCPU stuff is stubbed out right now...
  */
 IDTVEC(fast_syscall)
-	/* XXX swapgs */
+	swapgs
 	movq	%rsp,PCPU(SCRATCH_RSP)
 	movq	common_tss+COMMON_TSS_RSP0,%rsp
 	/* Now emulate a trapframe. Ugh. */
 	subq	$TF_SIZE,%rsp
-	movq	$KUDSEL,TF_SS(%rsp)
 	/* defer TF_RSP till we have a spare register */
 	movq	%r11,TF_RFLAGS(%rsp)
+	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
+	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
+	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
+	sti
+	movq	$KUDSEL,TF_SS(%rsp)
 	movq	$KUCSEL,TF_CS(%rsp)
-	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
 	movq	$2,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
 	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
@@ -223,14 +254,10 @@
 	movq	%r13,TF_R13(%rsp)	/* C preserved */
 	movq	%r14,TF_R14(%rsp)	/* C preserved */
 	movq	%r15,TF_R15(%rsp)	/* C preserved */
-	movq	PCPU(SCRATCH_RSP),%r12	/* %r12 already saved */
-	movq	%r12,TF_RSP(%rsp)	/* user stack pointer */
-	sti
 	call	syscall
 	movq	PCPU(CURPCB),%rax
 	testq	$PCB_FULLCTX,PCB_FLAGS(%rax)
 	jne	3f
-	/* simplified from doreti */
 1:	/* Check for and handle AST's on return to userland */
 	cli
 	movq	PCPU(CURTHREAD),%rax
@@ -255,7 +282,7 @@
 	movq	TF_RIP(%rsp),%rcx	/* original %rip */
 	movq	TF_RSP(%rsp),%r9	/* user stack pointer */
 	movq	%r9,%rsp		/* original %rsp */
-	/* XXX swapgs */
+	swapgs
 	sysretq
 3:	/* Requested full context restore, use doreti for that */
 	andq	$~PCB_FULLCTX,PCB_FLAGS(%rax)
@@ -344,12 +371,16 @@
 	movq	TF_R13(%rsp),%r13
 	movq	TF_R14(%rsp),%r14
 	movq	TF_R15(%rsp),%r15
-	addq	$TF_RIP,%rsp		/* skip over tf_err, tf_trapno */
+	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+	jz	1f			/* keep running with kernel GS.base */
+	cli
+	swapgs
+1:	addq	$TF_RIP,%rsp		/* skip over tf_err, tf_trapno */
 	.globl	doreti_iret
 doreti_iret:
 	iretq
 
-  	/*
+	/*
 	 * doreti_iret_fault and friends.  Alternative return code for
 	 * the case where we get a fault in the doreti_exit code
 	 * above.  trap() (i386/i386/trap.c) catches this specific
@@ -360,7 +391,13 @@
 	.globl	doreti_iret_fault
 doreti_iret_fault:
 	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
-	movq	%rdi,TF_RDI(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+	jz	1f			/* already running with kernel GS.base */
+	swapgs
+1:	testl	$PSL_I,TF_RFLAGS(%rsp)
+	jz	2f
+	sti
+2:	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rcx,TF_RCX(%rsp)

==== //depot/projects/hammer/sys/amd64/amd64/machdep.c#22 (text+ko) ====

@@ -654,7 +654,7 @@
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
-	IDTVEC(xmm), IDTVEC(int0x80_syscall),
+	IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(int0x80_syscall),
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
 
 void
@@ -1182,9 +1182,9 @@
 	lgdt(&r_gdt);
 	pc = &__pcpu;
 
-	wrmsr(MSR_FSBASE, (u_int64_t)pc);
+	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
-	wrmsr(MSR_KGSBASE, (u_int64_t)pc);
+	wrmsr(MSR_KGSBASE, 0);		/* User value while we're in the kernel */
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	PCPU_SET(prvspace, pc);
@@ -1204,28 +1204,28 @@
 
 	/* exceptions */
 	for (x = 0; x < NIDT; x++)
-		setidt(x, &IDTVEC(rsvd), SDT_SYSTGT, SEL_KPL, 0);
-	setidt(0, &IDTVEC(div),  SDT_SYSTGT, SEL_KPL, 0);
+		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(0, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(1, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(2, &IDTVEC(nmi),  SDT_SYSTGT, SEL_KPL, 0);
+	setidt(2, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 0);
  	setidt(3, &IDTVEC(bpt),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(4, &IDTVEC(ofl),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(5, &IDTVEC(bnd),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(6, &IDTVEC(ill),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(7, &IDTVEC(dna),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(8, (inthand_t *)dblfault_handler, SDT_SYSIGT, SEL_KPL, 1);
-	setidt(9, &IDTVEC(fpusegm),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(10, &IDTVEC(tss),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(11, &IDTVEC(missing),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(12, &IDTVEC(stk),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(13, &IDTVEC(prot),  SDT_SYSTGT, SEL_KPL, 0);
+	setidt(4, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(5, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(6, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(7, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(8, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
+	setidt(9, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(10, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(11, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(12, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(13, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(14, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(15, &IDTVEC(rsvd),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(16, &IDTVEC(fpu),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(17, &IDTVEC(align), SDT_SYSTGT, SEL_KPL, 0);
-	setidt(18, &IDTVEC(mchk),  SDT_SYSTGT, SEL_KPL, 0);
-	setidt(19, &IDTVEC(xmm), SDT_SYSTGT, SEL_KPL, 0);
- 	setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYSTGT, SEL_UPL, 0);
+	setidt(15, &IDTVEC(rsvd),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(16, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(17, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(18, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(19, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
+ 	setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
@@ -1251,8 +1251,6 @@
 
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	common_tss.tss_rsp0 = thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb);
-	/* XXX we need to update tss_rsp0 in cpu_switch */
-	/* XXX maybe not yet, everything is still running in supervisor mode */
 
 	/* doublefault stack space, runs on ist1 */
 	common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

==== //depot/projects/hammer/sys/amd64/isa/icu_vector.S#2 (text+ko) ====

@@ -25,7 +25,10 @@
 	SUPERALIGN_TEXT ;						\
 IDTVEC(vec_name) ;							\
 	subq	$TF_RIP,%rsp ;	/* skip dummy tf_err and tf_trapno */	\
-	movq	%rdi,TF_RDI(%rsp) ;					\
+	testb	$SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */	\
+	jz	1f ;		/* Yes, dont swapgs again */		\
+	swapgs ;							\
+1:	movq	%rdi,TF_RDI(%rsp) ;					\
 	movq	%rsi,TF_RSI(%rsp) ;					\
 	movq	%rdx,TF_RDX(%rsp) ;					\
 	movq	%rcx,TF_RCX(%rsp) ;					\
@@ -69,7 +72,10 @@
 	SUPERALIGN_TEXT ;						\
 IDTVEC(vec_name) ;							\
 	subq	$TF_RIP,%rsp ;	/* skip dummy tf_err and tf_trapno */	\
-	movq	%rdi,TF_RDI(%rsp) ;					\
+	testb	$SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */	\
+	jz	1f ;		/* Yes, dont swapgs again */		\
+	swapgs ;							\
+1:	movq	%rdi,TF_RDI(%rsp) ;					\
 	movq	%rsi,TF_RSI(%rsp) ;					\
 	movq	%rdx,TF_RDX(%rsp) ;					\
 	movq	%rcx,TF_RCX(%rsp) ;					\


More information about the p4-projects mailing list