git: 75316a59b39e - releng/13.3 - rdmsr_safe/wrmsr_safe: handle pcb_onfault nesting

From: Andriy Gapon <avg_at_FreeBSD.org>
Date: Mon, 19 Feb 2024 09:52:03 UTC
The branch releng/13.3 has been updated by avg:

URL: https://cgit.FreeBSD.org/src/commit/?id=75316a59b39ee83ed8b0e98597abb7baeea6a788

commit 75316a59b39ee83ed8b0e98597abb7baeea6a788
Author:     Andriy Gapon <avg@FreeBSD.org>
AuthorDate: 2024-01-30 06:45:01 +0000
Commit:     Andriy Gapon <avg@FreeBSD.org>
CommitDate: 2024-02-19 09:51:07 +0000

    rdmsr_safe/wrmsr_safe: handle pcb_onfault nesting
    
    rdmsr_safe and wrmsr_safe can be called while pcb_onfault is already
    set, so the functions are modified to preserve the handler rather than
    resetting it before returning.
    
    One case where that happens is when the AMD microcode update routine
    is executed on a stack where copyin / copyout was already active.
    
    Here is a sample panic message from a crash caused by resetting the
    handler:
    
      <118>Updating CPU Microcode...
    
      Fatal trap 12: page fault while in kernel mode
      cpuid = 3; apic id = 03
      fault virtual address   = 0x11ed0de6000
      fault code              = supervisor write data, page not present
      instruction pointer     = 0x20:0xffffffff80c2df03
      stack pointer           = 0x28:0xfffffe01ce4a4c70
      frame pointer           = 0x28:0xfffffe01ce4a4c70
      code segment            = base 0x0, limit 0xfffff, type 0x1b
                              = DPL 0, pres 1, long 1, def32 0, gran 1
      processor eflags        = interrupt enabled, resume, IOPL = 0
      current process         = 117 (logger)
      trap number             = 12
      panic: page fault
      cpuid = 3
      time = 1681462027
      KDB: stack backtrace:
      db_trace_self_wrapper() at 0xffffffff80615deb = db_trace_self_wrapper+0x2b/frame 0xfffffe01ce4a4830
      kdb_backtrace() at 0xffffffff80943c77 = kdb_backtrace+0x37/frame 0xfffffe01ce4a48e0
      vpanic() at 0xffffffff808f5fe5 = vpanic+0x185/frame 0xfffffe01ce4a4940
      panic() at 0xffffffff808f5da3 = panic+0x43/frame 0xfffffe01ce4a49a0
      trap_fatal() at 0xffffffff80c31849 = trap_fatal+0x379/frame 0xfffffe01ce4a4a00
      trap_pfault() at 0xffffffff80c318b5 = trap_pfault+0x65/frame 0xfffffe01ce4a4a60
      trap() at 0xffffffff80c30f5f = trap+0x29f/frame 0xfffffe01ce4a4b80
      trap_check() at 0xffffffff80c31c29 = trap_check+0x29/frame 0xfffffe01ce4a4ba0
      calltrap() at 0xffffffff80c07fd8 = calltrap+0x8/frame 0xfffffe01ce4a4ba0
      --- trap 0xc, rip = 0xffffffff80c2df03, rsp = 0xfffffe01ce4a4c70, rbp = 0xfffffe01ce4a4c70 ---
      copyout_nosmap_std() at 0xffffffff80c2df03 = copyout_nosmap_std+0x63/frame 0xfffffe01ce4a4c70
      uiomove_faultflag() at 0xffffffff8095f0d5 = uiomove_faultflag+0xe5/frame 0xfffffe01ce4a4cb0
      uiomove() at 0xffffffff8095efeb = uiomove+0xb/frame 0xfffffe01ce4a4cc0
      pipe_read() at 0xffffffff80968860 = pipe_read+0x230/frame 0xfffffe01ce4a4d30
      dofileread() at 0xffffffff809653cb = dofileread+0x8b/frame 0xfffffe01ce4a4d80
      sys_read() at 0xffffffff80964fa0 = sys_read+0xc0/frame 0xfffffe01ce4a4df0
      amd64_syscall() at 0xffffffff80c3221a = amd64_syscall+0x18a/frame 0xfffffe01ce4a4f30
      fast_syscall_common() at 0xffffffff80c088eb = fast_syscall_common+0xf8/frame 0xfffffe01ce4a4f30
      --- syscall (3, FreeBSD ELF64, read), rip = 0x11ece41cfaa, rsp = 0x11ecbec4908, rbp = 0x11ecbec4920 ---
      Uptime: 41s
    
    And another one:
    
      Fatal trap 12: page fault while in kernel mode
      cpuid = 4; apic id = 04
      fault virtual address   = 0x800a22000
      fault code              = supervisor write data, page not present
      instruction pointer     = 0x20:0xffffffff80b2c7ca
      stack pointer           = 0x28:0xfffffe01c55b5480
      frame pointer           = 0x28:0xfffffe01c55b5480
      code segment            = base 0x0, limit 0xfffff, type 0x1b
                              = DPL 0, pres 1, long 1, def32 0, gran 1
      processor eflags        = interrupt enabled, resume, IOPL = 0
      current process         = 68418 (pfctl)
      trap number             = 12
      panic: page fault
      cpuid = 4
      time = 1625184463
      KDB: stack backtrace:
      db_trace_self_wrapper() at 0xffffffff805c1e8b = db_trace_self_wrapper+0x2b/frame 0xfffffe01c55b5040
      kdb_backtrace() at 0xffffffff808874b7 = kdb_backtrace+0x37/frame 0xfffffe01c55b50f0
      vpanic() at 0xffffffff808449d8 = vpanic+0x188/frame 0xfffffe01c55b5150
      panic() at 0xffffffff808445f3 = panic+0x43/frame 0xfffffe01c55b51b0
      trap_fatal() at 0xffffffff80b300a5 = trap_fatal+0x375/frame 0xfffffe01c55b5210
      trap_pfault() at 0xffffffff80b30180 = trap_pfault+0x80/frame 0xfffffe01c55b5280
      trap() at 0xffffffff80b2f729 = trap+0x289/frame 0xfffffe01c55b5390
      trap_check() at 0xffffffff80b304d9 = trap_check+0x29/frame 0xfffffe01c55b53b0
      calltrap() at 0xffffffff80b0bb28 = calltrap+0x8/frame 0xfffffe01c55b53b0
      --- trap 0xc, rip = 0xffffffff80b2c7ca, rsp = 0xfffffe01c55b5480, rbp = 0xfffffe01c55b5480 ---
      copyout_nosmap_std() at 0xffffffff80b2c7ca = copyout_nosmap_std+0x15a/frame 0xfffffe01c55b5480
      pfioctl() at 0xffffffff85539358 = pfioctl+0x4d28/frame 0xfffffe01c55b5940
      devfs_ioctl() at 0xffffffff807176cf = devfs_ioctl+0xcf/frame 0xfffffe01c55b59a0
      VOP_IOCTL_APV() at 0xffffffff80bb26e2 = VOP_IOCTL_APV+0x92/frame 0xfffffe01c55b59c0
      VOP_IOCTL() at 0xffffffff80928014 = VOP_IOCTL+0x34/frame 0xfffffe01c55b5a10
      vn_ioctl() at 0xffffffff80923330 = vn_ioctl+0xc0/frame 0xfffffe01c55b5b00
      devfs_ioctl_f() at 0xffffffff80717bbe = devfs_ioctl_f+0x1e/frame 0xfffffe01c55b5b20
      fo_ioctl() at 0xffffffff808abc6b = fo_ioctl+0xb/frame 0xfffffe01c55b5b30
      kern_ioctl() at 0xffffffff808abc01 = kern_ioctl+0x1d1/frame 0xfffffe01c55b5b80
      sys_ioctl() at 0xffffffff808ab982 = sys_ioctl+0x132/frame 0xfffffe01c55b5c50
      syscallenter() at 0xffffffff80b30cc9 = syscallenter+0x159/frame 0xfffffe01c55b5ca0
      amd64_syscall() at 0xffffffff80b309a5 = amd64_syscall+0x15/frame 0xfffffe01c55b5d30
      fast_syscall_common() at 0xffffffff80b0c44e = fast_syscall_common+0xf8/frame 0xfffffe01c55b5d30
    
    PR:             276426
    Reviewed by:    kib, markj
    Approved by:    re (cperciva)
    
    (cherry picked from commit 486b265a8fb6b2aad37f2819fa04feacf8184d53)
---
 sys/amd64/amd64/support.S | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index 936065a78879..dd269138c23f 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -1533,6 +1533,7 @@ ENTRY(rdmsr_safe)
 /* int rdmsr_safe(u_int msr, uint64_t *data) */
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
+	movq	PCB_ONFAULT(%r8),%r9
 	movq	$msr_onfault,PCB_ONFAULT(%r8)
 	movl	%edi,%ecx
 	rdmsr			/* Read MSR pointed by %ecx. Returns
@@ -1541,8 +1542,8 @@ ENTRY(rdmsr_safe)
 	movl	%eax,%eax	/* zero-extend %eax -> %rax */
 	orq	%rdx,%rax
 	movq	%rax,(%rsi)
-	xorq	%rax,%rax
-	movq	%rax,PCB_ONFAULT(%r8)
+	movq	%r9,PCB_ONFAULT(%r8)
+	xorl	%eax,%eax
 	POP_FRAME_POINTER
 	ret
 
@@ -1554,6 +1555,7 @@ ENTRY(wrmsr_safe)
 /* int wrmsr_safe(u_int msr, uint64_t data) */
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
+	movq	PCB_ONFAULT(%r8),%r9
 	movq	$msr_onfault,PCB_ONFAULT(%r8)
 	movl	%edi,%ecx
 	movl	%esi,%eax
@@ -1561,8 +1563,8 @@ ENTRY(wrmsr_safe)
 	movl	%esi,%edx
 	wrmsr			/* Write MSR pointed by %ecx. Accepts
 				   hi byte in edx, lo in %eax. */
-	xorq	%rax,%rax
-	movq	%rax,PCB_ONFAULT(%r8)
+	movq	%r9,PCB_ONFAULT(%r8)
+	xorl	%eax,%eax
 	POP_FRAME_POINTER
 	ret
 
@@ -1571,7 +1573,7 @@ ENTRY(wrmsr_safe)
  */
 	ALIGN_TEXT
 msr_onfault:
-	movq	$0,PCB_ONFAULT(%r8)
+	movq	%r9,PCB_ONFAULT(%r8)
 	movl	$EFAULT,%eax
 	POP_FRAME_POINTER
 	ret