Re: FYI: aarch64 boot (HoneyComb): example crash during system checks (power-off/power-on form of reboot still fails)

From: Mark Millard via arm <arm_at_freebsd.org>
Date: Sun, 21 Nov 2021 19:36:08 UTC
On 2021-Nov-21, at 11:26, Mark Millard <marklmi@yahoo.com> wrote:

> Starting file system checks:
> /dev/gpt/CA72opt0EFI: 41 files, 242 MiB free (15469 clusters)
> FIXED
> /d  x0: ffff000000e43ec8 (blocked_lock + 0)
>  x1: ffff00013efa9f50
>  x2: ffff00000090e39a (cam_status_table + 1d132)
>  x3:         deadc0d8
>  x4:                0
>  x5: ffff00000082a138 (data_abort + 0)
>  x6:                5
>  x7:              601
>  x8: ffff000000e43ec8 (blocked_lock + 0)
>  x9:         deadc0de
> x10:                0
> x11:          3938700
> x12:                0
> x13:             8000
> x14:              1de
> x15:             81ce
> x16:         425b9080
> x17:             8000
> x18: ffff00013efa9f40
> x19: ffff000000e43ec8 (blocked_lock + 0)
> x20: ffffa0001a826000
> x21:                0
> x22: ffffa0001a826000
> x23:                0
> x24: ffff000000bed000 (queue_ops + 0)
> x25:           98967f
> x26: ffff000000e43ee0 (blocked_lock + 18)
> x27:                0
> x28:              114
> x29: ffff00013efa9f40
>  sp: ffff00013efa9f40
>  lr: ffff0000004b9028 (thread_lock_flags_ + c0)
> elr: ffff0000004b9028 (thread_lock_flags_ + c0)
> spsr:              2c5
> far:         deadc178
> esr:         96000004
> timeout stopping cpus
> panic: data abort in critical section or under mutex
> cpuid = 5
> time = 1637492224
> KDB: stack backtrace:
> db_trace_self() at db_trace_self_wrapper+0x30
>         pc = 0xffff000000807770  lr = 0xffff00000011d9ec
>         sp = 0xffff00013efa9990  fp = 0xffff00013efa9b90
> 
> db_trace_self_wrapper() at vpanic+0x188
>         pc = 0xffff00000011d9ec  lr = 0xffff0000004e1d10
>         sp = 0xffff00013efa9ba0  fp = 0xffff00013efa9c00
> 
> vpanic() at panic+0x44
>         pc = 0xffff0000004e1d10  lr = 0xffff0000004e1b84
>         sp = 0xffff00013efa9c10  fp = 0xffff00013efa9cc0
> 
> panic() at data_abort+0x290
>         pc = 0xffff0000004e1b84  lr = 0xffff00000082a3c8
>         sp = 0xffff00013efa9cd0  fp = 0xffff00013efa9d50
> 
> data_abort() at handle_el1h_sync+0x78
>         pc = 0xffff00000082a3c8  lr = 0xffff00000080a078
>         sp = 0xffff00013efa9d60  fp = 0xffff00013efa9eb0
> 
> handle_el1h_sync() at thread_lock_flags_+0xbc
>         pc = 0xffff00000080a078  lr = 0xffff0000004b9024
>         sp = 0xffff00013efa9ec0  fp = 0xffff00013efa9f40
> 
> thread_lock_flags_() at thread_lock_flags_+0xbc
>         pc = 0xffff0000004b9024  lr = 0xffff0000004b9024
>         sp = 0xffff00013efa9f50  fp = 0xffff00013efa9f60
> 
> thread_lock_flags_() at sleepq_timeout+0x10
>         pc = 0xffff0000004b9024  lr = 0xffff00000054b2a8
>         sp = 0xffff00013efa9f70  fp = 0xffff00013efa9fb0
> 
> sleepq_timeout() at softclock_call_cc+0x14c
>         pc = 0xffff00000054b2a8  lr = 0xffff000000503134
>         sp = 0xffff00013efa9fc0  fp = 0xffff00013efaa020
> 
> softclock_call_cc() at callout_process+0x17c
>         pc = 0xffff000000503134  lr = 0xffff000000502df0
>         sp = 0xffff00013efaa030  fp = 0xffff00013efaa0a0
> 
> callout_process() at handleevents+0x188
>         pc = 0xffff000000502df0  lr = 0xffff00000045b42c
>         sp = 0xffff00013efaa0b0  fp = 0xffff00013efaa100
> 
> handleevents() at timercb+0x304
>         pc = 0xffff00000045b42c  lr = 0xffff00000045be7c
>         sp = 0xffff00013efaa110  fp = 0xffff00013efaa170
> 
> timercb() at arm_tmr_intr+0x5c
>         pc = 0xffff00000045be7c  lr = 0xffff0000007ff850
>         sp = 0xffff00013efaa180  fp = 0xffff00013efaa1d0
> 
> arm_tmr_intr() at intr_event_handle+0xac
>         pc = 0xffff0000007ff850  lr = 0xffff000000493c54
>         sp = 0xffff00013efaa1e0  fp = 0xffff00013efaa1e0
> 
> intr_event_handle() at intr_isrc_dispatch+0x70
>         pc = 0xffff000000493c54  lr = 0xffff0000007fb238
>         sp = 0xffff00013efaa1f0  fp = 0xffff00013efaa230
> 
> intr_isrc_dispatch() at arm_gic_v3_intr+0x11c
>         pc = 0xffff0000007fb238  lr = 0xffff00000080ff34
>         sp = 0xffff00013efaa240  fp = 0xffff00013efaa250
> 
> arm_gic_v3_intr() at intr_irq_handler+0x7c
>         pc = 0xffff00000080ff34  lr = 0xffff0000007faff0
>         sp = 0xffff00013efaa260  fp = 0xffff00013efaa2b0
> 
> intr_irq_handler() at handle_el1h_irq+0x74
>         pc = 0xffff0000007faff0  lr = 0xffff00000080a140
>         sp = 0xffff00013efaa2c0  fp = 0xffff00013efaa3f0
> 
> handle_el1h_irq() at handle_el1h_sync+0x78
>         pc = 0xffff00000080a140  lr = 0xffff00000080a078
>         sp = 0xffff00013efaa400  fp = 0xffff00013efaa500
> 
> handle_el1h_sync() at handle_el1h_sync+0x78
>         pc = 0xffff00000080a078  lr = 0xffff00000080a078
>         sp = 0xffff00013efaa510  fp = 0xffff00013efaa660
> 
> handle_el1h_sync() at sched_switch+0x6a8
>         pc = 0xffff00000080a078  lr = 0xffff0000005197fc
>         sp = 0xffff00013efaa670  fp = 0xffff00013efaa6f0
> 
> sched_switch() at sched_switch+0x6a8
>         pc = 0xffff0000005197fc  lr = 0xffff0000005197fc
>         sp = 0xffff00013efaa700  fp = 0xffff00013efaa790
> 
> sched_switch() at mi_switch+0xf4
>         pc = 0xffff0000005197fc  lr = 0xffff0000004f03a0
>         sp = 0xffff00013efaa7a0  fp = 0xffff00013efaa7f0
> 
> mi_switch() at sleepq_timedwait+0x28
>         pc = 0xffff0000004f03a0  lr = 0xffff00000054bd0c
>         sp = 0xffff00013efaa800  fp = 0xffff00013efaa830
> 
> sleepq_timedwait() at _cv_timedwait_sbt+0x110
>         pc = 0xffff00000054bd0c  lr = 0xffff00000045e7b0
>         sp = 0xffff00013efaa840  fp = 0xffff00013efaa850
> 
> _cv_timedwait_sbt() at dbuf_evict_thread+0x410
>         pc = 0xffff00000045e7b0  lr = 0xffff0000013ca59c
>         sp = 0xffff00013efaa860  fp = 0xffff00013efaa8f0
> 
> dbuf_evict_thread() at fork_exit+0x94
>         pc = 0xffff0000013ca59c  lr = 0xffff00000048fbf0
>         sp = 0xffff00013efaa900  fp = 0xffff00013efaa950
> 
> fork_exit() at fork_trampoline+0x10
>         pc = 0xffff00000048fbf0  lr = 0xffff000000828ed8
>         sp = 0xffff00013efaa960  fp = 0x0000000000000000
> 
> KDB: enter: panic
> [ thread pid 26 tid 100194 ]
> Stopped at      kdb_enter+0x48: undefined       f906411f
> 
> For reference:
> 
> # uname -apKU
> FreeBSD CA72_16Gp_ZFS 13.0-STABLE FreeBSD 13.0-STABLE #13 stable/13-n248062-109330155000-dirty: Sat Nov 13 23:55:14 PST 2021     root@CA72_16Gp_ZFS:/usr/obj/BUILDs/13S-CA72-nodbg-clang/usr/13S-src/arm64.aarch64/sys/GENERIC-NODBG-CA72  arm64 aarch64 1300520 1300520
> 
> It is a root-on-ZFS context on Optane media in the PCie slot.
> 
> I've no clue if this will repeat. I've never gotten
> this before.


The reboot attempt got the following, involving
zthr_procedure instead of dbuf_evict_thread .

Starting file system checks:
/dev/gpt/CA72opt0EFI: FILESYSTEM CLEAN; SKIPPING CHECKS
  x0: ffff000000e43ec8 (blocked_lock + 0)
  x1: ffff00013ef9ff90
  x2: ffff00000090e39a (cam_status_table + 1d132)
  x3:         deadc0d8
  x4:                0
  x5: ffff00000082a138 (data_abort + 0)
  x6:                9
  x7:              601
  x8: ffff000000e43ec8 (blocked_lock + 0)
  x9:         deadc0de
 x10:                0
 x11:          3938700
 x12:                1
 x13:             8000
 x14:              1ee
 x15:             81cd
 x16:         425b9080
 x17:             8000
 x18: ffff00013ef9ff80
 x19: ffff000000e43ec8 (blocked_lock + 0)
 x20: ffffa0000c011000
 x21:                0
 x22: ffffa0000c011000
 x23:                0
 x24: ffff000000bed000 (queue_ops + 0)
 x25:           98967f
 x26: ffff000000e43ee0 (blocked_lock + 18)
 x27:                0
 x28:              114
 x29: ffff00013ef9ff80
  sp: ffff00013ef9ff80
  lr: ffff0000004b9028 (thread_lock_flags_ + c0)
 elr: ffff0000004b9028 (thread_lock_flags_ + c0)
spsr:              2c5
 far:         deadc178
 esr:         96000004
timeout stopping cpus
panic: data abort in critical section or under mutex
cpuid = 9
time = 1637492224
KDB: stack backtrace:
db_trace_self() at db_trace_self_wrapper+0x30
         pc = 0xffff000000807770  lr = 0xffff00000011d9ec
         sp = 0xffff00013ef9f9d0  fp = 0xffff00013ef9fbd0

db_trace_self_wrapper() at vpanic+0x188
         pc = 0xffff00000011d9ec  lr = 0xffff0000004e1d10
         sp = 0xffff00013ef9fbe0  fp = 0xffff00013ef9fc40

vpanic() at panic+0x44
         pc = 0xffff0000004e1d10  lr = 0xffff0000004e1b84
         sp = 0xffff00013ef9fc50  fp = 0xffff00013ef9fd00

panic() at data_abort+0x290
         pc = 0xffff0000004e1b84  lr = 0xffff00000082a3c8
         sp = 0xffff00013ef9fd10  fp = 0xffff00013ef9fd90

data_abort() at handle_el1h_sync+0x78
         pc = 0xffff00000082a3c8  lr = 0xffff00000080a078
         sp = 0xffff00013ef9fda0  fp = 0xffff00013ef9fef0

handle_el1h_sync() at thread_lock_flags_+0xbc
         pc = 0xffff00000080a078  lr = 0xffff0000004b9024
         sp = 0xffff00013ef9ff00  fp = 0xffff00013ef9ff80

thread_lock_flags_() at thread_lock_flags_+0xbc
         pc = 0xffff0000004b9024  lr = 0xffff0000004b9024
         sp = 0xffff00013ef9ff90  fp = 0xffff00013ef9ffa0

thread_lock_flags_() at sleepq_timeout+0x10
         pc = 0xffff0000004b9024  lr = 0xffff00000054b2a8
         sp = 0xffff00013ef9ffb0  fp = 0xffff00013ef9fff0

sleepq_timeout() at softclock_call_cc+0x14c
         pc = 0xffff00000054b2a8  lr = 0xffff000000503134
         sp = 0xffff00013efa0000  fp = 0xffff00013efa0060

softclock_call_cc() at callout_process+0x17c
         pc = 0xffff000000503134  lr = 0xffff000000502df0
         sp = 0xffff00013efa0070  fp = 0xffff00013efa00e0

callout_process() at handleevents+0x188
         pc = 0xffff000000502df0  lr = 0xffff00000045b42c
         sp = 0xffff00013efa00f0  fp = 0xffff00013efa0140

handleevents() at timercb+0x304
         pc = 0xffff00000045b42c  lr = 0xffff00000045be7c
         sp = 0xffff00013efa0150  fp = 0xffff00013efa01b0

timercb() at arm_tmr_intr+0x5c
         pc = 0xffff00000045be7c  lr = 0xffff0000007ff850
         sp = 0xffff00013efa01c0  fp = 0xffff00013efa0210

arm_tmr_intr() at intr_event_handle+0xac
         pc = 0xffff0000007ff850  lr = 0xffff000000493c54
         sp = 0xffff00013efa0220  fp = 0xffff00013efa0220

intr_event_handle() at intr_isrc_dispatch+0x70
         pc = 0xffff000000493c54  lr = 0xffff0000007fb238
         sp = 0xffff00013efa0230  fp = 0xffff00013efa0270

intr_isrc_dispatch() at arm_gic_v3_intr+0x11c
         pc = 0xffff0000007fb238  lr = 0xffff00000080ff34
         sp = 0xffff00013efa0280  fp = 0xffff00013efa0290

arm_gic_v3_intr() at intr_irq_handler+0x7c
         pc = 0xffff00000080ff34  lr = 0xffff0000007faff0
         sp = 0xffff00013efa02a0  fp = 0xffff00013efa02f0

intr_irq_handler() at handle_el1h_irq+0x74
         pc = 0xffff0000007faff0  lr = 0xffff00000080a140
         sp = 0xffff00013efa0300  fp = 0xffff00013efa0430

handle_el1h_irq() at handle_el1h_sync+0x78
         pc = 0xffff00000080a140  lr = 0xffff00000080a078
         sp = 0xffff00013efa0440  fp = 0xffff00013efa0540

handle_el1h_sync() at handle_el1h_sync+0x78
         pc = 0xffff00000080a078  lr = 0xffff00000080a078
         sp = 0xffff00013efa0550  fp = 0xffff00013efa06a0

handle_el1h_sync() at sched_switch+0x6a8
         pc = 0xffff00000080a078  lr = 0xffff0000005197fc
         sp = 0xffff00013efa06b0  fp = 0xffff00013efa0730

sched_switch() at sched_switch+0x6a8
         pc = 0xffff0000005197fc  lr = 0xffff0000005197fc
         sp = 0xffff00013efa0740  fp = 0xffff00013efa07d0

sched_switch() at mi_switch+0xf4
         pc = 0xffff0000005197fc  lr = 0xffff0000004f03a0
         sp = 0xffff00013efa07e0  fp = 0xffff00013efa0830

mi_switch() at sleepq_timedwait+0x28
         pc = 0xffff0000004f03a0  lr = 0xffff00000054bd0c
         sp = 0xffff00013efa0840  fp = 0xffff00013efa0870

sleepq_timedwait() at _cv_timedwait_sbt+0x110
         pc = 0xffff00000054bd0c  lr = 0xffff00000045e7b0
         sp = 0xffff00013efa0880  fp = 0xffff00013efa0890

_cv_timedwait_sbt() at zthr_procedure+0x20c
         pc = 0xffff00000045e7b0  lr = 0xffff0000014e2fb0
         sp = 0xffff00013efa08a0  fp = 0xffff00013efa08f0

zthr_procedure() at fork_exit+0x94
         pc = 0xffff0000014e2fb0  lr = 0xffff00000048fbf0
         sp = 0xffff00013efa0900  fp = 0xffff00013efa0950

fork_exit() at fork_trampoline+0x10
         pc = 0xffff00000048fbf0  lr = 0xffff000000828ed8
         sp = 0xffff00013efa0960  fp = 0x0000000000000000

KDB: enter: panic
[ thread pid 26 tid 100192 ]
Stopped at      kdb_enter+0x48: undefined       f906411f


Note:

All this started after a stress based I/O hangup test
that required a forced reboot.


===
Mark Millard
marklmi at yahoo.com
( dsl-only.net went
away in early 2018-Mar)