Re: FYI: aarch64 boot (HoneyComb): example crash during system checks (power-off/power-on form of reboot still fails)
Date: Sun, 21 Nov 2021 20:09:35 UTC
On 2021-Nov-21, at 11:36, Mark Millard <marklmi@yahoo.com> wrote: > On 2021-Nov-21, at 11:26, Mark Millard <marklmi@yahoo.com> wrote: > >> Starting file system checks: >> /dev/gpt/CA72opt0EFI: 41 files, 242 MiB free (15469 clusters) >> FIXED >> /d x0: ffff000000e43ec8 (blocked_lock + 0) >> x1: ffff00013efa9f50 >> x2: ffff00000090e39a (cam_status_table + 1d132) >> x3: deadc0d8 >> x4: 0 >> x5: ffff00000082a138 (data_abort + 0) >> x6: 5 >> x7: 601 >> x8: ffff000000e43ec8 (blocked_lock + 0) >> x9: deadc0de >> x10: 0 >> x11: 3938700 >> x12: 0 >> x13: 8000 >> x14: 1de >> x15: 81ce >> x16: 425b9080 >> x17: 8000 >> x18: ffff00013efa9f40 >> x19: ffff000000e43ec8 (blocked_lock + 0) >> x20: ffffa0001a826000 >> x21: 0 >> x22: ffffa0001a826000 >> x23: 0 >> x24: ffff000000bed000 (queue_ops + 0) >> x25: 98967f >> x26: ffff000000e43ee0 (blocked_lock + 18) >> x27: 0 >> x28: 114 >> x29: ffff00013efa9f40 >> sp: ffff00013efa9f40 >> lr: ffff0000004b9028 (thread_lock_flags_ + c0) >> elr: ffff0000004b9028 (thread_lock_flags_ + c0) >> spsr: 2c5 >> far: deadc178 >> esr: 96000004 >> timeout stopping cpus >> panic: data abort in critical section or under mutex >> cpuid = 5 >> time = 1637492224 >> KDB: stack backtrace: >> db_trace_self() at db_trace_self_wrapper+0x30 >> pc = 0xffff000000807770 lr = 0xffff00000011d9ec >> sp = 0xffff00013efa9990 fp = 0xffff00013efa9b90 >> >> db_trace_self_wrapper() at vpanic+0x188 >> pc = 0xffff00000011d9ec lr = 0xffff0000004e1d10 >> sp = 0xffff00013efa9ba0 fp = 0xffff00013efa9c00 >> >> vpanic() at panic+0x44 >> pc = 0xffff0000004e1d10 lr = 0xffff0000004e1b84 >> sp = 0xffff00013efa9c10 fp = 0xffff00013efa9cc0 >> >> panic() at data_abort+0x290 >> pc = 0xffff0000004e1b84 lr = 0xffff00000082a3c8 >> sp = 0xffff00013efa9cd0 fp = 0xffff00013efa9d50 >> >> data_abort() at handle_el1h_sync+0x78 >> pc = 0xffff00000082a3c8 lr = 0xffff00000080a078 >> sp = 0xffff00013efa9d60 fp = 0xffff00013efa9eb0 >> >> handle_el1h_sync() at thread_lock_flags_+0xbc >> pc = 0xffff00000080a078 lr = 0xffff0000004b9024 >> sp = 0xffff00013efa9ec0 fp = 0xffff00013efa9f40 >> >> thread_lock_flags_() at thread_lock_flags_+0xbc >> pc = 0xffff0000004b9024 lr = 0xffff0000004b9024 >> sp = 0xffff00013efa9f50 fp = 0xffff00013efa9f60 >> >> thread_lock_flags_() at sleepq_timeout+0x10 >> pc = 0xffff0000004b9024 lr = 0xffff00000054b2a8 >> sp = 0xffff00013efa9f70 fp = 0xffff00013efa9fb0 >> >> sleepq_timeout() at softclock_call_cc+0x14c >> pc = 0xffff00000054b2a8 lr = 0xffff000000503134 >> sp = 0xffff00013efa9fc0 fp = 0xffff00013efaa020 >> >> softclock_call_cc() at callout_process+0x17c >> pc = 0xffff000000503134 lr = 0xffff000000502df0 >> sp = 0xffff00013efaa030 fp = 0xffff00013efaa0a0 >> >> callout_process() at handleevents+0x188 >> pc = 0xffff000000502df0 lr = 0xffff00000045b42c >> sp = 0xffff00013efaa0b0 fp = 0xffff00013efaa100 >> >> handleevents() at timercb+0x304 >> pc = 0xffff00000045b42c lr = 0xffff00000045be7c >> sp = 0xffff00013efaa110 fp = 0xffff00013efaa170 >> >> timercb() at arm_tmr_intr+0x5c >> pc = 0xffff00000045be7c lr = 0xffff0000007ff850 >> sp = 0xffff00013efaa180 fp = 0xffff00013efaa1d0 >> >> arm_tmr_intr() at intr_event_handle+0xac >> pc = 0xffff0000007ff850 lr = 0xffff000000493c54 >> sp = 0xffff00013efaa1e0 fp = 0xffff00013efaa1e0 >> >> intr_event_handle() at intr_isrc_dispatch+0x70 >> pc = 0xffff000000493c54 lr = 0xffff0000007fb238 >> sp = 0xffff00013efaa1f0 fp = 0xffff00013efaa230 >> >> intr_isrc_dispatch() at arm_gic_v3_intr+0x11c >> pc = 0xffff0000007fb238 lr = 0xffff00000080ff34 >> sp = 0xffff00013efaa240 fp = 0xffff00013efaa250 >> >> arm_gic_v3_intr() at intr_irq_handler+0x7c >> pc = 0xffff00000080ff34 lr = 0xffff0000007faff0 >> sp = 0xffff00013efaa260 fp = 0xffff00013efaa2b0 >> >> intr_irq_handler() at handle_el1h_irq+0x74 >> pc = 0xffff0000007faff0 lr = 0xffff00000080a140 >> sp = 0xffff00013efaa2c0 fp = 0xffff00013efaa3f0 >> >> handle_el1h_irq() at handle_el1h_sync+0x78 >> pc = 0xffff00000080a140 lr = 0xffff00000080a078 >> sp = 0xffff00013efaa400 fp = 0xffff00013efaa500 >> >> handle_el1h_sync() at handle_el1h_sync+0x78 >> pc = 0xffff00000080a078 lr = 0xffff00000080a078 >> sp = 0xffff00013efaa510 fp = 0xffff00013efaa660 >> >> handle_el1h_sync() at sched_switch+0x6a8 >> pc = 0xffff00000080a078 lr = 0xffff0000005197fc >> sp = 0xffff00013efaa670 fp = 0xffff00013efaa6f0 >> >> sched_switch() at sched_switch+0x6a8 >> pc = 0xffff0000005197fc lr = 0xffff0000005197fc >> sp = 0xffff00013efaa700 fp = 0xffff00013efaa790 >> >> sched_switch() at mi_switch+0xf4 >> pc = 0xffff0000005197fc lr = 0xffff0000004f03a0 >> sp = 0xffff00013efaa7a0 fp = 0xffff00013efaa7f0 >> >> mi_switch() at sleepq_timedwait+0x28 >> pc = 0xffff0000004f03a0 lr = 0xffff00000054bd0c >> sp = 0xffff00013efaa800 fp = 0xffff00013efaa830 >> >> sleepq_timedwait() at _cv_timedwait_sbt+0x110 >> pc = 0xffff00000054bd0c lr = 0xffff00000045e7b0 >> sp = 0xffff00013efaa840 fp = 0xffff00013efaa850 >> >> _cv_timedwait_sbt() at dbuf_evict_thread+0x410 >> pc = 0xffff00000045e7b0 lr = 0xffff0000013ca59c >> sp = 0xffff00013efaa860 fp = 0xffff00013efaa8f0 >> >> dbuf_evict_thread() at fork_exit+0x94 >> pc = 0xffff0000013ca59c lr = 0xffff00000048fbf0 >> sp = 0xffff00013efaa900 fp = 0xffff00013efaa950 >> >> fork_exit() at fork_trampoline+0x10 >> pc = 0xffff00000048fbf0 lr = 0xffff000000828ed8 >> sp = 0xffff00013efaa960 fp = 0x0000000000000000 >> >> KDB: enter: panic >> [ thread pid 26 tid 100194 ] >> Stopped at kdb_enter+0x48: undefined f906411f >> >> For reference: >> >> # uname -apKU >> FreeBSD CA72_16Gp_ZFS 13.0-STABLE FreeBSD 13.0-STABLE #13 stable/13-n248062-109330155000-dirty: Sat Nov 13 23:55:14 PST 2021 root@CA72_16Gp_ZFS:/usr/obj/BUILDs/13S-CA72-nodbg-clang/usr/13S-src/arm64.aarch64/sys/GENERIC-NODBG-CA72 arm64 aarch64 1300520 1300520 >> >> It is a root-on-ZFS context on Optane media in the PCie slot. >> >> I've no clue if this will repeat. I've never gotten >> this before. > > > The reboot attempt got the following, involving > zthr_procedure instead of dbuf_evict_thread . > > Starting file system checks: > /dev/gpt/CA72opt0EFI: FILESYSTEM CLEAN; SKIPPING CHECKS > x0: ffff000000e43ec8 (blocked_lock + 0) > x1: ffff00013ef9ff90 > x2: ffff00000090e39a (cam_status_table + 1d132) > x3: deadc0d8 > x4: 0 > x5: ffff00000082a138 (data_abort + 0) > x6: 9 > x7: 601 > x8: ffff000000e43ec8 (blocked_lock + 0) > x9: deadc0de > x10: 0 > x11: 3938700 > x12: 1 > x13: 8000 > x14: 1ee > x15: 81cd > x16: 425b9080 > x17: 8000 > x18: ffff00013ef9ff80 > x19: ffff000000e43ec8 (blocked_lock + 0) > x20: ffffa0000c011000 > x21: 0 > x22: ffffa0000c011000 > x23: 0 > x24: ffff000000bed000 (queue_ops + 0) > x25: 98967f > x26: ffff000000e43ee0 (blocked_lock + 18) > x27: 0 > x28: 114 > x29: ffff00013ef9ff80 > sp: ffff00013ef9ff80 > lr: ffff0000004b9028 (thread_lock_flags_ + c0) > elr: ffff0000004b9028 (thread_lock_flags_ + c0) > spsr: 2c5 > far: deadc178 > esr: 96000004 > timeout stopping cpus > panic: data abort in critical section or under mutex > cpuid = 9 > time = 1637492224 > KDB: stack backtrace: > db_trace_self() at db_trace_self_wrapper+0x30 > pc = 0xffff000000807770 lr = 0xffff00000011d9ec > sp = 0xffff00013ef9f9d0 fp = 0xffff00013ef9fbd0 > > db_trace_self_wrapper() at vpanic+0x188 > pc = 0xffff00000011d9ec lr = 0xffff0000004e1d10 > sp = 0xffff00013ef9fbe0 fp = 0xffff00013ef9fc40 > > vpanic() at panic+0x44 > pc = 0xffff0000004e1d10 lr = 0xffff0000004e1b84 > sp = 0xffff00013ef9fc50 fp = 0xffff00013ef9fd00 > > panic() at data_abort+0x290 > pc = 0xffff0000004e1b84 lr = 0xffff00000082a3c8 > sp = 0xffff00013ef9fd10 fp = 0xffff00013ef9fd90 > > data_abort() at handle_el1h_sync+0x78 > pc = 0xffff00000082a3c8 lr = 0xffff00000080a078 > sp = 0xffff00013ef9fda0 fp = 0xffff00013ef9fef0 > > handle_el1h_sync() at thread_lock_flags_+0xbc > pc = 0xffff00000080a078 lr = 0xffff0000004b9024 > sp = 0xffff00013ef9ff00 fp = 0xffff00013ef9ff80 > > thread_lock_flags_() at thread_lock_flags_+0xbc > pc = 0xffff0000004b9024 lr = 0xffff0000004b9024 > sp = 0xffff00013ef9ff90 fp = 0xffff00013ef9ffa0 > > thread_lock_flags_() at sleepq_timeout+0x10 > pc = 0xffff0000004b9024 lr = 0xffff00000054b2a8 > sp = 0xffff00013ef9ffb0 fp = 0xffff00013ef9fff0 > > sleepq_timeout() at softclock_call_cc+0x14c > pc = 0xffff00000054b2a8 lr = 0xffff000000503134 > sp = 0xffff00013efa0000 fp = 0xffff00013efa0060 > > softclock_call_cc() at callout_process+0x17c > pc = 0xffff000000503134 lr = 0xffff000000502df0 > sp = 0xffff00013efa0070 fp = 0xffff00013efa00e0 > > callout_process() at handleevents+0x188 > pc = 0xffff000000502df0 lr = 0xffff00000045b42c > sp = 0xffff00013efa00f0 fp = 0xffff00013efa0140 > > handleevents() at timercb+0x304 > pc = 0xffff00000045b42c lr = 0xffff00000045be7c > sp = 0xffff00013efa0150 fp = 0xffff00013efa01b0 > > timercb() at arm_tmr_intr+0x5c > pc = 0xffff00000045be7c lr = 0xffff0000007ff850 > sp = 0xffff00013efa01c0 fp = 0xffff00013efa0210 > > arm_tmr_intr() at intr_event_handle+0xac > pc = 0xffff0000007ff850 lr = 0xffff000000493c54 > sp = 0xffff00013efa0220 fp = 0xffff00013efa0220 > > intr_event_handle() at intr_isrc_dispatch+0x70 > pc = 0xffff000000493c54 lr = 0xffff0000007fb238 > sp = 0xffff00013efa0230 fp = 0xffff00013efa0270 > > intr_isrc_dispatch() at arm_gic_v3_intr+0x11c > pc = 0xffff0000007fb238 lr = 0xffff00000080ff34 > sp = 0xffff00013efa0280 fp = 0xffff00013efa0290 > > arm_gic_v3_intr() at intr_irq_handler+0x7c > pc = 0xffff00000080ff34 lr = 0xffff0000007faff0 > sp = 0xffff00013efa02a0 fp = 0xffff00013efa02f0 > > intr_irq_handler() at handle_el1h_irq+0x74 > pc = 0xffff0000007faff0 lr = 0xffff00000080a140 > sp = 0xffff00013efa0300 fp = 0xffff00013efa0430 > > handle_el1h_irq() at handle_el1h_sync+0x78 > pc = 0xffff00000080a140 lr = 0xffff00000080a078 > sp = 0xffff00013efa0440 fp = 0xffff00013efa0540 > > handle_el1h_sync() at handle_el1h_sync+0x78 > pc = 0xffff00000080a078 lr = 0xffff00000080a078 > sp = 0xffff00013efa0550 fp = 0xffff00013efa06a0 > > handle_el1h_sync() at sched_switch+0x6a8 > pc = 0xffff00000080a078 lr = 0xffff0000005197fc > sp = 0xffff00013efa06b0 fp = 0xffff00013efa0730 > > sched_switch() at sched_switch+0x6a8 > pc = 0xffff0000005197fc lr = 0xffff0000005197fc > sp = 0xffff00013efa0740 fp = 0xffff00013efa07d0 > > sched_switch() at mi_switch+0xf4 > pc = 0xffff0000005197fc lr = 0xffff0000004f03a0 > sp = 0xffff00013efa07e0 fp = 0xffff00013efa0830 > > mi_switch() at sleepq_timedwait+0x28 > pc = 0xffff0000004f03a0 lr = 0xffff00000054bd0c > sp = 0xffff00013efa0840 fp = 0xffff00013efa0870 > > sleepq_timedwait() at _cv_timedwait_sbt+0x110 > pc = 0xffff00000054bd0c lr = 0xffff00000045e7b0 > sp = 0xffff00013efa0880 fp = 0xffff00013efa0890 > > _cv_timedwait_sbt() at zthr_procedure+0x20c > pc = 0xffff00000045e7b0 lr = 0xffff0000014e2fb0 > sp = 0xffff00013efa08a0 fp = 0xffff00013efa08f0 > > zthr_procedure() at fork_exit+0x94 > pc = 0xffff0000014e2fb0 lr = 0xffff00000048fbf0 > sp = 0xffff00013efa0900 fp = 0xffff00013efa0950 > > fork_exit() at fork_trampoline+0x10 > pc = 0xffff00000048fbf0 lr = 0xffff000000828ed8 > sp = 0xffff00013efa0960 fp = 0x0000000000000000 > > KDB: enter: panic > [ thread pid 26 tid 100192 ] > Stopped at kdb_enter+0x48: undefined f906411f > > > Note: > > All this started after a stress based I/O hangup test > that required a forced reboot. It is a bectl environment for that Optane: # bectl list BE Active Mountpoint Space Created 13S-CA72-nodbg R - 6.38G 2021-09-29 00:57 13_0R-CA72-nodbg N / 3.57G 2021-09-29 00:45 main-CA72-nodbg - - 7.99G 2021-09-29 00:42 13_0R-CA72-nodbg and main-CA72-nodbg boot fine, just 13S-CA72-nodbg has the problem. For reference: => 40 1875384928 nda1 GPT (894G) 40 532480 nda1p1 CA72opt0EFI (260M) 532520 2008 - free - (1.0M) 534528 515899392 nda1p2 CA72opt0SWP (246G) 516433920 20971520 - free - (10G) 537405440 1337979528 nda1p3 CA72opt0ZFS (638G) Another attempt at booting have gotten reports of . . . lots of waiting notices . . . Root mount waiting for: CAM usbus1 Root mount waiting for: CAM usbus1 Root mount waiting for: CAM usbus1 Root mount waiting for: CAM usbus1 Root mount waiting for: CAM usbus1 x0: ffff000000e43ec8 (blocked_lock + 0) x1: ffff00013efa9f60 x2: ffff00000090e39a (cam_status_table + 1d132) x3: deadc0d8 x4: 0 x5: ffff00000082a138 (data_abort + 0) x6: 4 x7: 601 x8: ffff000000e43ec8 (blocked_lock + 0) x9: deadc0de x10: 0 x11: 3938700 x12: 0 x13: 8000 x14: 1dd x15: 81cd x16: 0 x17: 1 x18: ffff00013efa9f50 x19: ffff000000e43ec8 (blocked_lock + 0) x20: ffffa0000c806000 x21: 0 x22: ffffa0000c806000 x23: 0 x24: ffff000000bed000 (queue_ops + 0) x25: 98967f x26: ffff000000e43ee0 (blocked_lock + 18) x27: 0 x28: 114 x29: ffff00013efa9f50 sp: ffff00013efa9f50 lr: ffff0000004b9028 (thread_lock_flags_ + c0) elr: ffff0000004b9028 (thread_lock_flags_ + c0) spin lock 0xffff00004317b100 (sched lock 4) held by 0xffffa0000c806000 (tid 100194) too long spin lock 0xffff00004317b100 (sched lock 4) held by 0xffffa0000c806000 (tid 100194) too long spin lock 0xffff00004317b100 (sched lock 4) held by 0xffffa0000c806000 (tid 100194) too long spsr: 2c5 far: deadc178 esr: 96000004 timeout stopping cpus panic: spin lock held too long cpuid = 9 time = 56 KDB: stack backtrace: db_trace_self() at db_trace_self_wrapper+0x30 pc = 0xffff000000807770 lr = 0xffff00000011d9ec sp = 0xffff000156eea430 fp = 0xffff000156eea630 db_trace_self_wrapper() at vpanic+0x188 pc = 0xffff00000011d9ec lr = 0xffff0000004e1d10 sp = 0xffff000156eea640 fp = 0xffff000156eea6a0 vpanic() at panic+0x44 pc = 0xffff0000004e1d10 lr = 0xffff0000004e1b84 sp = 0xffff000156eea6b0 fp = 0xffff000156eea760 panic() at _mtx_lock_indefinite_check+0x8c pc = 0xffff0000004e1b84 lr = 0xffff0000004b8ea4 sp = 0xffff000156eea770 fp = 0xffff000156eea770 _mtx_lock_indefinite_check() at _mtx_lock_spin_cookie+0xb8 pc = 0xffff0000004b8ea4 lr = 0xffff0000004b8a0c sp = 0xffff000156eea780 fp = 0xffff000156eea790 _mtx_lock_spin_cookie() at sched_add+0x400 pc = 0xffff0000004b8a0c lr = 0xffff00000051ac94 sp = 0xffff000156eea7a0 fp = 0xffff000156eea7f0 sched_add() at kthread_add+0x238 pc = 0xffff00000051ac94 lr = 0xffff0000004a01e0 sp = 0xffff000156eea800 fp = 0xffff000156eea880 kthread_add() at vm_pageout+0x1b8 pc = 0xffff0000004a01e0 lr = 0xffff0000007e5a18 sp = 0xffff000156eea890 fp = 0xffff000156eea8f0 . . . Garbled . . . sp = 0xffff00013efa9bb0 fp = 0xffff00013efa9c10 vpanic() at panic+0x44 pc = 0xffff0000004e1d10 lr = 0xffff0000004e1b84 sp = 0xffff00013efa9c20 fp = 0xffff00013efa9cd0 panic() at data_abort+0x290 pc = 0xffff0000004e1b84 lr = 0xffff00000082a3c8 sp = 0xffff00013efa9ce0 fp = 0xffff00013efa9d60 data_abort() at handle_el1h_sync+0x78 pc = 0xffff00000082a3c8 lr = 0xffff00000080a078 sp = 0xffff00013efa9d70 fp = 0xffff00013efa9ec0 handle_el1h_sync() at thread_lock_flags_+0xbc pc = 0xffff00000080a078 lr = 0xffff0000004b9024 sp = 0xffff00013efa9ed0 fp = 0xffff00013efa9f50 thread_lock_flags_() at thread_lock_flags_+0xbc pc = 0xffff0000004b9024 lr = 0xffff0000004b9024 . . . Garbled . . . But after that the next attempt booted just fine. zpool scrub got: # zpool status pool: zopt0 state: ONLINE scan: scrub repaired 0B in 00:02:05 with 0 errors on Sun Nov 21 12:00:09 2021 config: NAME STATE READ WRITE CKSUM zopt0 ONLINE 0 0 0 nda1p3 ONLINE 0 0 0 errors: No known data errors Further reboots into 13S-CA72-nodbg have worked fine. For reference: The HoneyComb is a 16 Cortex-A72 system. I've no clue if releng/13 (-p5) or main could be induced to get similar behavior to what 13S-CA72-nodbg showed for the Optane media. === Mark Millard marklmi at yahoo.com ( dsl-only.net went away in early 2018-Mar)