[Bug 229614] ZFS lockup in zil_commit_impl
bugzilla-noreply at freebsd.org
bugzilla-noreply at freebsd.org
Mon Nov 12 08:42:27 UTC 2018
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=229614
--- Comment #41 from Andreas Sommer <andreas.sommer87 at googlemail.com> ---
(In reply to Michael Gmelin from comment #40)
Here are the things I tried to capture the latest issue I found. As Michael
noted, this happened with the patched kernel (on 11.2), after days of stomping
the system with I/O-heavy infinite background loops.
Interesting fact: `top` does not update anymore once per second, but can be
updated with the Space key as usual. Also, my `screen` processes seem gone –
the ones I used to run the background loops.
Please note that we now have two processes in zil_commit_impl, and possibly
those two are interlocked?! I'm keeping the system in this state for now, just
in case someone has an idea which debug information we should collect apart
from the below.
-----------------
root at asommer-devpm-ci:/usr/home/asommer # top
last pid: 58504; load averages: 2.42, 3.37, 3.19
up 3+10:22:17 02:49:54
61 processes: 1 running, 59 sleeping, 1 zombie
CPU: % user, % nice, % system, % interrupt, % idle
Mem: 69M Active, 1650M Inact, 10G Wired, 3544M Free
ARC: 4243M Total, 1624M MFU, 1829M MRU, 2539K Anon, 111M Header, 775M Other
2517M Compressed, 4824M Uncompressed, 1.92:1 Ratio
Swap: 1024M Total, 1024M Free
PID USERNAME THR PRI NICE SIZE RES STATE C TIME WCPU
COMMAND
1326 1001 3 44 0 177M 169M select 3 197:02 53.96%
python3.6 # <------------
1777 1001 1 52 0 38216K 33724K select 0 1:09 0.00%
python3.6
23006 root 1 20 0 10352K 6592K select 0 0:55 0.00% ssh
833 root 1 20 0 6464K 2396K nanslp 0 0:23 0.00% cron
1220 root 1 20 0 6336K 2016K nanslp 1 0:10 0.00% svscan
1047 root 1 20 0 6336K 2016K nanslp 3 0:10 0.00% svscan
1066 root 1 20 0 10444K 6260K select 1 0:06 0.00%
sendmail
1240 root 1 20 0 6464K 2404K nanslp 3 0:04 0.00% cron
816 root 1 20 0 10452K 6300K select 0 0:04 0.00%
sendmail
1233 root 1 20 0 10444K 6264K select 0 0:04 0.00%
sendmail
1073 root 1 20 0 6464K 2400K nanslp 2 0:03 0.00% cron
513 root 1 20 0 9180K 5120K select 2 0:02 0.00% devd
596 root 1 20 0 6412K 2420K select 0 0:01 0.00%
syslogd
999 root 1 20 0 6420K 2464K select 0 0:01 0.00%
syslogd
1167 root 1 20 0 6420K 2524K select 1 0:01 0.00%
syslogd
1236 smmsp 1 20 0 10444K 5960K pause 3 0:00 0.00%
sendmail
778 www 1 20 0 13332K 8668K kqread 0 0:00 0.00% nginx
461 root 1 42 0 6544K 2380K select 0 0:00 0.00%
dhclient
23003 root 1 44 0 7596K 3616K select 0 0:00 0.00% sudo
1069 smmsp 1 20 0 10444K 5556K pause 2 0:00 0.00%
sendmail
819 smmsp 1 20 0 10452K 5912K pause 2 0:00 0.00%
sendmail
885 root 1 52 0 6408K 2124K ttyin 1 0:00 0.00% getty
507 _dhcp 1 20 0 6544K 2512K select 2 0:00 0.00%
dhclient
58428 asommer 1 23 0 13160K 8012K select 0 0:00 0.00% sshd
58433 root 1 52 0 7412K 4048K pause 3 0:00 0.00% csh
58494 root 1 23 0 13160K 7956K select 3 0:00 0.00% sshd
58426 root 1 23 0 13160K 7956K select 0 0:00 0.00% sshd
1224 root 1 20 0 4244K 1976K select 0 0:00 0.00%
supervise
1052 root 1 20 0 4244K 1976K select 0 0:00 0.00%
supervise
58501 root 1 47 0 7412K 4068K pause 0 0:00 0.00% csh
1225 root 1 20 0 4244K 1976K select 2 0:00 0.00%
supervise
1053 root 1 20 0 4244K 1976K select 3 0:00 0.00%
supervise
58482 root 1 52 0 7064K 2868K wait 2 0:00 0.00% sh
58496 asommer 1 23 0 13160K 7968K select 2 0:00 0.00% sshd
58378 asommer 1 21 0 0K 16K vmo_de 2 0:00 0.00% sshd
797 root 1 21 0 12848K 7108K select 0 0:00 0.00% sshd
1776 root 1 20 0 7584K 3624K select 2 0:00 0.00% sudo
58497 asommer 1 26 0 7796K 3784K wait 2 0:00 0.00% bash
58499 root 1 26 0 7596K 3616K select 1 0:00 0.00% sudo
58431 root 1 26 0 7596K 3616K select 3 0:00 0.00% sudo
58429 asommer 1 26 0 7796K 3784K wait 0 0:00 0.00% bash
58381 asommer 1 20 0 4232K 1928K zcw->z 2 0:00 0.00% sync #
<------------
1230 1001 1 20 0 4260K 1988K piperd 3 0:00 0.00%
multilog
58432 root 1 26 0 6932K 2900K wait 2 0:00 0.00% su
23002 asommer 1 20 0 7600K 4132K select 1 0:00 0.00% screen
58500 root 1 26 0 6932K 2900K wait 0 0:00 0.00% su
1059 1001 1 52 0 4260K 1984K piperd 1 0:00 0.00%
multilog
58504 root 1 46 0 7916K 3140K CPU3 3 0:00 0.00% top
882 root 1 52 0 6408K 2128K ttyin 0 0:00 0.00% getty
877 root 1 52 0 6408K 2128K ttyin 2 0:00 0.00% getty
879 root 1 52 0 6408K 2128K ttyin 2 0:00 0.00% getty
878 root 1 52 0 6408K 2128K ttyin 3 0:00 0.00% getty
881 root 1 52 0 6408K 2128K ttyin 3 0:00 0.00% getty
884 root 1 52 0 6408K 2128K ttyin 0 0:00 0.00% getty
880 root 1 52 0 6408K 2128K ttyin 1 0:00 0.00% getty
883 root 1 52 0 6408K 2128K ttyin 3 0:00 0.00% getty
1048 root 1 52 0 4232K 1960K piperd 1 0:00 0.00%
readproctitle
root at asommer-devpm-ci:/usr/home/asommer # procstat -kk 1326
PID TID COMM TDNAME KSTACK
1326 100422 python3.6 - mi_switch+0xe6
sleepq_catch_signals+0x40c sleepq_timedwait_sig+0x14
_cv_timedwait_sig_sbt+0x18c seltdwait+0x7b kern_poll+0x3fd sys_poll+0x50
amd64_syscall+0xa38 fast_syscall_common+0x101
1326 100433 python3.6 - mi_switch+0xe6
sleepq_catch_signals+0x40c sleepq_wait_sig+0xf _sleep+0x231 umtxq_sleep+0x143
do_sem2_wait+0x68a __umtx_op_sem2_wait+0x4b amd64_syscall+0xa38
fast_syscall_common+0x101
1326 100436 python3.6 - mi_switch+0xe6
sleepq_timedwait+0x2f _cv_timedwait_sbt+0x18c zil_commit_impl+0xcd3
zfs_freebsd_putpages+0x635 VOP_PUTPAGES_APV+0x82 vnode_pager_putpages+0x8e
vm_pageout_flush+0xea vm_object_page_collect_flush+0x213
vm_object_page_clean+0x146 vm_object_terminate+0x93 zfs_freebsd_reclaim+0x1e
VOP_RECLAIM_APV+0x82 vgonel+0x208 vrecycle+0x4a zfs_freebsd_inactive+0xd
VOP_INACTIVE_APV+0x82 vinactive+0xfc
root at asommer-devpm-ci:/usr/home/asommer # procstat 58381
PID PPID PGID SID TSID THR LOGIN WCHAN EMUL COMM
58381 1 58381 58379 58379 1 asommer zcw->zcw FreeBSD ELF64 sync
root at asommer-devpm-ci:/usr/home/asommer # procstat -f 58381
PID COMM FD T V FLAGS REF OFFSET PRO NAME
58381 sync text v r r------- - - - /bin/sync
58381 sync cwd v d r------- - - - /usr/home/asommer
58381 sync root v d r------- - - - /
58381 sync 0 v x rw------ 3 40 - -
58381 sync 1 v x rw------ 3 40 - -
58381 sync 2 v x rw------ 3 40 - -
root at asommer-devpm-ci:/usr/home/asommer # procstat -r 58381
PID COMM RESOURCE VALUE
58381 sync user time 00:00:00.000000
58381 sync system time 00:00:00.003656
58381 sync maximum RSS 0.0 KB
58381 sync integral shared memory 0.0 KB
58381 sync integral unshared data 0.0 KB
58381 sync integral unshared stack 0.0 KB
58381 sync page reclaims 89
58381 sync page faults 0
58381 sync swaps 0
58381 sync block reads 3
58381 sync block writes 0
58381 sync messages sent 0
58381 sync messages received 0
58381 sync signals received 0
58381 sync voluntary context switches 6
58381 sync involuntary context switches 0
root at asommer-devpm-ci:/usr/home/asommer # procstat -t 58381
PID TID COMM TDNAME CPU PRI STATE WCHAN
58381 101315 sync - -1 120 sleep zcw->zcw
root at asommer-devpm-ci:/usr/home/asommer # procstat -k 58381
PID TID COMM TDNAME KSTACK
58381 101315 sync - mi_switch sleepq_timedwait
_cv_timedwait_sbt zil_commit_impl zfs_sync sys_sync amd64_syscall
fast_syscall_common
root at asommer-devpm-ci:/usr/home/asommer # procstat -kk 58381
PID TID COMM TDNAME KSTACK
58381 101315 sync - mi_switch+0xe6
sleepq_timedwait+0x2f _cv_timedwait_sbt+0x18c zil_commit_impl+0xcd3
zfs_sync+0xa6 sys_sync+0x116 amd64_syscall+0xa38 fast_syscall_common+0x101
--
You are receiving this mail because:
You are on the CC list for the bug.
More information about the freebsd-fs
mailing list