Re: ZFS deadlock in 14

From: Cy Schubert <Cy.Schubert_at_cschubert.com>
Date: Thu, 10 Aug 2023 23:33:12 UTC
I haven't experienced any problems (yet) either.


-- 
Cheers,
Cy Schubert <Cy.Schubert@cschubert.com>
FreeBSD UNIX:  <cy@FreeBSD.org>   Web:  https://FreeBSD.org
NTP:           <cy@nwtime.org>    Web:  https://nwtime.org

			e^(i*pi)+1=0


In message <CAK7dMtDJQtaai3_6VjEkwVwW5JN6e8v=kKTOPffp371xb=ORUg@mail.gmail.com>, Kevin Bowling writes:
> The two MFVs on head have improved/fixed stability with poudriere for
> me on 48-core bare metal.
>
> On Thu, Aug 10, 2023 at 6:37 AM Cy Schubert <Cy.Schubert@cschubert.com> wrote:
> >
> > In message <CAK7dMtDJeuf8rjWbsNEZABUfeqpjUyCHzuOL9AAhKk93sy+PKg@mail.gmail.com>, Kevin Bowling writes:
> > > Possibly https://github.com/openzfs/zfs/commit/2cb992a99ccadb78d97049b40bd442eb4fdc549d
> > >
> > > On Tue, Aug 8, 2023 at 10:08 AM Dag-Erling Smørgrav <des@freebsd.org> wrote:
> > > >
> > > > At some point between 42d088299c (4 May) and f0c9703301 (26 June), a
> > > > deadlock was introduced in ZFS.  It is still present as of 9c2823bae9
> > > > (4 August) and is 100% reproducible just by starting poudriere bulk in
> > > > a 16-core VM and waiting a few hours until deadlkres kicks in.  In the
> > > > latest instance, deadlkres complained about a bash process:
> > > >
> > > >     #0  sched_switch (td=td@entry=0xfffffe02fb1d8000, flags=flags@entry=259) at /usr/src/sys/kern/sched_ule.c:2299
> > > >     #1  0xffffffff80b5a0a3 in mi_switch (flags=flags@entry=259) at /usr/src/sys/kern/kern_synch.c:550
> > > >     #2  0xffffffff80babcb4 in sleepq_switch (wchan=0xfffff818543a9e70, pri=64) at /usr/src/sys/kern/subr_sleepqueue.c:609
> > > >     #3  0xffffffff80babb8c in sleepq_wait (wchan=<unavailable>, pri=<unavailable>) at /usr/src/sys/kern/subr_sleepqueue.c:660
> > > >     #4  0xffffffff80b1c1b0 in sleeplk (lk=lk@entry=0xfffff818543a9e70, flags=flags@entry=2121728, ilk=ilk@entry=0x0, wmesg=wmesg@entry=0xffffffff8222a054 "zfs", pri=<optimized out>, pri@entry=64, timo=timo@entry=6, queue=1) at /usr/src/sys/kern/kern_lock.c:310
> > > >     #5  0xffffffff80b1a23f in lockmgr_slock_hard (lk=0xfffff818543a9e70, flags=2121728, ilk=<optimized out>, file=0xffffffff812544fb "/usr/src/sys/kern/vfs_subr.c", line=3057, lwa=0x0) at /usr/src/sys/kern/kern_lock.c:705
> > > >     #6  0xffffffff80c59ec3 in VOP_LOCK1 (vp=0xfffff818543a9e00, flags=2105344, file=0xffffffff812544fb "/usr/src/sys/kern/vfs_subr.c", line=3057) at ./vnode_if.h:1120
> > > >     #7  _vn_lock (vp=vp@entry=0xfffff818543a9e00, flags=2105344, file=<unavailable>, line=<unavailable>, line@entry=3057) at /usr/src/sys/kern/vfs_vnops.c:1815
> > > >     #8  0xffffffff80c4173d in vget_finish (vp=0xfffff818543a9e00, flags=<unavailable>, vs=vs@entry=VGET_USECOUNT) at /usr/src/sys/kern/vfs_subr.c:3057
> > > >     #9  0xffffffff80c1c9b7 in cache_lookup (dvp=dvp@entry=0xfffff802cd02ac40, vpp=vpp@entry=0xfffffe046b20ac30, cnp=cnp@entry=0xfffffe046b20ac58, tsp=tsp@entry=0x0, ticksp=ticksp@entry=0x0) at /usr/src/sys/kern/vfs_cache.c:2086
> > > >     #10 0xffffffff80c2150c in vfs_cache_lookup (ap=<optimized out>) at /usr/src/sys/kern/vfs_cache.c:3068
> > > >     #11 0xffffffff80c32c37 in VOP_LOOKUP (dvp=0xfffff802cd02ac40, vpp=0xfffffe046b20ac30, cnp=0xfffffe046b20ac58) at ./vnode_if.h:69
> > > >     #12 vfs_lookup (ndp=ndp@entry=0xfffffe046b20abd8) at /usr/src/sys/kern/vfs_lookup.c:1266
> > > >     #13 0xffffffff80c31ce1 in namei (ndp=ndp@entry=0xfffffe046b20abd8) at /usr/src/sys/kern/vfs_lookup.c:689
> > > >     #14 0xffffffff80c52090 in kern_statat (td=0xfffffe02fb1d8000, flag=<optimized out>, fd=-100, path=0xa75b480e070 <error: Cannot access memory at address 0xa75b480e070>, pathseg=pathseg@entry=UIO_USERSPACE, sbp=sbp@entry=0xfffffe046b20ad18)
> > > >         at /usr/src/sys/kern/vfs_syscalls.c:2441
> > > >     #15 0xffffffff80c52797 in sys_fstatat (td=<unavailable>, uap=0xfffffe02fb1d8400) at /usr/src/sys/kern/vfs_syscalls.c:2419
> > > >     #16 0xffffffff81049398 in syscallenter (td=<optimized out>) at /usr/src/sys/amd64/amd64/../../kern/subr_syscall.c:190
> > > >     #17 amd64_syscall (td=0xfffffe02fb1d8000, traced=0) at /usr/src/sys/amd64/amd64/trap.c:1199
> > > >     #18 <signal handler called>
> > > >
> > > > The lock it is trying to acquire in frame 5 belongs to another bash
> > > > process which is in the process of creating a fifo:
> > > >
> > > >     #0  sched_switch (td=td@entry=0xfffffe046acd8e40, flags=flags@entry=259) at /usr/src/sys/kern/sched_ule.c:2299
> > > >     #1  0xffffffff80b5a0a3 in mi_switch (flags=flags@entry=259) at /usr/src/sys/kern/kern_synch.c:550
> > > >     #2  0xffffffff80babcb4 in sleepq_switch (wchan=0xfffff8018acbf154, pri=87) at /usr/src/sys/kern/subr_sleepqueue.c:609
> > > >     #3  0xffffffff80babb8c in sleepq_wait (wchan=<unavailable>, pri=<unavailable>) at /usr/src/sys/kern/subr_sleepqueue.c:660
> > > >     #4  0xffffffff80b59606 in _sleep (ident=ident@entry=0xfffff8018acbf154, lock=lock@entry=0xfffff8018acbf120, priority=priority@entry=87, wmesg=0xffffffff8223af0e "zfs teardown inactive", sbt=sbt@entry=0, pr=pr@entry=0, flags=256)
> > > >         at /usr/src/sys/kern/kern_synch.c:225
> > > >     #5  0xffffffff80b45dc0 in rms_rlock_fallback (rms=0xfffff8018acbf120) at /usr/src/sys/kern/kern_rmlock.c:1015
> > > >     #6  0xffffffff80b45c93 in rms_rlock (rms=<unavailable>, rms@entry=0xfffff8018acbf120) at /usr/src/sys/kern/kern_rmlock.c:1036
> > > >     #7  0xffffffff81fb147b in zfs_freebsd_reclaim (ap=<optimized out>) at /usr/src/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c:5164
> > > >     #8  0xffffffff8111d245 in VOP_RECLAIM_APV (vop=0xffffffff822e71a0 <zfs_vnodeops>, a=a@entry=0xfffffe0410f1c9c8) at vnode_if.c:2180
> > > >     #9  0xffffffff80c43569 in VOP_RECLAIM (vp=0xfffff802cdbaca80) at ./vnode_if.h:1084
> > > >     #10 vgonel (vp=vp@entry=0xfffff802cdbaca80) at /usr/src/sys/kern/vfs_subr.c:4143
> > > >     #11 0xffffffff80c3ef61 in vtryrecycle (vp=0xfffff802cdbaca80) at /usr/src/sys/kern/vfs_subr.c:1693
> > > >     #12 vnlru_free_impl (count=count@entry=1, mnt_op=mnt_op@entry=0x0, mvp=0xfffff8010864da00) at /usr/src/sys/kern/vfs_subr.c:1344
> > > >     #13 0xfffff