ZFS deadlock in 14

From: Dag-Erling_Smørgrav <des_at_FreeBSD.org>
Date: Tue, 08 Aug 2023 17:07:48 UTC
At some point between 42d088299c (4 May) and f0c9703301 (26 June), a
deadlock was introduced in ZFS.  It is still present as of 9c2823bae9 (4
August) and is 100% reproducable just by starting poudriere bulk in a
16-core VM and waiting a few hours until deadlkres kicks in.  In the
latest instance, deadlkres complained about a bash process:

    #0  sched_switch (td=td@entry=0xfffffe02fb1d8000, flags=flags@entry=259) at /usr/src/sys/kern/sched_ule.c:2299
    #1  0xffffffff80b5a0a3 in mi_switch (flags=flags@entry=259) at /usr/src/sys/kern/kern_synch.c:550
    #2  0xffffffff80babcb4 in sleepq_switch (wchan=0xfffff818543a9e70, pri=64) at /usr/src/sys/kern/subr_sleepqueue.c:609
    #3  0xffffffff80babb8c in sleepq_wait (wchan=<unavailable>, pri=<unavailable>) at /usr/src/sys/kern/subr_sleepqueue.c:660
    #4  0xffffffff80b1c1b0 in sleeplk (lk=lk@entry=0xfffff818543a9e70, flags=flags@entry=2121728, ilk=ilk@entry=0x0, wmesg=wmesg@entry=0xffffffff8222a054 "zfs", pri=<optimized out>, pri@entry=64, timo=timo@entry=6, queue=1) at /usr/src/sys/kern/kern_lock.c:310
    #5  0xffffffff80b1a23f in lockmgr_slock_hard (lk=0xfffff818543a9e70, flags=2121728, ilk=<optimized out>, file=0xffffffff812544fb "/usr/src/sys/kern/vfs_subr.c", line=3057, lwa=0x0) at /usr/src/sys/kern/kern_lock.c:705
    #6  0xffffffff80c59ec3 in VOP_LOCK1 (vp=0xfffff818543a9e00, flags=2105344, file=0xffffffff812544fb "/usr/src/sys/kern/vfs_subr.c", line=3057) at ./vnode_if.h:1120
    #7  _vn_lock (vp=vp@entry=0xfffff818543a9e00, flags=2105344, file=<unavailable>, line=<unavailable>, line@entry=3057) at /usr/src/sys/kern/vfs_vnops.c:1815
    #8  0xffffffff80c4173d in vget_finish (vp=0xfffff818543a9e00, flags=<unavailable>, vs=vs@entry=VGET_USECOUNT) at /usr/src/sys/kern/vfs_subr.c:3057
    #9  0xffffffff80c1c9b7 in cache_lookup (dvp=dvp@entry=0xfffff802cd02ac40, vpp=vpp@entry=0xfffffe046b20ac30, cnp=cnp@entry=0xfffffe046b20ac58, tsp=tsp@entry=0x0, ticksp=ticksp@entry=0x0) at /usr/src/sys/kern/vfs_cache.c:2086
    #10 0xffffffff80c2150c in vfs_cache_lookup (ap=<optimized out>) at /usr/src/sys/kern/vfs_cache.c:3068
    #11 0xffffffff80c32c37 in VOP_LOOKUP (dvp=0xfffff802cd02ac40, vpp=0xfffffe046b20ac30, cnp=0xfffffe046b20ac58) at ./vnode_if.h:69
    #12 vfs_lookup (ndp=ndp@entry=0xfffffe046b20abd8) at /usr/src/sys/kern/vfs_lookup.c:1266
    #13 0xffffffff80c31ce1 in namei (ndp=ndp@entry=0xfffffe046b20abd8) at /usr/src/sys/kern/vfs_lookup.c:689
    #14 0xffffffff80c52090 in kern_statat (td=0xfffffe02fb1d8000, flag=<optimized out>, fd=-100, path=0xa75b480e070 <error: Cannot access memory at address 0xa75b480e070>, pathseg=pathseg@entry=UIO_USERSPACE, sbp=sbp@entry=0xfffffe046b20ad18)
        at /usr/src/sys/kern/vfs_syscalls.c:2441
    #15 0xffffffff80c52797 in sys_fstatat (td=<unavailable>, uap=0xfffffe02fb1d8400) at /usr/src/sys/kern/vfs_syscalls.c:2419
    #16 0xffffffff81049398 in syscallenter (td=<optimized out>) at /usr/src/sys/amd64/amd64/../../kern/subr_syscall.c:190
    #17 amd64_syscall (td=0xfffffe02fb1d8000, traced=0) at /usr/src/sys/amd64/amd64/trap.c:1199
    #18 <signal handler called>

The lock it is trying to acquire in frame 5 belongs to another bash
process which is in the process of creating a fifo:

    #0  sched_switch (td=td@entry=0xfffffe046acd8e40, flags=flags@entry=259) at /usr/src/sys/kern/sched_ule.c:2299
    #1  0xffffffff80b5a0a3 in mi_switch (flags=flags@entry=259) at /usr/src/sys/kern/kern_synch.c:550
    #2  0xffffffff80babcb4 in sleepq_switch (wchan=0xfffff8018acbf154, pri=87) at /usr/src/sys/kern/subr_sleepqueue.c:609
    #3  0xffffffff80babb8c in sleepq_wait (wchan=<unavailable>, pri=<unavailable>) at /usr/src/sys/kern/subr_sleepqueue.c:660
    #4  0xffffffff80b59606 in _sleep (ident=ident@entry=0xfffff8018acbf154, lock=lock@entry=0xfffff8018acbf120, priority=priority@entry=87, wmesg=0xffffffff8223af0e "zfs teardown inactive", sbt=sbt@entry=0, pr=pr@entry=0, flags=256)
        at /usr/src/sys/kern/kern_synch.c:225
    #5  0xffffffff80b45dc0 in rms_rlock_fallback (rms=0xfffff8018acbf120) at /usr/src/sys/kern/kern_rmlock.c:1015
    #6  0xffffffff80b45c93 in rms_rlock (rms=<unavailable>, rms@entry=0xfffff8018acbf120) at /usr/src/sys/kern/kern_rmlock.c:1036
    #7  0xffffffff81fb147b in zfs_freebsd_reclaim (ap=<optimized out>) at /usr/src/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c:5164
    #8  0xffffffff8111d245 in VOP_RECLAIM_APV (vop=0xffffffff822e71a0 <zfs_vnodeops>, a=a@entry=0xfffffe0410f1c9c8) at vnode_if.c:2180
    #9  0xffffffff80c43569 in VOP_RECLAIM (vp=0xfffff802cdbaca80) at ./vnode_if.h:1084
    #10 vgonel (vp=vp@entry=0xfffff802cdbaca80) at /usr/src/sys/kern/vfs_subr.c:4143
    #11 0xffffffff80c3ef61 in vtryrecycle (vp=0xfffff802cdbaca80) at /usr/src/sys/kern/vfs_subr.c:1693
    #12 vnlru_free_impl (count=count@entry=1, mnt_op=mnt_op@entry=0x0, mvp=0xfffff8010864da00) at /usr/src/sys/kern/vfs_subr.c:1344
    #13 0xffffffff80c49553 in vnlru_free_locked (count=1) at /usr/src/sys/kern/vfs_subr.c:1357
    #14 vn_alloc_hard (mp=mp@entry=0x0) at /usr/src/sys/kern/vfs_subr.c:1744
    #15 0xffffffff80c3f6f0 in vn_alloc (mp=0x0) at /usr/src/sys/amd64/include/atomic.h:375
    #16 getnewvnode_reserve () at /usr/src/sys/kern/vfs_subr.c:1888
    #17 0xffffffff81faa072 in zfs_create (dzp=0xfffff812200261d0, name=0xfffff8011b8ac805 "sh-np.yPbxoo", vap=0xfffffe0410f1cc20, excl=<optimized out>, mode=<optimized out>, zpp=zpp@entry=0xfffffe0410f1cbc8, cr=0xfffff80140fb1100, flag=<optimized out>, vsecp=0x0, mnt_ns=0x0)
        at /usr/src/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c:1146
    #18 0xffffffff81faea57 in zfs_freebsd_create (ap=0xfffffe0410f1cda0) at /usr/src/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c:4618
    #19 0xffffffff8111aa9a in VOP_MKNOD_APV (vop=0xffffffff822e71a0 <zfs_vnodeops>, a=a@entry=0xfffffe0410f1cda0) at vnode_if.c:372
    #20 0xffffffff80c50207 in VOP_MKNOD (dvp=<unavailable>, cnp=0xfffffe0410f1cd50, vap=0xfffffe0410f1cc20, vpp=<optimized out>) at ./vnode_if.h:188
    #21 kern_mkfifoat (td=0xfffffe046acd8e40, fd=-100, path=0x12772f073500 <error: Cannot access memory at address 0x12772f073500>, pathseg=UIO_USERSPACE, mode=<optimized out>) at /usr/src/sys/kern/vfs_syscalls.c:1492
    #22 0xffffffff81049398 in syscallenter (td=<optimized out>) at /usr/src/sys/amd64/amd64/../../kern/subr_syscall.c:190
    #23 amd64_syscall (td=0xfffffe046acd8e40, traced=0) at /usr/src/sys/amd64/amd64/trap.c:1199
    #24 <signal handler called>

Frame 7 is trying to acquire the ZFS teardown inactive lock, which is
held by a process which is performing a ZFS rollback and is waiting for
the transaction to sync:

    #0  sched_switch (td=td@entry=0xfffffe0422ef8560, flags=flags@entry=259) at /usr/src/sys/kern/sched_ule.c:2299
    #1  0xffffffff80b5a0a3 in mi_switch (flags=flags@entry=259) at /usr/src/sys/kern/kern_synch.c:550
    #2  0xffffffff80babcb4 in sleepq_switch (wchan=0xfffff8011b83d540, pri=0) at /usr/src/sys/kern/subr_sleepqueue.c:609
    #3  0xffffffff80babb8c in sleepq_wait (wchan=<unavailable>, wchan@entry=0xfffff8011b83d540, pri=<unavailable>, pri@entry=0) at /usr/src/sys/kern/subr_sleepqueue.c:660
    #4  0xffffffff80ad7f75 in _cv_wait (cvp=cvp@entry=0xfffff8011b83d540, lock=lock@entry=0xfffff8011b83d4d0) at /usr/src/sys/kern/kern_condvar.c:146
    #5  0xffffffff820b42fb in txg_wait_synced_impl (dp=dp@entry=0xfffff8011b83d000, txg=8585097, wait_sig=wait_sig@entry=0) at /usr/src/sys/contrib/openzfs/module/zfs/txg.c:726
    #6  0xffffffff820b3cab in txg_wait_synced (dp=<unavailable>, dp@entry=0xfffff8011b83d000, txg=<unavailable>) at /usr/src/sys/contrib/openzfs/module/zfs/txg.c:736
    #7  0xffffffff8206d5b5 in dsl_sync_task_common (pool=pool@entry=0xfffffe0401d15000 "zroot/poudriere/jails/13amd64-default-ref/15", checkfunc=<optimized out>, syncfunc=0xffffffff8203fbc0 <dsl_dataset_rollback_sync>, sigfunc=sigfunc@entry=0x0, arg=arg@entry=0xfffffe02fb827a90, 
        blocks_modified=blocks_modified@entry=1, space_check=ZFS_SPACE_CHECK_RESERVED, early=0) at /usr/src/sys/contrib/openzfs/module/zfs/dsl_synctask.c:93
    #8  0xffffffff8206d3c7 in dsl_sync_task (pool=<unavailable>, pool@entry=0xfffffe0401d15000 "zroot/poudriere/jails/13amd64-default-ref/15", checkfunc=<unavailable>, syncfunc=<unavailable>, arg=<unavailable>, arg@entry=0xfffffe02fb827a90, blocks_modified=<unavailable>, 
        blocks_modified@entry=1, space_check=<unavailable>, space_check@entry=ZFS_SPACE_CHECK_RESERVED) at /usr/src/sys/contrib/openzfs/module/zfs/dsl_synctask.c:132
    #9  0xffffffff8204075b in dsl_dataset_rollback (fsname=<unavailable>, fsname@entry=0xfffffe0401d15000 "zroot/poudriere/jails/13amd64-default-ref/15", tosnap=<optimized out>, owner=<optimized out>, result=result@entry=0xfffff81c826a9ea0)
        at /usr/src/sys/contrib/openzfs/module/zfs/dsl_dataset.c:3261
    #10 0xffffffff82168dd9 in zfs_ioc_rollback (fsname=0xfffffe0401d15000 "zroot/poudriere/jails/13amd64-default-ref/15", fsname@entry=<error reading variable: value is not available>, innvl=<unavailable>, innvl@entry=<error reading variable: value is not available>, 
        outnvl=0xfffff81c826a9ea0, outnvl@entry=<error reading variable: value is not available>) at /usr/src/sys/contrib/openzfs/module/zfs/zfs_ioctl.c:4405
    #11 0xffffffff82164522 in zfsdev_ioctl_common (vecnum=vecnum@entry=25, zc=zc@entry=0xfffffe0401d15000, flag=flag@entry=0) at /usr/src/sys/contrib/openzfs/module/zfs/zfs_ioctl.c:7798
    #12 0xffffffff81f97fca in zfsdev_ioctl (dev=<optimized out>, zcmd=<unavailable>, zcmd@entry=<error reading variable: value is not available>, arg=0xfffffe02fb827d50 "\017", arg@entry=<error reading variable: value is not available>, flag=<optimized out>, td=<optimized out>)
        at /usr/src/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c:168
    #13 0xffffffff809d6212 in devfs_ioctl (ap=0xfffffe02fb827c50) at /usr/src/sys/fs/devfs/devfs_vnops.c:935
    #14 0xffffffff80c585f2 in vn_ioctl (fp=0xfffff8052cdd80f0, com=<optimized out>, data=0xfffffe02fb827d50, active_cred=0xfffff80122ab1e00, td=<unavailable>) at /usr/src/sys/kern/vfs_vnops.c:1704
    #15 0xffffffff809d68ee in devfs_ioctl_f (fp=<unavailable>, fp@entry=<error reading variable: value is not available>, com=<unavailable>, com@entry=<error reading variable: value is not available>, data=<unavailable>, data@entry=<error reading variable: value is not available>, 
        cred=<unavailable>, cred@entry=<error reading variable: value is not available>, td=<unavailable>, td@entry=<error reading variable: value is not available>) at /usr/src/sys/fs/devfs/devfs_vnops.c:866
    #16 0xffffffff80bc57e6 in fo_ioctl (fp=0xfffff8052cdd80f0, com=3222821401, data=<unavailable>, active_cred=<unavailable>, td=0xfffffe0422ef8560) at /usr/src/sys/sys/file.h:367
    #17 kern_ioctl (td=td@entry=0xfffffe0422ef8560, fd=4, com=com@entry=3222821401, data=<unavailable>, data@entry=0xfffffe02fb827d50 "\017") at /usr/src/sys/kern/sys_generic.c:807
    #18 0xffffffff80bc54f2 in sys_ioctl (td=0xfffffe0422ef8560, uap=0xfffffe0422ef8960) at /usr/src/sys/kern/sys_generic.c:715
    #19 0xffffffff81049398 in syscallenter (td=<optimized out>) at /usr/src/sys/amd64/amd64/../../kern/subr_syscall.c:190
    #20 amd64_syscall (td=0xfffffe0422ef8560, traced=0) at /usr/src/sys/amd64/amd64/trap.c:1199
    #21 <signal handler called>

OpenZFS commits in the relevant range:

    commit e639e0d27cc863ba1b8de20e861e6b5d9b922a8e
    Merge: 92c23f6d9c20 e61076683850
    Author:     Martin Matuska <mm@FreeBSD.org>
    AuthorDate: Fri May 12 13:12:59 2023 +0200
    Commit:     Martin Matuska <mm@FreeBSD.org>
    CommitDate: Fri May 12 13:13:33 2023 +0200
    
        zfs: merge openzfs/zfs@e61076683
        
        Notable upstream pull request merges:
          #14744 Optimize check_filesystem() and process_error_log()
          #14773 Allow zhack label repair to restore detached devices
          #14794 zpool import -m also removing spare and cache when log device
                 is missing
          #14805 Simplify and optimize random_int_between()
          #14813 Enable the head_errlog feature to remove errors
          #14816 Fix two abd_gang_add_gang() issues
          #14817 Verify block pointers before writing them out
          #14819 Add dmu_tx_hold_append() interface
          #14823 Remove single parent assertion from zio_nowait()
          #14824 Plug memory leak in zfsdev_state
          #14825 Block cloning dbuf fixes
          #14828 Remove duplicate code in l2arc_evict()
          #14837 Fixes in head_errlog feature with encryption
          #14839 Prevent panic during concurrent snapshot rollback and zvol read
          #14853 zil: Don't expect zio_shrink() to succeed
        
        Obtained from:  OpenZFS
        OpenZFS commit: e6107668385044718b0a73330ed6423650806473
    
    commit 4d846d260e2b9a3d4d0a701462568268cbfe7a5b
    Author:     Warner Losh <imp@FreeBSD.org>
    AuthorDate: Wed May 10 09:40:58 2023 -0600
    Commit:     Warner Losh <imp@FreeBSD.org>
    CommitDate: Fri May 12 10:44:03 2023 -0600
    
        spdx: The BSD-2-Clause-FreeBSD identifier is obsolete, drop -FreeBSD
        
        The SPDX folks have obsoleted the BSD-2-Clause-FreeBSD identifier. Catch
        up to that fact and revert to their recommended match of BSD-2-Clause.
        
        Discussed with:         pfg
        MFC After:              3 days
        Sponsored by:           Netflix
    
    commit c0a83fe074a375c66ca669bfe1f128fe12b9f377
    Merge: 634a770a5e16 ad0a554614b0
    Author:     Martin Matuska <mm@FreeBSD.org>
    AuthorDate: Tue May 23 11:50:17 2023 +0200
    Commit:     Martin Matuska <mm@FreeBSD.org>
    CommitDate: Tue May 23 11:51:52 2023 +0200
    
        zfs: merge openzfs/zfs@ad0a55461
        
        Notable upstream pull request merges:
          #12355 Teach zpool scrub to scrub only blocks in error log
          #14811 Refine special_small_blocks property validation
          #14854 zil: Some micro-optimizations
          #14855 zil: Free lwb_buf after write completion
          #14860 Fixes for issues identified by recent Coverity defect reports
          #14861 Probe vdevs before marking removed
          #14873 Add the ability to uninitialize a zpool
          #14875 Hold db_mtx when updating db_state
        
        Obtained from:  OpenZFS
        OpenZFS commit: ad0a554614b096698d9969340c4c593690042d5b
    
    commit 48f52d9179d5920750cef0c5d921db63de4d767d
    Author:     John Baldwin <jhb@FreeBSD.org>
    AuthorDate: Thu May 25 07:11:38 2023 -0700
    Commit:     John Baldwin <jhb@FreeBSD.org>
    CommitDate: Thu May 25 07:11:38 2023 -0700
    
        zfs: Fix build on 32-bit platforms after most recent import.
        
        unsigned long is not a uint64_t on 32-bit platforms.  The zfs.4
        manpage documents this variable as a uint, and it is only compared
        with other variables of type int, so uint_t makes more sense than
        unsigned long.
        
        (I also wasn't sure if ULONG would work as a ZFS_MODULE_PARAM type
        on other OS's)
    
    commit 4e8d558c9d1cf3e7e424e3fb123b01979c3d57f2
    Merge: 5ca7f0294694 feff9dfed3df
    Author:     Martin Matuska <mm@FreeBSD.org>
    AuthorDate: Sat Jun 10 19:31:17 2023 +0200
    Commit:     Martin Matuska <mm@FreeBSD.org>
    CommitDate: Sat Jun 10 19:31:17 2023 +0200
    
        zfs: merge openzfs/zfs@feff9dfed
        
        Notable upstream pull request merges:
          #14833 Update compatibility.d files
          #14841 ZIL: Reduce scope of per-dataset zl_issuer_lock
          #14863 zil: Add some more statistics
          #14866 btree: Implement faster binary search algorithm
          #14894 Fix inconsistent definition of zfs_scrub_error_blocks_per_txg
          #14892 Fix concurrent resilvers initiated at same time
          #14903 Fix NULL pointer dereference when doing concurrent 'send' operations
          #14910 ZIL: Allow to replay blocks of any size
          #14939 Fix the L2ARC write size calculating logic
          #14934 Introduce zfs_refcount_(add|remove)_few()
          #14946 Improve l2arc reporting in arc_summary
          #14953 Finally drop long disabled vdev cache
          #14954 Fix the L2ARC write size calculating logic (2)
          #14955 Use list_remove_head() where possible
          #14959 ZIL: Fix race introduced by f63811f0721
        
        Obtained from:  OpenZFS
        OpenZFS commit: feff9dfed3df1bbae5dd74959a6ad87d11f27ffb
    
    commit b7198dcfc03967cba191a373d99df47ee52d6e2a
    Merge: 2c7279bae776 10e36e17612b
    Author:     Martin Matuska <mm@FreeBSD.org>
    AuthorDate: Fri Jun 16 23:12:27 2023 +0200
    Commit:     Martin Matuska <mm@FreeBSD.org>
    CommitDate: Fri Jun 16 23:13:05 2023 +0200
    
        zfs: merge openzfs/zfs@10e36e176
        
        Notable upstream pull request merges:
          #14948 Remove ARC/ZIO physdone callbacks
          #14963 Store the L2ARC device ashift in the vdev label
          #14970 Switch refcount tracking from lists to AVL-trees
          #14981 Shorten arcstat_quiescence sleep time
        
        Obtained from:  OpenZFS
        OpenZFS commit: 10e36e17612ba9c634b140ae76847bb62b5be68f
    
    commit f190c36b5d45cbfadc922a69d79628c43cdda22f
    Merge: 229d643c4dd5 8e8acabdcaeb
    Author:     Martin Matuska <mm@FreeBSD.org>
    AuthorDate: Sun Jun 25 10:31:19 2023 +0200
    Commit:     Martin Matuska <mm@FreeBSD.org>
    CommitDate: Sun Jun 25 10:32:42 2023 +0200
    
        zfs: merge openzfs/zfs@8e8acabdc
        
        Notable upstream pull request merges:
          #14987 Fix memory leak in zil_parse()
        
        Obtained from:  OpenZFS
        OpenZFS commit: 8e8acabdcaeb831c777f71361722f4235b698a8d

We can't ship 14.0 with this deadlock.

DES
-- 
Dag-Erling Smørgrav - des@FreeBSD.org