6.0 hangs (while building OOo)

Thu Oct 6 12:01:56 PDT 2005

On  6 Oct, Kris Kennaway wrote:
> On Tue, Oct 04, 2005 at 07:20:13PM -0700, Don Lewis wrote:

> There is code in -current that saves stack traces when lockmgr locks
> are acquired, when DEBUG_LOCKS is enabled - except it sometimes panics
> while trying to save the trace because of a code bug.  I remind jeffr
> about this on a more-or-less daily basis, but he hasn't had time to
> commit the fix he has yet.  It still may be useful if this is easily
> reproducible.

>> This problem appears to be some sort of vnode lock leak.
> 
> leaked lockmgr locks usually panic when the thread exits.

More info, gathered from a repeat of the OOo build:

There are two processes involved:

  UID   PID  PPID CPU PRI NI   VSZ   RSS MWCHAN STAT  TT       TIME COMMAND
    0  7055 47910 251   8  0  1728  1124 wait   I     p1    0:00.39 dmake produ
    0  7093  7055 251  -4  0  1548   716 ufs    D     p1    0:00.00 /bin/tcsh -

Process 7093 is stuck here:

#0  sched_switch (td=0xc247e900, newtd=0xc1dba780, flags=1)
    at /usr/src/sys/kern/sched_4bsd.c:980
#1  0xc247e900 in ?? ()
#2  0xc0525d68 in mi_switch (flags=1, newtd=0x0)
    at /usr/src/sys/kern/kern_synch.c:356
#3  0xc0544c77 in sleepq_switch (wchan=0x0)
    at /usr/src/sys/kern/subr_sleepqueue.c:427
#4  0xc0544f0f in sleepq_wait (wchan=0x0)
    at /usr/src/sys/kern/subr_sleepqueue.c:539
#5  0xc05259c7 in msleep (ident=0xc2874af8, mtx=0xc0781f38, priority=80,
    wmesg=0x0, timo=0) at /usr/src/sys/kern/kern_synch.c:227
#6  0xc050d74e in acquire (lkpp=0xd5ab2ae0, extflags=64, wanted=393216)
    at /usr/src/sys/kern/kern_lock.c:110
#7  0xc050dd24 in lockmgr (lkp=0xc2874af8, flags=12290, interlkp=0x40,
    td=0xc247e900) at /usr/src/sys/kern/kern_lock.c:348
#8  0xc058624d in vop_stdlock (ap=0x0) at /usr/src/sys/kern/vfs_default.c:258
#9  0xc06ff4c4 in VOP_LOCK_APV (vop=0xc0760dc0, a=0xd5ab2b50)
    at vnode_if.c:1642
#10 0xc065f897 in ffs_lock (ap=0x0) at /usr/src/sys/ufs/ffs/ffs_vnops.c:341
#11 0xc06ff4c4 in VOP_LOCK_APV (vop=0xc076a820, a=0xd5ab2b50)
    at vnode_if.c:1642
#12 0xc05a1a8e in vn_lock (vp=0xc2874aa0, flags=4098, td=0xc247e900)
    at vnode_if.h:844
#13 0xc0588881 in lookup (ndp=0xd5ab2c64) at /usr/src/sys/kern/vfs_lookup.c:382
#14 0xc0588499 in namei (ndp=0xd5ab2c64) at /usr/src/sys/kern/vfs_lookup.c:203
#15 0xc059980c in kern_access (td=0xc247e900, path=0x0, pathseg=UIO_USERSPACE,
    flags=0) at /usr/src/sys/kern/vfs_syscalls.c:1879
#16 0xc0599788 in access (td=0x0, uap=0x0)
    at /usr/src/sys/kern/vfs_syscalls.c:1856
#17 0xc06ec012 in syscall (frame=
      {tf_fs = 59, tf_es = 59, tf_ds = 59, tf_edi = 6, tf_esi = 671846400, tf_eb
p = -1077950592, tf_isp = -710202012, tf_ebx = 671798328, tf_edx = 134517062, tf
_ecx = 671846415, tf_eax = 33, tf_trapno = 12, tf_err = 2, tf_eip = 671705663, t
f_cs = 51, tf_eflags = 582, tf_esp = -1077950692, tf_ss = 59})
    at /usr/src/sys/i386/i386/trap.c:986

The vnode in question:

(kdb) print *(struct vnode *)0xc2874aa0
$1 = {v_type = VDIR, v_tag = 0xc0728095 "ufs", v_op = 0xc076a820,
  v_data = 0xc2566420, v_mount = 0xc1f7c400, v_nmntvnodes = {
    tqe_next = 0xc28dc880, tqe_prev = 0xc37f7bc4}, v_un = {vu_mount = 0x0,
    vu_socket = 0x0, vu_cdev = 0x0, vu_fifoinfo = 0x0}, v_hashlist = {
    le_next = 0x0, le_prev = 0xc3253020}, v_hash = 854582, v_cache_src = {
    lh_first = 0x0}, v_cache_dst = {tqh_first = 0x0, tqh_last = 0xc2874ad0},
  v_dd = 0x0, v_cstart = 0, v_lasta = 0, v_lastw = 0, v_clen = 0, v_lock = {
    lk_interlock = 0xc0781f38, lk_flags = 33816640, lk_sharecount = 0,
    lk_waitcount = 1, lk_exclusivecount = 1, lk_prio = 80,
    lk_wmesg = 0xc0728095 "ufs", lk_timo = 6, lk_lockholder = 0xc1fbd900,
    lk_newlock = 0x0}, v_interlock = {mtx_object = {lo_class = 0xc0759484,
      lo_name = 0xc07295db "vnode interlock",
      lo_type = 0xc07295db "vnode interlock", lo_flags = 196608, lo_list = {
        tqe_next = 0x0, tqe_prev = 0x0}, lo_witness = 0x0}, mtx_lock = 4,
    mtx_recurse = 0}, v_vnlock = 0xc2874af8, v_holdcnt = 4, v_usecount = 4,
  v_iflag = 0, v_vflag = 0, v_writecount = 0, v_freelist = {tqe_next = 0x0,
    tqe_prev = 0xc2855c68}, v_bufobj = {bo_mtx = 0xc2874b1c, bo_clean = {
      bv_hd = {tqh_first = 0x0, tqh_last = 0xc2874b64}, bv_root = 0x0,
      bv_cnt = 0}, bo_dirty = {bv_hd = {tqh_first = 0x0,
        tqh_last = 0xc2874b74}, bv_root = 0x0, bv_cnt = 0}, bo_numoutput = 0,
    bo_flag = 0, bo_ops = 0xc075fd64, bo_bsize = 16384,
    bo_object = 0xc29928c4, bo_synclist = {le_next = 0x0, le_prev = 0x0},
    bo_private = 0xc2874aa0, __bo_vnode = 0xc2874aa0}, v_pollinfo = 0x0,
  v_label = 0x0}

Process 7055, which only has one thread, is holding the lock:

(kgdb) print ((struct vnode *)0xc2874aa0)->v_lock->lk_lockholder->td_proc->p_pid
$4 = 7055
(kgdb) print ((struct vnode *)0xc2874aa0)->v_lock->lk_lockholder->td_proc->p_numthreads
$1 = 1

This is what process 7055 is doing:

#0  sched_switch (td=0xc1fbd900, newtd=0xc1dba900, flags=1)
    at /usr/src/sys/kern/sched_4bsd.c:980
#1  0xc1fbd900 in ?? ()
#2  0xc0525d68 in mi_switch (flags=1, newtd=0x0)
    at /usr/src/sys/kern/kern_synch.c:356
#3  0xc0544c77 in sleepq_switch (wchan=0x0)
    at /usr/src/sys/kern/subr_sleepqueue.c:427
#4  0xc0544f50 in sleepq_wait_sig (wchan=0x0)
    at /usr/src/sys/kern/subr_sleepqueue.c:552
#5  0xc05259b7 in msleep (ident=0xc1fbc830, mtx=0xc1fbc898, priority=348,
    wmesg=0x0, timo=0) at /usr/src/sys/kern/kern_synch.c:225
#6  0xc04fe89c in kern_wait (td=0xc1fbd900, pid=-1, status=0xd4912c80,
    options=0, rusage=0x0) at /usr/src/sys/kern/kern_exit.c:772
#7  0xc04fdc9d in wait4 (td=0x0, uap=0xd4912d04)
    at /usr/src/sys/kern/kern_exit.c:569
#8  0xc06ec012 in syscall (frame=
      {tf_fs = 134873147, tf_es = 134545467, tf_ds = -1078001605, tf_edi = 13488
5888, tf_esi = 7093, tf_ebp = -1077951208, tf_isp = -728683164, tf_ebx = 6725478
76, tf_edx = 0, tf_ecx = 134913208, tf_eax = 7, tf_trapno = 12, tf_err = 2, tf_e
ip = 671979599, tf_cs = 51, tf_eflags = 534, tf_esp = -1077951236, tf_ss = 59})

Please 'splain to me why process 7055 is holding a locked vnode while it
is executing the wait4() syscall.

It looks like the DEBUG_LOCKS option might be useful in this case,
though this is RELENG_6, which doesn't have the (buggy?) stack trace
save feature.