Bad link elm in vm_object_terminate [Was: crash on process exit.. current at about r332467]
Andriy Gapon
avg at FreeBSD.org
Tue May 29 13:50:19 UTC 2018
On 23/04/2018 17:50, Julian Elischer wrote:
> back trace at: http://www.freebsd.org/~julian/bob-crash.png
>
> If anyone wants to take a look..
>
> In the exit syscall, while deallocating a vm object.
>
> I haven't see references to a similar crash in the last 10 days or so.. But if
> it rings any bells...
We have just got another one:
panic: Bad link elm 0xfffff80cc3938360 prev->next != elm
Matching the disassembled code to the C code, it seems that the crash is somewhere in
vm_object_terminate_pages (inlined into vm_object_terminate), probably in one of
the TAILQ_REMOVE calls there:
if (p->queue != PQ_NONE) {
KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
"page %p is not queued", p));
pq1 = vm_page_pagequeue(p);
if (pq != pq1) {
if (pq != NULL) {
vm_pagequeue_cnt_add(pq, dequeued);
vm_pagequeue_unlock(pq);
}
pq = pq1;
vm_pagequeue_lock(pq);
dequeued = 0;
}
p->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, p, plinks.q);
dequeued--;
}
if (vm_page_free_prep(p, true))
continue;
unlist:
TAILQ_REMOVE(&object->memq, p, listq);
}
Please note that this is the code before r332974 ("Improve VM page queue scalability").
I am not sure if r332974 + r333256 would fix the problem or if the crash would just get
moved to a different place.
Does this ring a bell with anyone who has tinkered with that part of the VM code recently?
Looking a little bit further, I think that object->memq somehow got corrupted.
memq contains just two elements and the reported element is not there.
(kgdb) p *(struct vm_page *)0xfffff80cc3938360
$22 = {
plinks = {
q = {
tqe_next = 0xfffff80cd7175398,
tqe_prev = 0xfffff80cb9f69170
},
s = {
ss = {
sle_next = 0xfffff80cd7175398
},
pv = 0xfffff80cb9f69170
},
memguard = {
p = 18446735332764767128,
v = 18446735332276081008
}
},
listq = {
tqe_next = 0xfffff80cc3938568, <=============
tqe_prev = 0xfffff8078c11b848 <=============
},
object = 0x0,
pindex = 1548,
phys_addr = 14695911424,
md = {
pv_list = {
tqh_first = 0x0,
tqh_last = 0xfffff80cc3938398
},
pv_gen = 1205766,
pat_mode = 6
},
wire_count = 0,
busy_lock = 1,
hold_count = 0,
flags = 0,
aflags = 0 '\000',
oflags = 0 '\000',
queue = 255 '\377',
psind = 0 '\000',
segind = 5 '\005',
order = 13 '\r',
pool = 0 '\000',
act_count = 5 '\005',
valid = 0 '\000',
dirty = 0 '\000'
}
(kgdb) p object->memq
$11 = {
tqh_first = 0xfffff80cb861cfb8,
tqh_last = 0xfffff80cc3938780
}
(kgdb) p *object->memq.tqh_first
$25 = {
plinks = {
q = {
tqe_next = 0xfffff80cb9f69108,
tqe_prev = 0xfffff80cd7175398
},
s = {
ss = {
sle_next = 0xfffff80cb9f69108
},
pv = 0xfffff80cd7175398
},
memguard = {
p = 18446735332276080904,
v = 18446735332764767128
}
},
listq = {
tqe_next = 0xfffff80cb56eafb0, <=============
tqe_prev = 0xfffff8078c11b848 <=============
},
object = 0xfffff8078c11b800,
pindex = 515,
phys_addr = 7299219456,
md = {
pv_list = {
tqh_first = 0xfffff80b99e4ff88,
tqh_last = 0xfffff80b99e4ff90
},
pv_gen = 466177,
pat_mode = 6
},
wire_count = 0,
busy_lock = 2,
hold_count = 0,
flags = 0,
aflags = 0 '\000',
oflags = 0 '\000',
queue = 255 '\377',
psind = 0 '\000',
segind = 5 '\005',
order = 13 '\r',
pool = 0 '\000',
act_count = 5 '\005',
valid = 255 '\377',
dirty = 0 '\000'
}
(kgdb) p *object->memq.tqh_first->listq.tqe_next
$26 = {
plinks = {
q = {
tqe_next = 0x0,
tqe_prev = 0xfffff80cc92e1d18
},
s = {
ss = {
sle_next = 0x0
},
pv = 0xfffff80cc92e1d18
},
memguard = {
p = 0,
v = 18446735332531379480
}
},
listq = {
tqe_next = 0x0, <=============
tqe_prev = 0xfffff80cb861cfc8 <=============
},
object = 0xfffff8078c11b800,
pindex = 1548,
phys_addr = 5350158336,
md = {
pv_list = {
tqh_first = 0xfffff80a07222808,
tqh_last = 0xfffff80a07222810
},
pv_gen = 7085,
pat_mode = 6
},
wire_count = 0,
busy_lock = 1,
hold_count = 0,
flags = 0,
aflags = 1 '\001',
oflags = 0 '\000',
queue = 1 '\001',
psind = 0 '\000',
segind = 5 '\005',
order = 13 '\r',
pool = 0 '\000',
act_count = 5 '\005',
valid = 255 '\377',
dirty = 255 '\377'
}
Pages 0xfffff80cc3938360 (the reported one) and 0xfffff80cb56eafb0 (the last one
on memq) have the same index 1548. Also, memq.tqh_last points to the reported
page, but it is not reachable via tqe_next pointers.
It is also potentially interesting that the reported page looks like it has
already been freed, while the replacement page is both valid and dirty.
The object, just in case:
(kgdb) p *object
$34 = {
lock = {
lock_object = {
lo_name = 0xffffffff81202c27 "vm object",
lo_flags = 627245056,
lo_data = 0,
lo_witness = 0xfffff80cffd6a700
},
rw_lock = 18446735286009226592
},
object_list = {
tqe_next = 0xfffff80b2481e200,
tqe_prev = 0xfffff80b2481e020
},
shadow_head = {
lh_first = 0x0
},
shadow_list = {
le_next = 0xfffff809c070f900,
le_prev = 0xfffff80869c06c30
},
memq = {
tqh_first = 0xfffff80cb861cfb8,
tqh_last = 0xfffff80cc3938780
},
rtree = {
rt_root = 18446735279843613792
},
size = 1561,
domain = {
dr_policy = 0x0,
dr_iterator = 0
},
generation = 1,
ref_count = 0,
shadow_count = 0,
memattr = 6 '\006',
type = 0 '\000',
flags = 12296,
pg_color = 1809,
paging_in_progress = 0,
resident_page_count = 5,
backing_object = 0x0,
backing_object_offset = 0,
pager_object_list = {
tqe_next = 0x0,
tqe_prev = 0x0
},
rvq = {
lh_first = 0xfffff80cad278b60
},
handle = 0x0,
un_pager = {
vnp = {
vnp_size = 19444,
writemappings = 0
},
devp = {
devp_pglist = {
tqh_first = 0x4bf4,
tqh_last = 0x0
},
ops = 0x0,
dev = 0x0
},
sgp = {
sgp_pglist = {
tqh_first = 0x4bf4,
tqh_last = 0x0
}
},
swp = {
swp_tmpfs = 0x4bf4,
swp_blks = {
pt_root = 0
}
}
},
cred = 0xfffff806811adc00,
charge = 6393856,
umtx_data = 0x0
}
Interesting that it is on a shadow list.
--
Andriy Gapon
More information about the freebsd-current
mailing list