Issue with epoch_drain_callbacks and unloading iavf(4) [using iflib]
Eric Joyner
erj at freebsd.org
Wed Mar 11 23:33:01 UTC 2020
On Wed, Feb 12, 2020 at 2:22 PM Mark Johnston <markj at freebsd.org> wrote:
> On Mon, Feb 10, 2020 at 01:30:47PM -0800, Eric Joyner wrote:
> > On Fri, Jan 31, 2020 at 11:44 AM Eric Joyner <erj at freebsd.org> wrote:
> > > Hi Mark,
> > >
> > > I applied your patch to a FreeBSD-current VM and reran the test I was
> > > running originally, and I still see the same behavior.
> > >
> > > Your patch doesn't appear to fix the problem.
> > >
> > > - Eric
> > >
> >
> > *bump*
>
> Sorry, I had missed this. I think this implies that even the epoch
> callback tasks are not getting a chance to run, but I guess that makes
> sense: all grouptask threads run at PI_SOFT, a realtime priority, so the
> iflib and epoch callback task threads are all running at the same
> priority. Unless iflib voluntarily yields the CPU it'll keep getting
> scheduled.
>
> Here's an updated patch which also allows the priority of group task
> threads to be specified. I don't think it makes much sense to have more
> than one per-CPU group task in the system without that parameter. With
> this, epoch callbacks should run at a slightly higher priority than
> iflib.
>
> The combined patch also fixes an annoying problem where interface
> destruction can appear to hang for a short period (~0.5s). This is
> because epoch callbacks are scheduled off hardclock, but hardclock
> interrupts run at a very low frequency on an idle CPU.
>
> diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
> index 0a477e1d6f7b..4ed7e7e11a3e 100644
> --- a/sys/kern/subr_epoch.c
> +++ b/sys/kern/subr_epoch.c
> @@ -74,15 +74,19 @@ typedef struct epoch_record {
> volatile struct epoch_tdlist er_tdlist;
> volatile uint32_t er_gen;
> uint32_t er_cpuid;
> + int er_drain_state;
> } __aligned(EPOCH_ALIGN) *epoch_record_t;
>
> +#define EPOCH_DRAIN_START 2
> +#define EPOCH_DRAIN_RUNNING 1
> +#define EPOCH_DRAIN_DONE 0
> +
> struct epoch {
> struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
> epoch_record_t e_pcpu_record;
> int e_idx;
> int e_flags;
> struct sx e_drain_sx;
> - struct mtx e_drain_mtx;
> volatile int e_drain_count;
> const char *e_name;
> };
> @@ -335,7 +339,6 @@ epoch_alloc(const char *name, int flags)
> epoch->e_idx = epoch_count;
> epoch->e_name = name;
> sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
> - mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
> allepochs[epoch_count++] = epoch;
> return (epoch);
> }
> @@ -348,7 +351,6 @@ epoch_free(epoch_t epoch)
> allepochs[epoch->e_idx] = NULL;
> epoch_wait(global_epoch);
> uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
> - mtx_destroy(&epoch->e_drain_mtx);
> sx_destroy(&epoch->e_drain_sx);
> free(epoch, M_EPOCH);
> }
> @@ -699,14 +701,24 @@ epoch_call_task(void *arg __unused)
> epoch_t epoch;
> ck_stack_t cb_stack;
> int i, npending, total;
> + bool draining;
> +
> + KASSERT(curthread->td_pinned > 0,
> + ("%s: callback task thread is not pinned", __func__));
>
> ck_stack_init(&cb_stack);
> critical_enter();
> epoch_enter(global_epoch);
> - for (total = i = 0; i < epoch_count; i++) {
> + for (total = i = 0, draining = false; i < epoch_count; i++) {
> if (__predict_false((epoch = allepochs[i]) == NULL))
> continue;
> er = epoch_currecord(epoch);
> + if (atomic_load_int(&er->er_drain_state) ==
> EPOCH_DRAIN_START) {
> + atomic_store_int(&er->er_drain_state,
> + EPOCH_DRAIN_RUNNING);
> + draining = true;
> + }
> +
> record = &er->er_record;
> if ((npending = record->n_pending) == 0)
> continue;
> @@ -728,6 +740,20 @@ epoch_call_task(void *arg __unused)
> next = CK_STACK_NEXT(cursor);
> entry->function(entry);
> }
> +
> + if (__predict_false(draining)) {
> + epoch_enter(global_epoch);
> + for (i = 0; i < epoch_count; i++) {
> + if (__predict_false((epoch = allepochs[i]) ==
> NULL))
> + continue;
> + er = epoch_currecord(epoch);
> + if (atomic_load_int(&er->er_drain_state) ==
> + EPOCH_DRAIN_RUNNING)
> + atomic_store_int(&er->er_drain_state,
> + EPOCH_DRAIN_DONE);
> + }
> + epoch_exit(global_epoch);
> + }
> }
>
> int
> @@ -769,27 +795,18 @@ in_epoch(epoch_t epoch)
> }
>
> static void
> -epoch_drain_cb(struct epoch_context *ctx)
> +epoch_drain_handler(struct ck_epoch *global __unused,
> + ck_epoch_record_t *cr __unused, void *arg __unused)
> {
> - struct epoch *epoch =
> - __containerof(ctx, struct epoch_record,
> er_drain_ctx)->er_parent;
> -
> - if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
> - mtx_lock(&epoch->e_drain_mtx);
> - wakeup(epoch);
> - mtx_unlock(&epoch->e_drain_mtx);
> - }
> + maybe_yield();
> }
>
> void
> epoch_drain_callbacks(epoch_t epoch)
> {
> epoch_record_t er;
> - struct thread *td;
> - int was_bound;
> - int old_pinned;
> - int old_cpu;
> - int cpu;
> + int cpu, state;
> + bool pending;
>
> WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
> "epoch_drain_callbacks() may sleep!");
> @@ -802,45 +819,28 @@ epoch_drain_callbacks(epoch_t epoch)
> return;
> #endif
> DROP_GIANT();
> -
> sx_xlock(&epoch->e_drain_sx);
> - mtx_lock(&epoch->e_drain_mtx);
>
> - td = curthread;
> - thread_lock(td);
> - old_cpu = PCPU_GET(cpuid);
> - old_pinned = td->td_pinned;
> - was_bound = sched_is_bound(td);
> - sched_unbind(td);
> - td->td_pinned = 0;
> + /* Make sure that all pending callbacks are available. */
> + ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_drain_handler,
> NULL);
>
> - CPU_FOREACH(cpu)
> - epoch->e_drain_count++;
> CPU_FOREACH(cpu) {
> er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
> - sched_bind(td, cpu);
> - epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
> + atomic_store_int(&er->er_drain_state, EPOCH_DRAIN_START);
> + GROUPTASK_ENQUEUE(DPCPU_ID_PTR(cpu, epoch_cb_task));
> }
>
> - /* restore CPU binding, if any */
> - if (was_bound != 0) {
> - sched_bind(td, old_cpu);
> - } else {
> - /* get thread back to initial CPU, if any */
> - if (old_pinned != 0)
> - sched_bind(td, old_cpu);
> - sched_unbind(td);
> - }
> - /* restore pinned after bind */
> - td->td_pinned = old_pinned;
> -
> - thread_unlock(td);
> -
> - while (epoch->e_drain_count != 0)
> - msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
> + do {
> + pending = false;
> + CPU_FOREACH(cpu) {
> + er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
> + state = atomic_load_int(&er->er_drain_state);
> + if (state != EPOCH_DRAIN_DONE)
> + pending = true;
> + }
> + pause("edrain", 1);
> + } while (pending);
>
> - mtx_unlock(&epoch->e_drain_mtx);
> sx_xunlock(&epoch->e_drain_sx);
> -
> PICKUP_GIANT();
> }
> diff --git a/sys/kern/subr_gtaskqueue.c b/sys/kern/subr_gtaskqueue.c
> index 9bf4b0ca3ad3..f2e7f74325fa 100644
> --- a/sys/kern/subr_gtaskqueue.c
> +++ b/sys/kern/subr_gtaskqueue.c
> @@ -54,8 +54,8 @@ static void gtaskqueue_thread_loop(void *arg);
> static int task_is_running(struct gtaskqueue *queue, struct gtask
> *gtask);
> static void gtaskqueue_drain_locked(struct gtaskqueue *queue, struct
> gtask *gtask);
>
> -TASKQGROUP_DEFINE(softirq, mp_ncpus, 1);
> -TASKQGROUP_DEFINE(config, 1, 1);
> +TASKQGROUP_DEFINE(softirq, mp_ncpus, 1, PI_SOFT);
> +TASKQGROUP_DEFINE(config, 1, 1, PI_SOFT);
>
> struct gtaskqueue_busy {
> struct gtask *tb_running;
> @@ -612,7 +612,7 @@ struct taskq_bind_task {
> };
>
> static void
> -taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
> +taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu, int
> pri)
> {
> struct taskqgroup_cpu *qcpu;
>
> @@ -620,7 +620,7 @@ taskqgroup_cpu_create(struct taskqgroup *qgroup, int
> idx, int cpu)
> LIST_INIT(&qcpu->tgc_tasks);
> qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
> taskqueue_thread_enqueue, &qcpu->tgc_taskq);
> - gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
> + gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, pri,
> "%s_%d", qgroup->tqg_name, idx);
> qcpu->tgc_cpu = cpu;
> }
> @@ -900,7 +900,7 @@ taskqgroup_config_init(void *arg)
> LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
> grouptask, gt_list);
> qgroup->tqg_queue[0].tgc_cnt = 0;
> - taskqgroup_cpu_create(qgroup, 0, 0);
> + taskqgroup_cpu_create(qgroup, 0, 0, PI_SOFT);
>
> qgroup->tqg_cnt = 1;
> qgroup->tqg_stride = 1;
> @@ -910,7 +910,7 @@ SYSINIT(taskqgroup_config_init, SI_SUB_TASKQ,
> SI_ORDER_SECOND,
> taskqgroup_config_init, NULL);
>
> static int
> -_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
> +_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int
> pri)
> {
> LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
> struct grouptask *gtask;
> @@ -948,7 +948,7 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt,
> int stride)
> */
> cpu = old_cpu;
> for (i = old_cnt; i < cnt; i++) {
> - taskqgroup_cpu_create(qgroup, i, cpu);
> + taskqgroup_cpu_create(qgroup, i, cpu, pri);
>
> for (k = 0; k < stride; k++)
> cpu = CPU_NEXT(cpu);
> @@ -1001,12 +1001,12 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int
> cnt, int stride)
> }
>
> int
> -taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
> +taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri)
> {
> int error;
>
> mtx_lock(&qgroup->tqg_lock);
> - error = _taskqgroup_adjust(qgroup, cnt, stride);
> + error = _taskqgroup_adjust(qgroup, cnt, stride, pri);
> mtx_unlock(&qgroup->tqg_lock);
>
> return (error);
> diff --git a/sys/net/iflib.c b/sys/net/iflib.c
> index a9fc8090c48a..1163425bf15a 100644
> --- a/sys/net/iflib.c
> +++ b/sys/net/iflib.c
> @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
> #include <sys/types.h>
> #include <sys/bus.h>
> #include <sys/eventhandler.h>
> +#include <sys/interrupt.h>
> #include <sys/kernel.h>
> #include <sys/lock.h>
> #include <sys/mutex.h>
> @@ -560,8 +561,8 @@ MODULE_VERSION(iflib, 1);
> MODULE_DEPEND(iflib, pci, 1, 1, 1);
> MODULE_DEPEND(iflib, ether, 1, 1, 1);
>
> -TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
> -TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
> +TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1, PI_SWI(SWI_NET));
> +TASKQGROUP_DEFINE(if_config_tqg, 1, 1, PI_SWI(SWI_NET));
>
> #ifndef IFLIB_DEBUG_COUNTERS
> #ifdef INVARIANTS
> diff --git a/sys/sys/gtaskqueue.h b/sys/sys/gtaskqueue.h
> index 96fd57dfb76d..a57bcfc1881a 100644
> --- a/sys/sys/gtaskqueue.h
> +++ b/sys/sys/gtaskqueue.h
> @@ -79,7 +79,8 @@ int taskqgroup_attach_cpu(struct taskqgroup *qgroup,
> void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask
> *gtask);
> struct taskqgroup *taskqgroup_create(const char *name);
> void taskqgroup_destroy(struct taskqgroup *qgroup);
> -int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride);
> +int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride,
> + int pri);
> void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask,
> gtask_fn_t *fn, const char *name);
> void taskqgroup_config_gtask_deinit(struct grouptask *gtask);
> @@ -100,7 +101,7 @@ void taskqgroup_config_gtask_deinit(struct
> grouptask *gtask);
> #define TASKQGROUP_DECLARE(name) \
> extern struct taskqgroup *qgroup_##name
>
> -#define TASKQGROUP_DEFINE(name, cnt, stride) \
> +#define TASKQGROUP_DEFINE(name, cnt, stride, pri) \
> \
> struct taskqgroup *qgroup_##name; \
> \
> @@ -116,7 +117,7 @@ SYSINIT(taskqgroup_##name, SI_SUB_TASKQ,
> SI_ORDER_FIRST, \
> static void \
> taskqgroup_adjust_##name(void *arg) \
> { \
> - taskqgroup_adjust(qgroup_##name, (cnt), (stride)); \
> + taskqgroup_adjust(qgroup_##name, (cnt), (stride), (pri)); \
> } \
> \
> SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY, \
>
Mark,
I did get some time to get back and retry this; however your second patch
still doesn't solve the problem. Looking into it a bit, it looks like the
kldunload process isn't hitting the code you've changed; it's hanging in
epoch_wait_preempt() in if_detach_internal(), which is immediately before
epoch_drain_callbacks().
I did a kernel dump while it was hanging, and this is the backtrace for the
kldunload process:
#0 sched_switch (td=0xfffffe00da78b700, flags=<optimized out>) at
/usr/src/sys/kern/sched_ule.c:2147
#1 0xffffffff80ba2419 in mi_switch (flags=256) at
/usr/src/sys/kern/kern_synch.c:533
#2 0xffffffff80bc749c in sched_bind (td=0xfffffe00da78b700, cpu=31) at
/usr/src/sys/kern/sched_ule.c:2729
#3 0xffffffff80bdaf5b in epoch_block_handler_preempt (global=<optimized
out>, cr=0xfffffe0036387b00, arg=<optimized out>) at
/usr/src/sys/kern/subr_epoch.c:516
#4 0xffffffff803b602d in epoch_block (global=0xfffff80003be2680,
cr=<optimized out>, cb=<unavailable>, ct=<unavailable>) at
/usr/src/sys/contrib/ck/src/ck_epoch.c:416
#5 ck_epoch_synchronize_wait (global=0xfffff80003be2680, cb=<optimized
out>, ct=<optimized out>) at /usr/src/sys/contrib/ck/src/ck_epoch.c:465
#6 0xffffffff80bdaabe in epoch_wait_preempt (epoch=0xfffff80003be2680) at
/usr/src/sys/kern/subr_epoch.c:628
#7 0xffffffff80ca0fea in if_detach_internal (ifp=0xfffff80074893800,
vmove=<optimized out>, ifcp=0x0) at /usr/src/sys/net/if.c:1123
#8 0xffffffff80ca0dad in if_detach (ifp=<unavailable>) at
/usr/src/sys/net/if.c:1063
#9 0xffffffff80cbcc46 in iflib_device_deregister (ctx=0xfffff800711f0000)
at /usr/src/sys/net/iflib.c:5127
#10 0xffffffff80bcd95e in DEVICE_DETACH (dev=0xfffff80004388700) at
./device_if.h:234
#11 device_detach (dev=0xfffff80004388700) at
/usr/src/sys/kern/subr_bus.c:3049
#12 0xffffffff80bccefd in devclass_driver_deleted
(busclass=0xfffff80003f67d80, dc=0xfffff8000d1bc280,
driver=0xffffffff8232e9e0 <i40e_write_nvm_aq+80>) at
/usr/src/sys/kern/subr_bus.c:1235
#13 0xffffffff80bccddf in devclass_delete_driver
(busclass=0xfffff80003f67d80, driver=0xffffffff8232e9e0
<i40e_write_nvm_aq+80>) at /usr/src/sys/kern/subr_bus.c:1310
#14 0xffffffff80bd2ddc in driver_module_handler (mod=0xfffff8000db94a00,
what=1, arg=0xffffffff8232e9b0 <i40e_write_nvm_aq+32>) at
/usr/src/sys/kern/subr_bus.c:5229
#15 0xffffffff80b72ea2 in module_unload (mod=0xfffff8000db94a00) at
/usr/src/sys/kern/kern_module.c:261
#16 0xffffffff80b63b8b in linker_file_unload (file=0xfffff8000d833c00,
flags=<optimized out>) at /usr/src/sys/kern/kern_linker.c:700
#17 0xffffffff80b64f7d in kern_kldunload (td=<optimized out>, fileid=4,
flags=0) at /usr/src/sys/kern/kern_linker.c:1153
#18 0xffffffff8103b8dd in syscallenter (td=<optimized out>) at
/usr/src/sys/amd64/amd64/../../kern/subr_syscall.c:163
#19 amd64_syscall (td=0xfffffe00da78b700, traced=0) at
/usr/src/sys/amd64/amd64/trap.c:1162
#20 <signal handler called>
#21 0x00000008002de6da in ?? ()
- Eric
More information about the freebsd-net
mailing list